tree-vect-loop-manip.c (vect_create_cond_for_alias_checks): Use new names for group...
[gcc.git] / gcc / tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
3 Free Software Foundation, Inc.
4 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
5 Ira Rosen <irar@il.ibm.com>
6
7 This file is part of GCC.
8
9 GCC is free software; you can redistribute it and/or modify it under
10 the terms of the GNU General Public License as published by the Free
11 Software Foundation; either version 3, or (at your option) any later
12 version.
13
14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with GCC; see the file COPYING3. If not see
21 <http://www.gnu.org/licenses/>. */
22
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "basic-block.h"
30 #include "tree-pretty-print.h"
31 #include "gimple-pretty-print.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
34 #include "cfgloop.h"
35 #include "cfglayout.h"
36 #include "expr.h"
37 #include "recog.h"
38 #include "optabs.h"
39 #include "params.h"
40 #include "diagnostic-core.h"
41 #include "tree-chrec.h"
42 #include "tree-scalar-evolution.h"
43 #include "tree-vectorizer.h"
44 #include "target.h"
45
46 /* Loop Vectorization Pass.
47
48 This pass tries to vectorize loops.
49
50 For example, the vectorizer transforms the following simple loop:
51
52 short a[N]; short b[N]; short c[N]; int i;
53
54 for (i=0; i<N; i++){
55 a[i] = b[i] + c[i];
56 }
57
58 as if it was manually vectorized by rewriting the source code into:
59
60 typedef int __attribute__((mode(V8HI))) v8hi;
61 short a[N]; short b[N]; short c[N]; int i;
62 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
63 v8hi va, vb, vc;
64
65 for (i=0; i<N/8; i++){
66 vb = pb[i];
67 vc = pc[i];
68 va = vb + vc;
69 pa[i] = va;
70 }
71
72 The main entry to this pass is vectorize_loops(), in which
73 the vectorizer applies a set of analyses on a given set of loops,
74 followed by the actual vectorization transformation for the loops that
75 had successfully passed the analysis phase.
76 Throughout this pass we make a distinction between two types of
77 data: scalars (which are represented by SSA_NAMES), and memory references
78 ("data-refs"). These two types of data require different handling both
79 during analysis and transformation. The types of data-refs that the
80 vectorizer currently supports are ARRAY_REFS which base is an array DECL
81 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
82 accesses are required to have a simple (consecutive) access pattern.
83
84 Analysis phase:
85 ===============
86 The driver for the analysis phase is vect_analyze_loop().
87 It applies a set of analyses, some of which rely on the scalar evolution
88 analyzer (scev) developed by Sebastian Pop.
89
90 During the analysis phase the vectorizer records some information
91 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
92 loop, as well as general information about the loop as a whole, which is
93 recorded in a "loop_vec_info" struct attached to each loop.
94
95 Transformation phase:
96 =====================
97 The loop transformation phase scans all the stmts in the loop, and
98 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
99 the loop that needs to be vectorized. It inserts the vector code sequence
100 just before the scalar stmt S, and records a pointer to the vector code
101 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
102 attached to S). This pointer will be used for the vectorization of following
103 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
104 otherwise, we rely on dead code elimination for removing it.
105
106 For example, say stmt S1 was vectorized into stmt VS1:
107
108 VS1: vb = px[i];
109 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
110 S2: a = b;
111
112 To vectorize stmt S2, the vectorizer first finds the stmt that defines
113 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
114 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
115 resulting sequence would be:
116
117 VS1: vb = px[i];
118 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
119 VS2: va = vb;
120 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
121
122 Operands that are not SSA_NAMEs, are data-refs that appear in
123 load/store operations (like 'x[i]' in S1), and are handled differently.
124
125 Target modeling:
126 =================
127 Currently the only target specific information that is used is the
128 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
129 Targets that can support different sizes of vectors, for now will need
130 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
131 flexibility will be added in the future.
132
133 Since we only vectorize operations which vector form can be
134 expressed using existing tree codes, to verify that an operation is
135 supported, the vectorizer checks the relevant optab at the relevant
136 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
137 the value found is CODE_FOR_nothing, then there's no target support, and
138 we can't vectorize the stmt.
139
140 For additional information on this project see:
141 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
142 */
143
144 /* Function vect_determine_vectorization_factor
145
146 Determine the vectorization factor (VF). VF is the number of data elements
147 that are operated upon in parallel in a single iteration of the vectorized
148 loop. For example, when vectorizing a loop that operates on 4byte elements,
149 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
150 elements can fit in a single vector register.
151
152 We currently support vectorization of loops in which all types operated upon
153 are of the same size. Therefore this function currently sets VF according to
154 the size of the types operated upon, and fails if there are multiple sizes
155 in the loop.
156
157 VF is also the factor by which the loop iterations are strip-mined, e.g.:
158 original loop:
159 for (i=0; i<N; i++){
160 a[i] = b[i] + c[i];
161 }
162
163 vectorized loop:
164 for (i=0; i<N; i+=VF){
165 a[i:VF] = b[i:VF] + c[i:VF];
166 }
167 */
168
169 static bool
170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
171 {
172 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
173 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
174 int nbbs = loop->num_nodes;
175 gimple_stmt_iterator si;
176 unsigned int vectorization_factor = 0;
177 tree scalar_type;
178 gimple phi;
179 tree vectype;
180 unsigned int nunits;
181 stmt_vec_info stmt_info;
182 int i;
183 HOST_WIDE_INT dummy;
184
185 if (vect_print_dump_info (REPORT_DETAILS))
186 fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
187
188 for (i = 0; i < nbbs; i++)
189 {
190 basic_block bb = bbs[i];
191
192 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
193 {
194 phi = gsi_stmt (si);
195 stmt_info = vinfo_for_stmt (phi);
196 if (vect_print_dump_info (REPORT_DETAILS))
197 {
198 fprintf (vect_dump, "==> examining phi: ");
199 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
200 }
201
202 gcc_assert (stmt_info);
203
204 if (STMT_VINFO_RELEVANT_P (stmt_info))
205 {
206 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
207 scalar_type = TREE_TYPE (PHI_RESULT (phi));
208
209 if (vect_print_dump_info (REPORT_DETAILS))
210 {
211 fprintf (vect_dump, "get vectype for scalar type: ");
212 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
213 }
214
215 vectype = get_vectype_for_scalar_type (scalar_type);
216 if (!vectype)
217 {
218 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
219 {
220 fprintf (vect_dump,
221 "not vectorized: unsupported data-type ");
222 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
223 }
224 return false;
225 }
226 STMT_VINFO_VECTYPE (stmt_info) = vectype;
227
228 if (vect_print_dump_info (REPORT_DETAILS))
229 {
230 fprintf (vect_dump, "vectype: ");
231 print_generic_expr (vect_dump, vectype, TDF_SLIM);
232 }
233
234 nunits = TYPE_VECTOR_SUBPARTS (vectype);
235 if (vect_print_dump_info (REPORT_DETAILS))
236 fprintf (vect_dump, "nunits = %d", nunits);
237
238 if (!vectorization_factor
239 || (nunits > vectorization_factor))
240 vectorization_factor = nunits;
241 }
242 }
243
244 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
245 {
246 tree vf_vectype;
247 gimple stmt = gsi_stmt (si);
248 stmt_info = vinfo_for_stmt (stmt);
249
250 if (vect_print_dump_info (REPORT_DETAILS))
251 {
252 fprintf (vect_dump, "==> examining statement: ");
253 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
254 }
255
256 gcc_assert (stmt_info);
257
258 /* skip stmts which do not need to be vectorized. */
259 if (!STMT_VINFO_RELEVANT_P (stmt_info)
260 && !STMT_VINFO_LIVE_P (stmt_info))
261 {
262 if (vect_print_dump_info (REPORT_DETAILS))
263 fprintf (vect_dump, "skip.");
264 continue;
265 }
266
267 if (gimple_get_lhs (stmt) == NULL_TREE)
268 {
269 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
270 {
271 fprintf (vect_dump, "not vectorized: irregular stmt.");
272 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
273 }
274 return false;
275 }
276
277 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
278 {
279 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
280 {
281 fprintf (vect_dump, "not vectorized: vector stmt in loop:");
282 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
283 }
284 return false;
285 }
286
287 if (STMT_VINFO_VECTYPE (stmt_info))
288 {
289 /* The only case when a vectype had been already set is for stmts
290 that contain a dataref, or for "pattern-stmts" (stmts generated
291 by the vectorizer to represent/replace a certain idiom). */
292 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
293 || is_pattern_stmt_p (stmt_info));
294 vectype = STMT_VINFO_VECTYPE (stmt_info);
295 }
296 else
297 {
298 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
299 && !is_pattern_stmt_p (stmt_info));
300
301 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
302 if (vect_print_dump_info (REPORT_DETAILS))
303 {
304 fprintf (vect_dump, "get vectype for scalar type: ");
305 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
306 }
307 vectype = get_vectype_for_scalar_type (scalar_type);
308 if (!vectype)
309 {
310 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
311 {
312 fprintf (vect_dump,
313 "not vectorized: unsupported data-type ");
314 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
315 }
316 return false;
317 }
318
319 STMT_VINFO_VECTYPE (stmt_info) = vectype;
320 }
321
322 /* The vectorization factor is according to the smallest
323 scalar type (or the largest vector size, but we only
324 support one vector size per loop). */
325 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
326 &dummy);
327 if (vect_print_dump_info (REPORT_DETAILS))
328 {
329 fprintf (vect_dump, "get vectype for scalar type: ");
330 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
331 }
332 vf_vectype = get_vectype_for_scalar_type (scalar_type);
333 if (!vf_vectype)
334 {
335 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
336 {
337 fprintf (vect_dump,
338 "not vectorized: unsupported data-type ");
339 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
340 }
341 return false;
342 }
343
344 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
345 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
346 {
347 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
348 {
349 fprintf (vect_dump,
350 "not vectorized: different sized vector "
351 "types in statement, ");
352 print_generic_expr (vect_dump, vectype, TDF_SLIM);
353 fprintf (vect_dump, " and ");
354 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
355 }
356 return false;
357 }
358
359 if (vect_print_dump_info (REPORT_DETAILS))
360 {
361 fprintf (vect_dump, "vectype: ");
362 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
363 }
364
365 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
366 if (vect_print_dump_info (REPORT_DETAILS))
367 fprintf (vect_dump, "nunits = %d", nunits);
368
369 if (!vectorization_factor
370 || (nunits > vectorization_factor))
371 vectorization_factor = nunits;
372 }
373 }
374
375 /* TODO: Analyze cost. Decide if worth while to vectorize. */
376 if (vect_print_dump_info (REPORT_DETAILS))
377 fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
378 if (vectorization_factor <= 1)
379 {
380 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
381 fprintf (vect_dump, "not vectorized: unsupported data-type");
382 return false;
383 }
384 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
385
386 return true;
387 }
388
389
390 /* Function vect_is_simple_iv_evolution.
391
392 FORNOW: A simple evolution of an induction variables in the loop is
393 considered a polynomial evolution with constant step. */
394
395 static bool
396 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
397 tree * step)
398 {
399 tree init_expr;
400 tree step_expr;
401 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
402
403 /* When there is no evolution in this loop, the evolution function
404 is not "simple". */
405 if (evolution_part == NULL_TREE)
406 return false;
407
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part))
411 return false;
412
413 step_expr = evolution_part;
414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
415
416 if (vect_print_dump_info (REPORT_DETAILS))
417 {
418 fprintf (vect_dump, "step: ");
419 print_generic_expr (vect_dump, step_expr, TDF_SLIM);
420 fprintf (vect_dump, ", init: ");
421 print_generic_expr (vect_dump, init_expr, TDF_SLIM);
422 }
423
424 *init = init_expr;
425 *step = step_expr;
426
427 if (TREE_CODE (step_expr) != INTEGER_CST)
428 {
429 if (vect_print_dump_info (REPORT_DETAILS))
430 fprintf (vect_dump, "step unknown.");
431 return false;
432 }
433
434 return true;
435 }
436
437 /* Function vect_analyze_scalar_cycles_1.
438
439 Examine the cross iteration def-use cycles of scalar variables
440 in LOOP. LOOP_VINFO represents the loop that is now being
441 considered for vectorization (can be LOOP, or an outer-loop
442 enclosing LOOP). */
443
444 static void
445 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
446 {
447 basic_block bb = loop->header;
448 tree dumy;
449 VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
450 gimple_stmt_iterator gsi;
451 bool double_reduc;
452
453 if (vect_print_dump_info (REPORT_DETAILS))
454 fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
455
456 /* First - identify all inductions. Reduction detection assumes that all the
457 inductions have been identified, therefore, this order must not be
458 changed. */
459 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
460 {
461 gimple phi = gsi_stmt (gsi);
462 tree access_fn = NULL;
463 tree def = PHI_RESULT (phi);
464 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
465
466 if (vect_print_dump_info (REPORT_DETAILS))
467 {
468 fprintf (vect_dump, "Analyze phi: ");
469 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
470 }
471
472 /* Skip virtual phi's. The data dependences that are associated with
473 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
474 if (!is_gimple_reg (SSA_NAME_VAR (def)))
475 continue;
476
477 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
478
479 /* Analyze the evolution function. */
480 access_fn = analyze_scalar_evolution (loop, def);
481 if (access_fn)
482 STRIP_NOPS (access_fn);
483 if (access_fn && vect_print_dump_info (REPORT_DETAILS))
484 {
485 fprintf (vect_dump, "Access function of PHI: ");
486 print_generic_expr (vect_dump, access_fn, TDF_SLIM);
487 }
488
489 if (!access_fn
490 || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
491 {
492 VEC_safe_push (gimple, heap, worklist, phi);
493 continue;
494 }
495
496 if (vect_print_dump_info (REPORT_DETAILS))
497 fprintf (vect_dump, "Detected induction.");
498 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
499 }
500
501
502 /* Second - identify all reductions and nested cycles. */
503 while (VEC_length (gimple, worklist) > 0)
504 {
505 gimple phi = VEC_pop (gimple, worklist);
506 tree def = PHI_RESULT (phi);
507 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
508 gimple reduc_stmt;
509 bool nested_cycle;
510
511 if (vect_print_dump_info (REPORT_DETAILS))
512 {
513 fprintf (vect_dump, "Analyze phi: ");
514 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
515 }
516
517 gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
518 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
519
520 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
521 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
522 &double_reduc);
523 if (reduc_stmt)
524 {
525 if (double_reduc)
526 {
527 if (vect_print_dump_info (REPORT_DETAILS))
528 fprintf (vect_dump, "Detected double reduction.");
529
530 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
531 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
532 vect_double_reduction_def;
533 }
534 else
535 {
536 if (nested_cycle)
537 {
538 if (vect_print_dump_info (REPORT_DETAILS))
539 fprintf (vect_dump, "Detected vectorizable nested cycle.");
540
541 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
542 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
543 vect_nested_cycle;
544 }
545 else
546 {
547 if (vect_print_dump_info (REPORT_DETAILS))
548 fprintf (vect_dump, "Detected reduction.");
549
550 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
551 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
552 vect_reduction_def;
553 /* Store the reduction cycles for possible vectorization in
554 loop-aware SLP. */
555 VEC_safe_push (gimple, heap,
556 LOOP_VINFO_REDUCTIONS (loop_vinfo),
557 reduc_stmt);
558 }
559 }
560 }
561 else
562 if (vect_print_dump_info (REPORT_DETAILS))
563 fprintf (vect_dump, "Unknown def-use cycle pattern.");
564 }
565
566 VEC_free (gimple, heap, worklist);
567 }
568
569
570 /* Function vect_analyze_scalar_cycles.
571
572 Examine the cross iteration def-use cycles of scalar variables, by
573 analyzing the loop-header PHIs of scalar variables. Classify each
574 cycle as one of the following: invariant, induction, reduction, unknown.
575 We do that for the loop represented by LOOP_VINFO, and also to its
576 inner-loop, if exists.
577 Examples for scalar cycles:
578
579 Example1: reduction:
580
581 loop1:
582 for (i=0; i<N; i++)
583 sum += a[i];
584
585 Example2: induction:
586
587 loop2:
588 for (i=0; i<N; i++)
589 a[i] = i; */
590
591 static void
592 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
593 {
594 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
595
596 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
597
598 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
599 Reductions in such inner-loop therefore have different properties than
600 the reductions in the nest that gets vectorized:
601 1. When vectorized, they are executed in the same order as in the original
602 scalar loop, so we can't change the order of computation when
603 vectorizing them.
604 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
605 current checks are too strict. */
606
607 if (loop->inner)
608 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
609 }
610
611 /* Function vect_get_loop_niters.
612
613 Determine how many iterations the loop is executed.
614 If an expression that represents the number of iterations
615 can be constructed, place it in NUMBER_OF_ITERATIONS.
616 Return the loop exit condition. */
617
618 static gimple
619 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
620 {
621 tree niters;
622
623 if (vect_print_dump_info (REPORT_DETAILS))
624 fprintf (vect_dump, "=== get_loop_niters ===");
625
626 niters = number_of_exit_cond_executions (loop);
627
628 if (niters != NULL_TREE
629 && niters != chrec_dont_know)
630 {
631 *number_of_iterations = niters;
632
633 if (vect_print_dump_info (REPORT_DETAILS))
634 {
635 fprintf (vect_dump, "==> get_loop_niters:" );
636 print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
637 }
638 }
639
640 return get_loop_exit_condition (loop);
641 }
642
643
644 /* Function bb_in_loop_p
645
646 Used as predicate for dfs order traversal of the loop bbs. */
647
648 static bool
649 bb_in_loop_p (const_basic_block bb, const void *data)
650 {
651 const struct loop *const loop = (const struct loop *)data;
652 if (flow_bb_inside_loop_p (loop, bb))
653 return true;
654 return false;
655 }
656
657
658 /* Function new_loop_vec_info.
659
660 Create and initialize a new loop_vec_info struct for LOOP, as well as
661 stmt_vec_info structs for all the stmts in LOOP. */
662
663 static loop_vec_info
664 new_loop_vec_info (struct loop *loop)
665 {
666 loop_vec_info res;
667 basic_block *bbs;
668 gimple_stmt_iterator si;
669 unsigned int i, nbbs;
670
671 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
672 LOOP_VINFO_LOOP (res) = loop;
673
674 bbs = get_loop_body (loop);
675
676 /* Create/Update stmt_info for all stmts in the loop. */
677 for (i = 0; i < loop->num_nodes; i++)
678 {
679 basic_block bb = bbs[i];
680
681 /* BBs in a nested inner-loop will have been already processed (because
682 we will have called vect_analyze_loop_form for any nested inner-loop).
683 Therefore, for stmts in an inner-loop we just want to update the
684 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
685 loop_info of the outer-loop we are currently considering to vectorize
686 (instead of the loop_info of the inner-loop).
687 For stmts in other BBs we need to create a stmt_info from scratch. */
688 if (bb->loop_father != loop)
689 {
690 /* Inner-loop bb. */
691 gcc_assert (loop->inner && bb->loop_father == loop->inner);
692 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
693 {
694 gimple phi = gsi_stmt (si);
695 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
696 loop_vec_info inner_loop_vinfo =
697 STMT_VINFO_LOOP_VINFO (stmt_info);
698 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
699 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
700 }
701 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
702 {
703 gimple stmt = gsi_stmt (si);
704 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
705 loop_vec_info inner_loop_vinfo =
706 STMT_VINFO_LOOP_VINFO (stmt_info);
707 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
708 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
709 }
710 }
711 else
712 {
713 /* bb in current nest. */
714 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
715 {
716 gimple phi = gsi_stmt (si);
717 gimple_set_uid (phi, 0);
718 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
719 }
720
721 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
722 {
723 gimple stmt = gsi_stmt (si);
724 gimple_set_uid (stmt, 0);
725 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
726 }
727 }
728 }
729
730 /* CHECKME: We want to visit all BBs before their successors (except for
731 latch blocks, for which this assertion wouldn't hold). In the simple
732 case of the loop forms we allow, a dfs order of the BBs would the same
733 as reversed postorder traversal, so we are safe. */
734
735 free (bbs);
736 bbs = XCNEWVEC (basic_block, loop->num_nodes);
737 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
738 bbs, loop->num_nodes, loop);
739 gcc_assert (nbbs == loop->num_nodes);
740
741 LOOP_VINFO_BBS (res) = bbs;
742 LOOP_VINFO_NITERS (res) = NULL;
743 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
744 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
745 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
746 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
747 LOOP_VINFO_VECT_FACTOR (res) = 0;
748 LOOP_VINFO_LOOP_NEST (res) = VEC_alloc (loop_p, heap, 3);
749 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
750 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
751 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
752 LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
753 VEC_alloc (gimple, heap,
754 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
755 LOOP_VINFO_MAY_ALIAS_DDRS (res) =
756 VEC_alloc (ddr_p, heap,
757 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
758 LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
759 LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
760 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
761 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
762 LOOP_VINFO_PEELING_HTAB (res) = NULL;
763
764 return res;
765 }
766
767
768 /* Function destroy_loop_vec_info.
769
770 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
771 stmts in the loop. */
772
773 void
774 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
775 {
776 struct loop *loop;
777 basic_block *bbs;
778 int nbbs;
779 gimple_stmt_iterator si;
780 int j;
781 VEC (slp_instance, heap) *slp_instances;
782 slp_instance instance;
783
784 if (!loop_vinfo)
785 return;
786
787 loop = LOOP_VINFO_LOOP (loop_vinfo);
788
789 bbs = LOOP_VINFO_BBS (loop_vinfo);
790 nbbs = loop->num_nodes;
791
792 if (!clean_stmts)
793 {
794 free (LOOP_VINFO_BBS (loop_vinfo));
795 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
796 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
797 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
798 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
799 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
800
801 free (loop_vinfo);
802 loop->aux = NULL;
803 return;
804 }
805
806 for (j = 0; j < nbbs; j++)
807 {
808 basic_block bb = bbs[j];
809 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
810 free_stmt_vec_info (gsi_stmt (si));
811
812 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
813 {
814 gimple stmt = gsi_stmt (si);
815 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
816
817 if (stmt_info)
818 {
819 /* Check if this is a "pattern stmt" (introduced by the
820 vectorizer during the pattern recognition pass). */
821 bool remove_stmt_p = false;
822 gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
823 if (orig_stmt)
824 {
825 stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
826 if (orig_stmt_info
827 && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
828 remove_stmt_p = true;
829 }
830
831 /* Free stmt_vec_info. */
832 free_stmt_vec_info (stmt);
833
834 /* Remove dead "pattern stmts". */
835 if (remove_stmt_p)
836 gsi_remove (&si, true);
837 }
838 gsi_next (&si);
839 }
840 }
841
842 free (LOOP_VINFO_BBS (loop_vinfo));
843 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
844 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
845 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
846 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
847 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
848 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
849 FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
850 vect_free_slp_instance (instance);
851
852 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
853 VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
854 VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
855
856 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
857 htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
858
859 free (loop_vinfo);
860 loop->aux = NULL;
861 }
862
863
864 /* Function vect_analyze_loop_1.
865
866 Apply a set of analyses on LOOP, and create a loop_vec_info struct
867 for it. The different analyses will record information in the
868 loop_vec_info struct. This is a subset of the analyses applied in
869 vect_analyze_loop, to be applied on an inner-loop nested in the loop
870 that is now considered for (outer-loop) vectorization. */
871
872 static loop_vec_info
873 vect_analyze_loop_1 (struct loop *loop)
874 {
875 loop_vec_info loop_vinfo;
876
877 if (vect_print_dump_info (REPORT_DETAILS))
878 fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
879
880 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc. */
881
882 loop_vinfo = vect_analyze_loop_form (loop);
883 if (!loop_vinfo)
884 {
885 if (vect_print_dump_info (REPORT_DETAILS))
886 fprintf (vect_dump, "bad inner-loop form.");
887 return NULL;
888 }
889
890 return loop_vinfo;
891 }
892
893
894 /* Function vect_analyze_loop_form.
895
896 Verify that certain CFG restrictions hold, including:
897 - the loop has a pre-header
898 - the loop has a single entry and exit
899 - the loop exit condition is simple enough, and the number of iterations
900 can be analyzed (a countable loop). */
901
902 loop_vec_info
903 vect_analyze_loop_form (struct loop *loop)
904 {
905 loop_vec_info loop_vinfo;
906 gimple loop_cond;
907 tree number_of_iterations = NULL;
908 loop_vec_info inner_loop_vinfo = NULL;
909
910 if (vect_print_dump_info (REPORT_DETAILS))
911 fprintf (vect_dump, "=== vect_analyze_loop_form ===");
912
913 /* Different restrictions apply when we are considering an inner-most loop,
914 vs. an outer (nested) loop.
915 (FORNOW. May want to relax some of these restrictions in the future). */
916
917 if (!loop->inner)
918 {
919 /* Inner-most loop. We currently require that the number of BBs is
920 exactly 2 (the header and latch). Vectorizable inner-most loops
921 look like this:
922
923 (pre-header)
924 |
925 header <--------+
926 | | |
927 | +--> latch --+
928 |
929 (exit-bb) */
930
931 if (loop->num_nodes != 2)
932 {
933 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
934 fprintf (vect_dump, "not vectorized: control flow in loop.");
935 return NULL;
936 }
937
938 if (empty_block_p (loop->header))
939 {
940 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
941 fprintf (vect_dump, "not vectorized: empty loop.");
942 return NULL;
943 }
944 }
945 else
946 {
947 struct loop *innerloop = loop->inner;
948 edge entryedge;
949
950 /* Nested loop. We currently require that the loop is doubly-nested,
951 contains a single inner loop, and the number of BBs is exactly 5.
952 Vectorizable outer-loops look like this:
953
954 (pre-header)
955 |
956 header <---+
957 | |
958 inner-loop |
959 | |
960 tail ------+
961 |
962 (exit-bb)
963
964 The inner-loop has the properties expected of inner-most loops
965 as described above. */
966
967 if ((loop->inner)->inner || (loop->inner)->next)
968 {
969 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
970 fprintf (vect_dump, "not vectorized: multiple nested loops.");
971 return NULL;
972 }
973
974 /* Analyze the inner-loop. */
975 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
976 if (!inner_loop_vinfo)
977 {
978 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
979 fprintf (vect_dump, "not vectorized: Bad inner loop.");
980 return NULL;
981 }
982
983 if (!expr_invariant_in_loop_p (loop,
984 LOOP_VINFO_NITERS (inner_loop_vinfo)))
985 {
986 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
987 fprintf (vect_dump,
988 "not vectorized: inner-loop count not invariant.");
989 destroy_loop_vec_info (inner_loop_vinfo, true);
990 return NULL;
991 }
992
993 if (loop->num_nodes != 5)
994 {
995 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
996 fprintf (vect_dump, "not vectorized: control flow in loop.");
997 destroy_loop_vec_info (inner_loop_vinfo, true);
998 return NULL;
999 }
1000
1001 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1002 entryedge = EDGE_PRED (innerloop->header, 0);
1003 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1004 entryedge = EDGE_PRED (innerloop->header, 1);
1005
1006 if (entryedge->src != loop->header
1007 || !single_exit (innerloop)
1008 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1009 {
1010 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1011 fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
1012 destroy_loop_vec_info (inner_loop_vinfo, true);
1013 return NULL;
1014 }
1015
1016 if (vect_print_dump_info (REPORT_DETAILS))
1017 fprintf (vect_dump, "Considering outer-loop vectorization.");
1018 }
1019
1020 if (!single_exit (loop)
1021 || EDGE_COUNT (loop->header->preds) != 2)
1022 {
1023 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1024 {
1025 if (!single_exit (loop))
1026 fprintf (vect_dump, "not vectorized: multiple exits.");
1027 else if (EDGE_COUNT (loop->header->preds) != 2)
1028 fprintf (vect_dump, "not vectorized: too many incoming edges.");
1029 }
1030 if (inner_loop_vinfo)
1031 destroy_loop_vec_info (inner_loop_vinfo, true);
1032 return NULL;
1033 }
1034
1035 /* We assume that the loop exit condition is at the end of the loop. i.e,
1036 that the loop is represented as a do-while (with a proper if-guard
1037 before the loop if needed), where the loop header contains all the
1038 executable statements, and the latch is empty. */
1039 if (!empty_block_p (loop->latch)
1040 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1041 {
1042 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1043 fprintf (vect_dump, "not vectorized: unexpected loop form.");
1044 if (inner_loop_vinfo)
1045 destroy_loop_vec_info (inner_loop_vinfo, true);
1046 return NULL;
1047 }
1048
1049 /* Make sure there exists a single-predecessor exit bb: */
1050 if (!single_pred_p (single_exit (loop)->dest))
1051 {
1052 edge e = single_exit (loop);
1053 if (!(e->flags & EDGE_ABNORMAL))
1054 {
1055 split_loop_exit_edge (e);
1056 if (vect_print_dump_info (REPORT_DETAILS))
1057 fprintf (vect_dump, "split exit edge.");
1058 }
1059 else
1060 {
1061 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1062 fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1063 if (inner_loop_vinfo)
1064 destroy_loop_vec_info (inner_loop_vinfo, true);
1065 return NULL;
1066 }
1067 }
1068
1069 loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1070 if (!loop_cond)
1071 {
1072 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1073 fprintf (vect_dump, "not vectorized: complicated exit condition.");
1074 if (inner_loop_vinfo)
1075 destroy_loop_vec_info (inner_loop_vinfo, true);
1076 return NULL;
1077 }
1078
1079 if (!number_of_iterations)
1080 {
1081 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1082 fprintf (vect_dump,
1083 "not vectorized: number of iterations cannot be computed.");
1084 if (inner_loop_vinfo)
1085 destroy_loop_vec_info (inner_loop_vinfo, true);
1086 return NULL;
1087 }
1088
1089 if (chrec_contains_undetermined (number_of_iterations))
1090 {
1091 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1092 fprintf (vect_dump, "Infinite number of iterations.");
1093 if (inner_loop_vinfo)
1094 destroy_loop_vec_info (inner_loop_vinfo, true);
1095 return NULL;
1096 }
1097
1098 if (!NITERS_KNOWN_P (number_of_iterations))
1099 {
1100 if (vect_print_dump_info (REPORT_DETAILS))
1101 {
1102 fprintf (vect_dump, "Symbolic number of iterations is ");
1103 print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1104 }
1105 }
1106 else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1107 {
1108 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1109 fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1110 if (inner_loop_vinfo)
1111 destroy_loop_vec_info (inner_loop_vinfo, false);
1112 return NULL;
1113 }
1114
1115 loop_vinfo = new_loop_vec_info (loop);
1116 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1117 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1118
1119 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1120
1121 /* CHECKME: May want to keep it around it in the future. */
1122 if (inner_loop_vinfo)
1123 destroy_loop_vec_info (inner_loop_vinfo, false);
1124
1125 gcc_assert (!loop->aux);
1126 loop->aux = loop_vinfo;
1127 return loop_vinfo;
1128 }
1129
1130
1131 /* Get cost by calling cost target builtin. */
1132
1133 static inline int
1134 vect_get_cost (enum vect_cost_for_stmt type_of_cost)
1135 {
1136 tree dummy_type = NULL;
1137 int dummy = 0;
1138
1139 return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
1140 dummy_type, dummy);
1141 }
1142
1143
1144 /* Function vect_analyze_loop_operations.
1145
1146 Scan the loop stmts and make sure they are all vectorizable. */
1147
1148 static bool
1149 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1150 {
1151 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1152 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1153 int nbbs = loop->num_nodes;
1154 gimple_stmt_iterator si;
1155 unsigned int vectorization_factor = 0;
1156 int i;
1157 gimple phi;
1158 stmt_vec_info stmt_info;
1159 bool need_to_vectorize = false;
1160 int min_profitable_iters;
1161 int min_scalar_loop_bound;
1162 unsigned int th;
1163 bool only_slp_in_loop = true, ok;
1164
1165 if (vect_print_dump_info (REPORT_DETAILS))
1166 fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
1167
1168 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1169 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1170 if (slp)
1171 {
1172 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1173 vectorization factor of the loop is the unrolling factor required by
1174 the SLP instances. If that unrolling factor is 1, we say, that we
1175 perform pure SLP on loop - cross iteration parallelism is not
1176 exploited. */
1177 for (i = 0; i < nbbs; i++)
1178 {
1179 basic_block bb = bbs[i];
1180 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1181 {
1182 gimple stmt = gsi_stmt (si);
1183 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1184 gcc_assert (stmt_info);
1185 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1186 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1187 && !PURE_SLP_STMT (stmt_info))
1188 /* STMT needs both SLP and loop-based vectorization. */
1189 only_slp_in_loop = false;
1190 }
1191 }
1192
1193 if (only_slp_in_loop)
1194 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1195 else
1196 vectorization_factor = least_common_multiple (vectorization_factor,
1197 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1198
1199 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1200 if (vect_print_dump_info (REPORT_DETAILS))
1201 fprintf (vect_dump, "Updating vectorization factor to %d ",
1202 vectorization_factor);
1203 }
1204
1205 for (i = 0; i < nbbs; i++)
1206 {
1207 basic_block bb = bbs[i];
1208
1209 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1210 {
1211 phi = gsi_stmt (si);
1212 ok = true;
1213
1214 stmt_info = vinfo_for_stmt (phi);
1215 if (vect_print_dump_info (REPORT_DETAILS))
1216 {
1217 fprintf (vect_dump, "examining phi: ");
1218 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1219 }
1220
1221 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1222 (i.e., a phi in the tail of the outer-loop). */
1223 if (! is_loop_header_bb_p (bb))
1224 {
1225 /* FORNOW: we currently don't support the case that these phis
1226 are not used in the outerloop (unless it is double reduction,
1227 i.e., this phi is vect_reduction_def), cause this case
1228 requires to actually do something here. */
1229 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1230 || STMT_VINFO_LIVE_P (stmt_info))
1231 && STMT_VINFO_DEF_TYPE (stmt_info)
1232 != vect_double_reduction_def)
1233 {
1234 if (vect_print_dump_info (REPORT_DETAILS))
1235 fprintf (vect_dump,
1236 "Unsupported loop-closed phi in outer-loop.");
1237 return false;
1238 }
1239
1240 /* If PHI is used in the outer loop, we check that its operand
1241 is defined in the inner loop. */
1242 if (STMT_VINFO_RELEVANT_P (stmt_info))
1243 {
1244 tree phi_op;
1245 gimple op_def_stmt;
1246
1247 if (gimple_phi_num_args (phi) != 1)
1248 return false;
1249
1250 phi_op = PHI_ARG_DEF (phi, 0);
1251 if (TREE_CODE (phi_op) != SSA_NAME)
1252 return false;
1253
1254 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1255 if (!op_def_stmt || !vinfo_for_stmt (op_def_stmt))
1256 return false;
1257
1258 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1259 != vect_used_in_outer
1260 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1261 != vect_used_in_outer_by_reduction)
1262 return false;
1263 }
1264
1265 continue;
1266 }
1267
1268 gcc_assert (stmt_info);
1269
1270 if (STMT_VINFO_LIVE_P (stmt_info))
1271 {
1272 /* FORNOW: not yet supported. */
1273 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1274 fprintf (vect_dump, "not vectorized: value used after loop.");
1275 return false;
1276 }
1277
1278 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1279 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1280 {
1281 /* A scalar-dependence cycle that we don't support. */
1282 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1283 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1284 return false;
1285 }
1286
1287 if (STMT_VINFO_RELEVANT_P (stmt_info))
1288 {
1289 need_to_vectorize = true;
1290 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1291 ok = vectorizable_induction (phi, NULL, NULL);
1292 }
1293
1294 if (!ok)
1295 {
1296 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1297 {
1298 fprintf (vect_dump,
1299 "not vectorized: relevant phi not supported: ");
1300 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1301 }
1302 return false;
1303 }
1304 }
1305
1306 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1307 {
1308 gimple stmt = gsi_stmt (si);
1309 if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1310 return false;
1311 }
1312 } /* bbs */
1313
1314 /* All operations in the loop are either irrelevant (deal with loop
1315 control, or dead), or only used outside the loop and can be moved
1316 out of the loop (e.g. invariants, inductions). The loop can be
1317 optimized away by scalar optimizations. We're better off not
1318 touching this loop. */
1319 if (!need_to_vectorize)
1320 {
1321 if (vect_print_dump_info (REPORT_DETAILS))
1322 fprintf (vect_dump,
1323 "All the computation can be taken out of the loop.");
1324 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1325 fprintf (vect_dump,
1326 "not vectorized: redundant loop. no profit to vectorize.");
1327 return false;
1328 }
1329
1330 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1331 && vect_print_dump_info (REPORT_DETAILS))
1332 fprintf (vect_dump,
1333 "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1334 vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
1335
1336 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1337 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1338 {
1339 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1340 fprintf (vect_dump, "not vectorized: iteration count too small.");
1341 if (vect_print_dump_info (REPORT_DETAILS))
1342 fprintf (vect_dump,"not vectorized: iteration count smaller than "
1343 "vectorization factor.");
1344 return false;
1345 }
1346
1347 /* Analyze cost. Decide if worth while to vectorize. */
1348
1349 /* Once VF is set, SLP costs should be updated since the number of created
1350 vector stmts depends on VF. */
1351 vect_update_slp_costs_according_to_vf (loop_vinfo);
1352
1353 min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1354 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1355
1356 if (min_profitable_iters < 0)
1357 {
1358 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1359 fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1360 if (vect_print_dump_info (REPORT_DETAILS))
1361 fprintf (vect_dump, "not vectorized: vector version will never be "
1362 "profitable.");
1363 return false;
1364 }
1365
1366 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1367 * vectorization_factor) - 1);
1368
1369 /* Use the cost model only if it is more conservative than user specified
1370 threshold. */
1371
1372 th = (unsigned) min_scalar_loop_bound;
1373 if (min_profitable_iters
1374 && (!min_scalar_loop_bound
1375 || min_profitable_iters > min_scalar_loop_bound))
1376 th = (unsigned) min_profitable_iters;
1377
1378 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1379 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1380 {
1381 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1382 fprintf (vect_dump, "not vectorized: vectorization not "
1383 "profitable.");
1384 if (vect_print_dump_info (REPORT_DETAILS))
1385 fprintf (vect_dump, "not vectorized: iteration count smaller than "
1386 "user specified loop bound parameter or minimum "
1387 "profitable iterations (whichever is more conservative).");
1388 return false;
1389 }
1390
1391 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1392 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1393 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1394 {
1395 if (vect_print_dump_info (REPORT_DETAILS))
1396 fprintf (vect_dump, "epilog loop required.");
1397 if (!vect_can_advance_ivs_p (loop_vinfo))
1398 {
1399 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1400 fprintf (vect_dump,
1401 "not vectorized: can't create epilog loop 1.");
1402 return false;
1403 }
1404 if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1405 {
1406 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1407 fprintf (vect_dump,
1408 "not vectorized: can't create epilog loop 2.");
1409 return false;
1410 }
1411 }
1412
1413 return true;
1414 }
1415
1416
1417 /* Function vect_analyze_loop_2.
1418
1419 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1420 for it. The different analyses will record information in the
1421 loop_vec_info struct. */
1422 static bool
1423 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1424 {
1425 bool ok, dummy, slp = false;
1426 int max_vf = MAX_VECTORIZATION_FACTOR;
1427 int min_vf = 2;
1428
1429 /* Find all data references in the loop (which correspond to vdefs/vuses)
1430 and analyze their evolution in the loop. Also adjust the minimal
1431 vectorization factor according to the loads and stores.
1432
1433 FORNOW: Handle only simple, array references, which
1434 alignment can be forced, and aligned pointer-references. */
1435
1436 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1437 if (!ok)
1438 {
1439 if (vect_print_dump_info (REPORT_DETAILS))
1440 fprintf (vect_dump, "bad data references.");
1441 return false;
1442 }
1443
1444 /* Classify all cross-iteration scalar data-flow cycles.
1445 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1446
1447 vect_analyze_scalar_cycles (loop_vinfo);
1448
1449 vect_pattern_recog (loop_vinfo);
1450
1451 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1452
1453 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1454 if (!ok)
1455 {
1456 if (vect_print_dump_info (REPORT_DETAILS))
1457 fprintf (vect_dump, "unexpected pattern.");
1458 return false;
1459 }
1460
1461 /* Analyze data dependences between the data-refs in the loop
1462 and adjust the maximum vectorization factor according to
1463 the dependences.
1464 FORNOW: fail at the first data dependence that we encounter. */
1465
1466 ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf, &dummy);
1467 if (!ok
1468 || max_vf < min_vf)
1469 {
1470 if (vect_print_dump_info (REPORT_DETAILS))
1471 fprintf (vect_dump, "bad data dependence.");
1472 return false;
1473 }
1474
1475 ok = vect_determine_vectorization_factor (loop_vinfo);
1476 if (!ok)
1477 {
1478 if (vect_print_dump_info (REPORT_DETAILS))
1479 fprintf (vect_dump, "can't determine vectorization factor.");
1480 return false;
1481 }
1482 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1483 {
1484 if (vect_print_dump_info (REPORT_DETAILS))
1485 fprintf (vect_dump, "bad data dependence.");
1486 return false;
1487 }
1488
1489 /* Analyze the alignment of the data-refs in the loop.
1490 Fail if a data reference is found that cannot be vectorized. */
1491
1492 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1493 if (!ok)
1494 {
1495 if (vect_print_dump_info (REPORT_DETAILS))
1496 fprintf (vect_dump, "bad data alignment.");
1497 return false;
1498 }
1499
1500 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1501 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1502
1503 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1504 if (!ok)
1505 {
1506 if (vect_print_dump_info (REPORT_DETAILS))
1507 fprintf (vect_dump, "bad data access.");
1508 return false;
1509 }
1510
1511 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1512 It is important to call pruning after vect_analyze_data_ref_accesses,
1513 since we use grouping information gathered by interleaving analysis. */
1514 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1515 if (!ok)
1516 {
1517 if (vect_print_dump_info (REPORT_DETAILS))
1518 fprintf (vect_dump, "too long list of versioning for alias "
1519 "run-time tests.");
1520 return false;
1521 }
1522
1523 /* This pass will decide on using loop versioning and/or loop peeling in
1524 order to enhance the alignment of data references in the loop. */
1525
1526 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1527 if (!ok)
1528 {
1529 if (vect_print_dump_info (REPORT_DETAILS))
1530 fprintf (vect_dump, "bad data alignment.");
1531 return false;
1532 }
1533
1534 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1535 ok = vect_analyze_slp (loop_vinfo, NULL);
1536 if (ok)
1537 {
1538 /* Decide which possible SLP instances to SLP. */
1539 slp = vect_make_slp_decision (loop_vinfo);
1540
1541 /* Find stmts that need to be both vectorized and SLPed. */
1542 vect_detect_hybrid_slp (loop_vinfo);
1543 }
1544
1545 /* Scan all the operations in the loop and make sure they are
1546 vectorizable. */
1547
1548 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1549 if (!ok)
1550 {
1551 if (vect_print_dump_info (REPORT_DETAILS))
1552 fprintf (vect_dump, "bad operation or unsupported loop bound.");
1553 return false;
1554 }
1555
1556 return true;
1557 }
1558
1559 /* Function vect_analyze_loop.
1560
1561 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1562 for it. The different analyses will record information in the
1563 loop_vec_info struct. */
1564 loop_vec_info
1565 vect_analyze_loop (struct loop *loop)
1566 {
1567 loop_vec_info loop_vinfo;
1568 unsigned int vector_sizes;
1569
1570 /* Autodetect first vector size we try. */
1571 current_vector_size = 0;
1572 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1573
1574 if (vect_print_dump_info (REPORT_DETAILS))
1575 fprintf (vect_dump, "===== analyze_loop_nest =====");
1576
1577 if (loop_outer (loop)
1578 && loop_vec_info_for_loop (loop_outer (loop))
1579 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1580 {
1581 if (vect_print_dump_info (REPORT_DETAILS))
1582 fprintf (vect_dump, "outer-loop already vectorized.");
1583 return NULL;
1584 }
1585
1586 while (1)
1587 {
1588 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1589 loop_vinfo = vect_analyze_loop_form (loop);
1590 if (!loop_vinfo)
1591 {
1592 if (vect_print_dump_info (REPORT_DETAILS))
1593 fprintf (vect_dump, "bad loop form.");
1594 return NULL;
1595 }
1596
1597 if (vect_analyze_loop_2 (loop_vinfo))
1598 {
1599 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1600
1601 return loop_vinfo;
1602 }
1603
1604 destroy_loop_vec_info (loop_vinfo, true);
1605
1606 vector_sizes &= ~current_vector_size;
1607 if (vector_sizes == 0
1608 || current_vector_size == 0)
1609 return NULL;
1610
1611 /* Try the next biggest vector size. */
1612 current_vector_size = 1 << floor_log2 (vector_sizes);
1613 if (vect_print_dump_info (REPORT_DETAILS))
1614 fprintf (vect_dump, "***** Re-trying analysis with "
1615 "vector size %d\n", current_vector_size);
1616 }
1617 }
1618
1619
1620 /* Function reduction_code_for_scalar_code
1621
1622 Input:
1623 CODE - tree_code of a reduction operations.
1624
1625 Output:
1626 REDUC_CODE - the corresponding tree-code to be used to reduce the
1627 vector of partial results into a single scalar result (which
1628 will also reside in a vector) or ERROR_MARK if the operation is
1629 a supported reduction operation, but does not have such tree-code.
1630
1631 Return FALSE if CODE currently cannot be vectorized as reduction. */
1632
1633 static bool
1634 reduction_code_for_scalar_code (enum tree_code code,
1635 enum tree_code *reduc_code)
1636 {
1637 switch (code)
1638 {
1639 case MAX_EXPR:
1640 *reduc_code = REDUC_MAX_EXPR;
1641 return true;
1642
1643 case MIN_EXPR:
1644 *reduc_code = REDUC_MIN_EXPR;
1645 return true;
1646
1647 case PLUS_EXPR:
1648 *reduc_code = REDUC_PLUS_EXPR;
1649 return true;
1650
1651 case MULT_EXPR:
1652 case MINUS_EXPR:
1653 case BIT_IOR_EXPR:
1654 case BIT_XOR_EXPR:
1655 case BIT_AND_EXPR:
1656 *reduc_code = ERROR_MARK;
1657 return true;
1658
1659 default:
1660 return false;
1661 }
1662 }
1663
1664
1665 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1666 STMT is printed with a message MSG. */
1667
1668 static void
1669 report_vect_op (gimple stmt, const char *msg)
1670 {
1671 fprintf (vect_dump, "%s", msg);
1672 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1673 }
1674
1675
1676 /* Function vect_is_simple_reduction_1
1677
1678 (1) Detect a cross-iteration def-use cycle that represents a simple
1679 reduction computation. We look for the following pattern:
1680
1681 loop_header:
1682 a1 = phi < a0, a2 >
1683 a3 = ...
1684 a2 = operation (a3, a1)
1685
1686 such that:
1687 1. operation is commutative and associative and it is safe to
1688 change the order of the computation (if CHECK_REDUCTION is true)
1689 2. no uses for a2 in the loop (a2 is used out of the loop)
1690 3. no uses of a1 in the loop besides the reduction operation
1691 4. no uses of a1 outside the loop.
1692
1693 Conditions 1,4 are tested here.
1694 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1695
1696 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1697 nested cycles, if CHECK_REDUCTION is false.
1698
1699 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1700 reductions:
1701
1702 a1 = phi < a0, a2 >
1703 inner loop (def of a3)
1704 a2 = phi < a3 >
1705
1706 If MODIFY is true it tries also to rework the code in-place to enable
1707 detection of more reduction patterns. For the time being we rewrite
1708 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
1709 */
1710
1711 static gimple
1712 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
1713 bool check_reduction, bool *double_reduc,
1714 bool modify)
1715 {
1716 struct loop *loop = (gimple_bb (phi))->loop_father;
1717 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1718 edge latch_e = loop_latch_edge (loop);
1719 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1720 gimple def_stmt, def1 = NULL, def2 = NULL;
1721 enum tree_code orig_code, code;
1722 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1723 tree type;
1724 int nloop_uses;
1725 tree name;
1726 imm_use_iterator imm_iter;
1727 use_operand_p use_p;
1728 bool phi_def;
1729
1730 *double_reduc = false;
1731
1732 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1733 otherwise, we assume outer loop vectorization. */
1734 gcc_assert ((check_reduction && loop == vect_loop)
1735 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1736
1737 name = PHI_RESULT (phi);
1738 nloop_uses = 0;
1739 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1740 {
1741 gimple use_stmt = USE_STMT (use_p);
1742 if (is_gimple_debug (use_stmt))
1743 continue;
1744
1745 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1746 {
1747 if (vect_print_dump_info (REPORT_DETAILS))
1748 fprintf (vect_dump, "intermediate value used outside loop.");
1749
1750 return NULL;
1751 }
1752
1753 if (vinfo_for_stmt (use_stmt)
1754 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1755 nloop_uses++;
1756 if (nloop_uses > 1)
1757 {
1758 if (vect_print_dump_info (REPORT_DETAILS))
1759 fprintf (vect_dump, "reduction used in loop.");
1760 return NULL;
1761 }
1762 }
1763
1764 if (TREE_CODE (loop_arg) != SSA_NAME)
1765 {
1766 if (vect_print_dump_info (REPORT_DETAILS))
1767 {
1768 fprintf (vect_dump, "reduction: not ssa_name: ");
1769 print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1770 }
1771 return NULL;
1772 }
1773
1774 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1775 if (!def_stmt)
1776 {
1777 if (vect_print_dump_info (REPORT_DETAILS))
1778 fprintf (vect_dump, "reduction: no def_stmt.");
1779 return NULL;
1780 }
1781
1782 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
1783 {
1784 if (vect_print_dump_info (REPORT_DETAILS))
1785 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1786 return NULL;
1787 }
1788
1789 if (is_gimple_assign (def_stmt))
1790 {
1791 name = gimple_assign_lhs (def_stmt);
1792 phi_def = false;
1793 }
1794 else
1795 {
1796 name = PHI_RESULT (def_stmt);
1797 phi_def = true;
1798 }
1799
1800 nloop_uses = 0;
1801 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1802 {
1803 gimple use_stmt = USE_STMT (use_p);
1804 if (is_gimple_debug (use_stmt))
1805 continue;
1806 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1807 && vinfo_for_stmt (use_stmt)
1808 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1809 nloop_uses++;
1810 if (nloop_uses > 1)
1811 {
1812 if (vect_print_dump_info (REPORT_DETAILS))
1813 fprintf (vect_dump, "reduction used in loop.");
1814 return NULL;
1815 }
1816 }
1817
1818 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
1819 defined in the inner loop. */
1820 if (phi_def)
1821 {
1822 op1 = PHI_ARG_DEF (def_stmt, 0);
1823
1824 if (gimple_phi_num_args (def_stmt) != 1
1825 || TREE_CODE (op1) != SSA_NAME)
1826 {
1827 if (vect_print_dump_info (REPORT_DETAILS))
1828 fprintf (vect_dump, "unsupported phi node definition.");
1829
1830 return NULL;
1831 }
1832
1833 def1 = SSA_NAME_DEF_STMT (op1);
1834 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1835 && loop->inner
1836 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
1837 && is_gimple_assign (def1))
1838 {
1839 if (vect_print_dump_info (REPORT_DETAILS))
1840 report_vect_op (def_stmt, "detected double reduction: ");
1841
1842 *double_reduc = true;
1843 return def_stmt;
1844 }
1845
1846 return NULL;
1847 }
1848
1849 code = orig_code = gimple_assign_rhs_code (def_stmt);
1850
1851 /* We can handle "res -= x[i]", which is non-associative by
1852 simply rewriting this into "res += -x[i]". Avoid changing
1853 gimple instruction for the first simple tests and only do this
1854 if we're allowed to change code at all. */
1855 if (code == MINUS_EXPR
1856 && modify
1857 && (op1 = gimple_assign_rhs1 (def_stmt))
1858 && TREE_CODE (op1) == SSA_NAME
1859 && SSA_NAME_DEF_STMT (op1) == phi)
1860 code = PLUS_EXPR;
1861
1862 if (check_reduction
1863 && (!commutative_tree_code (code) || !associative_tree_code (code)))
1864 {
1865 if (vect_print_dump_info (REPORT_DETAILS))
1866 report_vect_op (def_stmt, "reduction: not commutative/associative: ");
1867 return NULL;
1868 }
1869
1870 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
1871 {
1872 if (code != COND_EXPR)
1873 {
1874 if (vect_print_dump_info (REPORT_DETAILS))
1875 report_vect_op (def_stmt, "reduction: not binary operation: ");
1876
1877 return NULL;
1878 }
1879
1880 op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
1881 if (COMPARISON_CLASS_P (op3))
1882 {
1883 op4 = TREE_OPERAND (op3, 1);
1884 op3 = TREE_OPERAND (op3, 0);
1885 }
1886
1887 op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
1888 op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
1889
1890 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
1891 {
1892 if (vect_print_dump_info (REPORT_DETAILS))
1893 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1894
1895 return NULL;
1896 }
1897 }
1898 else
1899 {
1900 op1 = gimple_assign_rhs1 (def_stmt);
1901 op2 = gimple_assign_rhs2 (def_stmt);
1902
1903 if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
1904 {
1905 if (vect_print_dump_info (REPORT_DETAILS))
1906 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1907
1908 return NULL;
1909 }
1910 }
1911
1912 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
1913 if ((TREE_CODE (op1) == SSA_NAME
1914 && !types_compatible_p (type,TREE_TYPE (op1)))
1915 || (TREE_CODE (op2) == SSA_NAME
1916 && !types_compatible_p (type, TREE_TYPE (op2)))
1917 || (op3 && TREE_CODE (op3) == SSA_NAME
1918 && !types_compatible_p (type, TREE_TYPE (op3)))
1919 || (op4 && TREE_CODE (op4) == SSA_NAME
1920 && !types_compatible_p (type, TREE_TYPE (op4))))
1921 {
1922 if (vect_print_dump_info (REPORT_DETAILS))
1923 {
1924 fprintf (vect_dump, "reduction: multiple types: operation type: ");
1925 print_generic_expr (vect_dump, type, TDF_SLIM);
1926 fprintf (vect_dump, ", operands types: ");
1927 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
1928 fprintf (vect_dump, ",");
1929 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
1930 if (op3)
1931 {
1932 fprintf (vect_dump, ",");
1933 print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
1934 }
1935
1936 if (op4)
1937 {
1938 fprintf (vect_dump, ",");
1939 print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
1940 }
1941 }
1942
1943 return NULL;
1944 }
1945
1946 /* Check that it's ok to change the order of the computation.
1947 Generally, when vectorizing a reduction we change the order of the
1948 computation. This may change the behavior of the program in some
1949 cases, so we need to check that this is ok. One exception is when
1950 vectorizing an outer-loop: the inner-loop is executed sequentially,
1951 and therefore vectorizing reductions in the inner-loop during
1952 outer-loop vectorization is safe. */
1953
1954 /* CHECKME: check for !flag_finite_math_only too? */
1955 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
1956 && check_reduction)
1957 {
1958 /* Changing the order of operations changes the semantics. */
1959 if (vect_print_dump_info (REPORT_DETAILS))
1960 report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
1961 return NULL;
1962 }
1963 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
1964 && check_reduction)
1965 {
1966 /* Changing the order of operations changes the semantics. */
1967 if (vect_print_dump_info (REPORT_DETAILS))
1968 report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
1969 return NULL;
1970 }
1971 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
1972 {
1973 /* Changing the order of operations changes the semantics. */
1974 if (vect_print_dump_info (REPORT_DETAILS))
1975 report_vect_op (def_stmt,
1976 "reduction: unsafe fixed-point math optimization: ");
1977 return NULL;
1978 }
1979
1980 /* If we detected "res -= x[i]" earlier, rewrite it into
1981 "res += -x[i]" now. If this turns out to be useless reassoc
1982 will clean it up again. */
1983 if (orig_code == MINUS_EXPR)
1984 {
1985 tree rhs = gimple_assign_rhs2 (def_stmt);
1986 tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
1987 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
1988 rhs, NULL);
1989 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
1990 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
1991 loop_info, NULL));
1992 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
1993 gimple_assign_set_rhs2 (def_stmt, negrhs);
1994 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
1995 update_stmt (def_stmt);
1996 }
1997
1998 /* Reduction is safe. We're dealing with one of the following:
1999 1) integer arithmetic and no trapv
2000 2) floating point arithmetic, and special flags permit this optimization
2001 3) nested cycle (i.e., outer loop vectorization). */
2002 if (TREE_CODE (op1) == SSA_NAME)
2003 def1 = SSA_NAME_DEF_STMT (op1);
2004
2005 if (TREE_CODE (op2) == SSA_NAME)
2006 def2 = SSA_NAME_DEF_STMT (op2);
2007
2008 if (code != COND_EXPR
2009 && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
2010 {
2011 if (vect_print_dump_info (REPORT_DETAILS))
2012 report_vect_op (def_stmt, "reduction: no defs for operands: ");
2013 return NULL;
2014 }
2015
2016 /* Check that one def is the reduction def, defined by PHI,
2017 the other def is either defined in the loop ("vect_internal_def"),
2018 or it's an induction (defined by a loop-header phi-node). */
2019
2020 if (def2 && def2 == phi
2021 && (code == COND_EXPR
2022 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2023 && (is_gimple_assign (def1)
2024 || is_gimple_call (def1)
2025 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2026 == vect_induction_def
2027 || (gimple_code (def1) == GIMPLE_PHI
2028 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2029 == vect_internal_def
2030 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2031 {
2032 if (vect_print_dump_info (REPORT_DETAILS))
2033 report_vect_op (def_stmt, "detected reduction: ");
2034 return def_stmt;
2035 }
2036 else if (def1 && def1 == phi
2037 && (code == COND_EXPR
2038 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2039 && (is_gimple_assign (def2)
2040 || is_gimple_call (def2)
2041 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2042 == vect_induction_def
2043 || (gimple_code (def2) == GIMPLE_PHI
2044 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2045 == vect_internal_def
2046 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2047 {
2048 if (check_reduction)
2049 {
2050 /* Swap operands (just for simplicity - so that the rest of the code
2051 can assume that the reduction variable is always the last (second)
2052 argument). */
2053 if (vect_print_dump_info (REPORT_DETAILS))
2054 report_vect_op (def_stmt,
2055 "detected reduction: need to swap operands: ");
2056
2057 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2058 gimple_assign_rhs2_ptr (def_stmt));
2059 }
2060 else
2061 {
2062 if (vect_print_dump_info (REPORT_DETAILS))
2063 report_vect_op (def_stmt, "detected reduction: ");
2064 }
2065
2066 return def_stmt;
2067 }
2068 else
2069 {
2070 if (vect_print_dump_info (REPORT_DETAILS))
2071 report_vect_op (def_stmt, "reduction: unknown pattern: ");
2072
2073 return NULL;
2074 }
2075 }
2076
2077 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2078 in-place. Arguments as there. */
2079
2080 static gimple
2081 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2082 bool check_reduction, bool *double_reduc)
2083 {
2084 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2085 double_reduc, false);
2086 }
2087
2088 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2089 in-place if it enables detection of more reductions. Arguments
2090 as there. */
2091
2092 gimple
2093 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2094 bool check_reduction, bool *double_reduc)
2095 {
2096 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2097 double_reduc, true);
2098 }
2099
2100 /* Calculate the cost of one scalar iteration of the loop. */
2101 int
2102 vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo)
2103 {
2104 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2105 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2106 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2107 int innerloop_iters, i, stmt_cost;
2108
2109 /* Count statements in scalar loop. Using this as scalar cost for a single
2110 iteration for now.
2111
2112 TODO: Add outer loop support.
2113
2114 TODO: Consider assigning different costs to different scalar
2115 statements. */
2116
2117 /* FORNOW. */
2118 innerloop_iters = 1;
2119 if (loop->inner)
2120 innerloop_iters = 50; /* FIXME */
2121
2122 for (i = 0; i < nbbs; i++)
2123 {
2124 gimple_stmt_iterator si;
2125 basic_block bb = bbs[i];
2126
2127 if (bb->loop_father == loop->inner)
2128 factor = innerloop_iters;
2129 else
2130 factor = 1;
2131
2132 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2133 {
2134 gimple stmt = gsi_stmt (si);
2135 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2136
2137 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2138 continue;
2139
2140 /* Skip stmts that are not vectorized inside the loop. */
2141 if (stmt_info
2142 && !STMT_VINFO_RELEVANT_P (stmt_info)
2143 && (!STMT_VINFO_LIVE_P (stmt_info)
2144 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2145 continue;
2146
2147 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2148 {
2149 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2150 stmt_cost = vect_get_cost (scalar_load);
2151 else
2152 stmt_cost = vect_get_cost (scalar_store);
2153 }
2154 else
2155 stmt_cost = vect_get_cost (scalar_stmt);
2156
2157 scalar_single_iter_cost += stmt_cost * factor;
2158 }
2159 }
2160 return scalar_single_iter_cost;
2161 }
2162
2163 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2164 int
2165 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2166 int *peel_iters_epilogue,
2167 int scalar_single_iter_cost)
2168 {
2169 int peel_guard_costs = 0;
2170 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2171
2172 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2173 {
2174 *peel_iters_epilogue = vf/2;
2175 if (vect_print_dump_info (REPORT_COST))
2176 fprintf (vect_dump, "cost model: "
2177 "epilogue peel iters set to vf/2 because "
2178 "loop iterations are unknown .");
2179
2180 /* If peeled iterations are known but number of scalar loop
2181 iterations are unknown, count a taken branch per peeled loop. */
2182 peel_guard_costs = 2 * vect_get_cost (cond_branch_taken);
2183 }
2184 else
2185 {
2186 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2187 peel_iters_prologue = niters < peel_iters_prologue ?
2188 niters : peel_iters_prologue;
2189 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2190 }
2191
2192 return (peel_iters_prologue * scalar_single_iter_cost)
2193 + (*peel_iters_epilogue * scalar_single_iter_cost)
2194 + peel_guard_costs;
2195 }
2196
2197 /* Function vect_estimate_min_profitable_iters
2198
2199 Return the number of iterations required for the vector version of the
2200 loop to be profitable relative to the cost of the scalar version of the
2201 loop.
2202
2203 TODO: Take profile info into account before making vectorization
2204 decisions, if available. */
2205
2206 int
2207 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
2208 {
2209 int i;
2210 int min_profitable_iters;
2211 int peel_iters_prologue;
2212 int peel_iters_epilogue;
2213 int vec_inside_cost = 0;
2214 int vec_outside_cost = 0;
2215 int scalar_single_iter_cost = 0;
2216 int scalar_outside_cost = 0;
2217 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2218 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2219 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2220 int nbbs = loop->num_nodes;
2221 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2222 int peel_guard_costs = 0;
2223 int innerloop_iters = 0, factor;
2224 VEC (slp_instance, heap) *slp_instances;
2225 slp_instance instance;
2226
2227 /* Cost model disabled. */
2228 if (!flag_vect_cost_model)
2229 {
2230 if (vect_print_dump_info (REPORT_COST))
2231 fprintf (vect_dump, "cost model disabled.");
2232 return 0;
2233 }
2234
2235 /* Requires loop versioning tests to handle misalignment. */
2236 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2237 {
2238 /* FIXME: Make cost depend on complexity of individual check. */
2239 vec_outside_cost +=
2240 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2241 if (vect_print_dump_info (REPORT_COST))
2242 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2243 "versioning to treat misalignment.\n");
2244 }
2245
2246 /* Requires loop versioning with alias checks. */
2247 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2248 {
2249 /* FIXME: Make cost depend on complexity of individual check. */
2250 vec_outside_cost +=
2251 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2252 if (vect_print_dump_info (REPORT_COST))
2253 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2254 "versioning aliasing.\n");
2255 }
2256
2257 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2258 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2259 vec_outside_cost += vect_get_cost (cond_branch_taken);
2260
2261 /* Count statements in scalar loop. Using this as scalar cost for a single
2262 iteration for now.
2263
2264 TODO: Add outer loop support.
2265
2266 TODO: Consider assigning different costs to different scalar
2267 statements. */
2268
2269 /* FORNOW. */
2270 if (loop->inner)
2271 innerloop_iters = 50; /* FIXME */
2272
2273 for (i = 0; i < nbbs; i++)
2274 {
2275 gimple_stmt_iterator si;
2276 basic_block bb = bbs[i];
2277
2278 if (bb->loop_father == loop->inner)
2279 factor = innerloop_iters;
2280 else
2281 factor = 1;
2282
2283 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2284 {
2285 gimple stmt = gsi_stmt (si);
2286 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2287 /* Skip stmts that are not vectorized inside the loop. */
2288 if (!STMT_VINFO_RELEVANT_P (stmt_info)
2289 && (!STMT_VINFO_LIVE_P (stmt_info)
2290 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2291 continue;
2292 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2293 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2294 some of the "outside" costs are generated inside the outer-loop. */
2295 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2296 }
2297 }
2298
2299 scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
2300
2301 /* Add additional cost for the peeled instructions in prologue and epilogue
2302 loop.
2303
2304 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2305 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2306
2307 TODO: Build an expression that represents peel_iters for prologue and
2308 epilogue to be used in a run-time test. */
2309
2310 if (npeel < 0)
2311 {
2312 peel_iters_prologue = vf/2;
2313 if (vect_print_dump_info (REPORT_COST))
2314 fprintf (vect_dump, "cost model: "
2315 "prologue peel iters set to vf/2.");
2316
2317 /* If peeling for alignment is unknown, loop bound of main loop becomes
2318 unknown. */
2319 peel_iters_epilogue = vf/2;
2320 if (vect_print_dump_info (REPORT_COST))
2321 fprintf (vect_dump, "cost model: "
2322 "epilogue peel iters set to vf/2 because "
2323 "peeling for alignment is unknown .");
2324
2325 /* If peeled iterations are unknown, count a taken branch and a not taken
2326 branch per peeled loop. Even if scalar loop iterations are known,
2327 vector iterations are not known since peeled prologue iterations are
2328 not known. Hence guards remain the same. */
2329 peel_guard_costs += 2 * (vect_get_cost (cond_branch_taken)
2330 + vect_get_cost (cond_branch_not_taken));
2331 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2332 + (peel_iters_epilogue * scalar_single_iter_cost)
2333 + peel_guard_costs;
2334 }
2335 else
2336 {
2337 peel_iters_prologue = npeel;
2338 vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2339 peel_iters_prologue, &peel_iters_epilogue,
2340 scalar_single_iter_cost);
2341 }
2342
2343 /* FORNOW: The scalar outside cost is incremented in one of the
2344 following ways:
2345
2346 1. The vectorizer checks for alignment and aliasing and generates
2347 a condition that allows dynamic vectorization. A cost model
2348 check is ANDED with the versioning condition. Hence scalar code
2349 path now has the added cost of the versioning check.
2350
2351 if (cost > th & versioning_check)
2352 jmp to vector code
2353
2354 Hence run-time scalar is incremented by not-taken branch cost.
2355
2356 2. The vectorizer then checks if a prologue is required. If the
2357 cost model check was not done before during versioning, it has to
2358 be done before the prologue check.
2359
2360 if (cost <= th)
2361 prologue = scalar_iters
2362 if (prologue == 0)
2363 jmp to vector code
2364 else
2365 execute prologue
2366 if (prologue == num_iters)
2367 go to exit
2368
2369 Hence the run-time scalar cost is incremented by a taken branch,
2370 plus a not-taken branch, plus a taken branch cost.
2371
2372 3. The vectorizer then checks if an epilogue is required. If the
2373 cost model check was not done before during prologue check, it
2374 has to be done with the epilogue check.
2375
2376 if (prologue == 0)
2377 jmp to vector code
2378 else
2379 execute prologue
2380 if (prologue == num_iters)
2381 go to exit
2382 vector code:
2383 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2384 jmp to epilogue
2385
2386 Hence the run-time scalar cost should be incremented by 2 taken
2387 branches.
2388
2389 TODO: The back end may reorder the BBS's differently and reverse
2390 conditions/branch directions. Change the estimates below to
2391 something more reasonable. */
2392
2393 /* If the number of iterations is known and we do not do versioning, we can
2394 decide whether to vectorize at compile time. Hence the scalar version
2395 do not carry cost model guard costs. */
2396 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2397 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2398 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2399 {
2400 /* Cost model check occurs at versioning. */
2401 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2402 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2403 scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2404 else
2405 {
2406 /* Cost model check occurs at prologue generation. */
2407 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2408 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2409 + vect_get_cost (cond_branch_not_taken);
2410 /* Cost model check occurs at epilogue generation. */
2411 else
2412 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2413 }
2414 }
2415
2416 /* Add SLP costs. */
2417 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2418 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance)
2419 {
2420 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2421 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2422 }
2423
2424 /* Calculate number of iterations required to make the vector version
2425 profitable, relative to the loop bodies only. The following condition
2426 must hold true:
2427 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2428 where
2429 SIC = scalar iteration cost, VIC = vector iteration cost,
2430 VOC = vector outside cost, VF = vectorization factor,
2431 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2432 SOC = scalar outside cost for run time cost model check. */
2433
2434 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2435 {
2436 if (vec_outside_cost <= 0)
2437 min_profitable_iters = 1;
2438 else
2439 {
2440 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2441 - vec_inside_cost * peel_iters_prologue
2442 - vec_inside_cost * peel_iters_epilogue)
2443 / ((scalar_single_iter_cost * vf)
2444 - vec_inside_cost);
2445
2446 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2447 <= ((vec_inside_cost * min_profitable_iters)
2448 + ((vec_outside_cost - scalar_outside_cost) * vf)))
2449 min_profitable_iters++;
2450 }
2451 }
2452 /* vector version will never be profitable. */
2453 else
2454 {
2455 if (vect_print_dump_info (REPORT_COST))
2456 fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2457 "divided by the scalar iteration cost = %d "
2458 "is greater or equal to the vectorization factor = %d.",
2459 vec_inside_cost, scalar_single_iter_cost, vf);
2460 return -1;
2461 }
2462
2463 if (vect_print_dump_info (REPORT_COST))
2464 {
2465 fprintf (vect_dump, "Cost model analysis: \n");
2466 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
2467 vec_inside_cost);
2468 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
2469 vec_outside_cost);
2470 fprintf (vect_dump, " Scalar iteration cost: %d\n",
2471 scalar_single_iter_cost);
2472 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
2473 fprintf (vect_dump, " prologue iterations: %d\n",
2474 peel_iters_prologue);
2475 fprintf (vect_dump, " epilogue iterations: %d\n",
2476 peel_iters_epilogue);
2477 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
2478 min_profitable_iters);
2479 }
2480
2481 min_profitable_iters =
2482 min_profitable_iters < vf ? vf : min_profitable_iters;
2483
2484 /* Because the condition we create is:
2485 if (niters <= min_profitable_iters)
2486 then skip the vectorized loop. */
2487 min_profitable_iters--;
2488
2489 if (vect_print_dump_info (REPORT_COST))
2490 fprintf (vect_dump, " Profitability threshold = %d\n",
2491 min_profitable_iters);
2492
2493 return min_profitable_iters;
2494 }
2495
2496
2497 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2498 functions. Design better to avoid maintenance issues. */
2499
2500 /* Function vect_model_reduction_cost.
2501
2502 Models cost for a reduction operation, including the vector ops
2503 generated within the strip-mine loop, the initial definition before
2504 the loop, and the epilogue code that must be generated. */
2505
2506 static bool
2507 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2508 int ncopies)
2509 {
2510 int outer_cost = 0;
2511 enum tree_code code;
2512 optab optab;
2513 tree vectype;
2514 gimple stmt, orig_stmt;
2515 tree reduction_op;
2516 enum machine_mode mode;
2517 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2518 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2519
2520
2521 /* Cost of reduction op inside loop. */
2522 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2523 += ncopies * vect_get_cost (vector_stmt);
2524
2525 stmt = STMT_VINFO_STMT (stmt_info);
2526
2527 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2528 {
2529 case GIMPLE_SINGLE_RHS:
2530 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2531 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2532 break;
2533 case GIMPLE_UNARY_RHS:
2534 reduction_op = gimple_assign_rhs1 (stmt);
2535 break;
2536 case GIMPLE_BINARY_RHS:
2537 reduction_op = gimple_assign_rhs2 (stmt);
2538 break;
2539 case GIMPLE_TERNARY_RHS:
2540 reduction_op = gimple_assign_rhs3 (stmt);
2541 break;
2542 default:
2543 gcc_unreachable ();
2544 }
2545
2546 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2547 if (!vectype)
2548 {
2549 if (vect_print_dump_info (REPORT_COST))
2550 {
2551 fprintf (vect_dump, "unsupported data-type ");
2552 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2553 }
2554 return false;
2555 }
2556
2557 mode = TYPE_MODE (vectype);
2558 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2559
2560 if (!orig_stmt)
2561 orig_stmt = STMT_VINFO_STMT (stmt_info);
2562
2563 code = gimple_assign_rhs_code (orig_stmt);
2564
2565 /* Add in cost for initial definition. */
2566 outer_cost += vect_get_cost (scalar_to_vec);
2567
2568 /* Determine cost of epilogue code.
2569
2570 We have a reduction operator that will reduce the vector in one statement.
2571 Also requires scalar extract. */
2572
2573 if (!nested_in_vect_loop_p (loop, orig_stmt))
2574 {
2575 if (reduc_code != ERROR_MARK)
2576 outer_cost += vect_get_cost (vector_stmt)
2577 + vect_get_cost (vec_to_scalar);
2578 else
2579 {
2580 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2581 tree bitsize =
2582 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2583 int element_bitsize = tree_low_cst (bitsize, 1);
2584 int nelements = vec_size_in_bits / element_bitsize;
2585
2586 optab = optab_for_tree_code (code, vectype, optab_default);
2587
2588 /* We have a whole vector shift available. */
2589 if (VECTOR_MODE_P (mode)
2590 && optab_handler (optab, mode) != CODE_FOR_nothing
2591 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2592 /* Final reduction via vector shifts and the reduction operator. Also
2593 requires scalar extract. */
2594 outer_cost += ((exact_log2(nelements) * 2)
2595 * vect_get_cost (vector_stmt)
2596 + vect_get_cost (vec_to_scalar));
2597 else
2598 /* Use extracts and reduction op for final reduction. For N elements,
2599 we have N extracts and N-1 reduction ops. */
2600 outer_cost += ((nelements + nelements - 1)
2601 * vect_get_cost (vector_stmt));
2602 }
2603 }
2604
2605 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2606
2607 if (vect_print_dump_info (REPORT_COST))
2608 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2609 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2610 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2611
2612 return true;
2613 }
2614
2615
2616 /* Function vect_model_induction_cost.
2617
2618 Models cost for induction operations. */
2619
2620 static void
2621 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2622 {
2623 /* loop cost for vec_loop. */
2624 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2625 = ncopies * vect_get_cost (vector_stmt);
2626 /* prologue cost for vec_init and vec_step. */
2627 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2628 = 2 * vect_get_cost (scalar_to_vec);
2629
2630 if (vect_print_dump_info (REPORT_COST))
2631 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2632 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2633 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2634 }
2635
2636
2637 /* Function get_initial_def_for_induction
2638
2639 Input:
2640 STMT - a stmt that performs an induction operation in the loop.
2641 IV_PHI - the initial value of the induction variable
2642
2643 Output:
2644 Return a vector variable, initialized with the first VF values of
2645 the induction variable. E.g., for an iv with IV_PHI='X' and
2646 evolution S, for a vector of 4 units, we want to return:
2647 [X, X + S, X + 2*S, X + 3*S]. */
2648
2649 static tree
2650 get_initial_def_for_induction (gimple iv_phi)
2651 {
2652 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2653 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2654 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2655 tree scalar_type;
2656 tree vectype;
2657 int nunits;
2658 edge pe = loop_preheader_edge (loop);
2659 struct loop *iv_loop;
2660 basic_block new_bb;
2661 tree vec, vec_init, vec_step, t;
2662 tree access_fn;
2663 tree new_var;
2664 tree new_name;
2665 gimple init_stmt, induction_phi, new_stmt;
2666 tree induc_def, vec_def, vec_dest;
2667 tree init_expr, step_expr;
2668 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2669 int i;
2670 bool ok;
2671 int ncopies;
2672 tree expr;
2673 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2674 bool nested_in_vect_loop = false;
2675 gimple_seq stmts = NULL;
2676 imm_use_iterator imm_iter;
2677 use_operand_p use_p;
2678 gimple exit_phi;
2679 edge latch_e;
2680 tree loop_arg;
2681 gimple_stmt_iterator si;
2682 basic_block bb = gimple_bb (iv_phi);
2683 tree stepvectype;
2684 tree resvectype;
2685
2686 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
2687 if (nested_in_vect_loop_p (loop, iv_phi))
2688 {
2689 nested_in_vect_loop = true;
2690 iv_loop = loop->inner;
2691 }
2692 else
2693 iv_loop = loop;
2694 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2695
2696 latch_e = loop_latch_edge (iv_loop);
2697 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2698
2699 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2700 gcc_assert (access_fn);
2701 STRIP_NOPS (access_fn);
2702 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2703 &init_expr, &step_expr);
2704 gcc_assert (ok);
2705 pe = loop_preheader_edge (iv_loop);
2706
2707 scalar_type = TREE_TYPE (init_expr);
2708 vectype = get_vectype_for_scalar_type (scalar_type);
2709 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
2710 gcc_assert (vectype);
2711 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2712 ncopies = vf / nunits;
2713
2714 gcc_assert (phi_info);
2715 gcc_assert (ncopies >= 1);
2716
2717 /* Find the first insertion point in the BB. */
2718 si = gsi_after_labels (bb);
2719
2720 /* Create the vector that holds the initial_value of the induction. */
2721 if (nested_in_vect_loop)
2722 {
2723 /* iv_loop is nested in the loop to be vectorized. init_expr had already
2724 been created during vectorization of previous stmts. We obtain it
2725 from the STMT_VINFO_VEC_STMT of the defining stmt. */
2726 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2727 loop_preheader_edge (iv_loop));
2728 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2729 }
2730 else
2731 {
2732 /* iv_loop is the loop to be vectorized. Create:
2733 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
2734 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2735 add_referenced_var (new_var);
2736
2737 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2738 if (stmts)
2739 {
2740 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2741 gcc_assert (!new_bb);
2742 }
2743
2744 t = NULL_TREE;
2745 t = tree_cons (NULL_TREE, new_name, t);
2746 for (i = 1; i < nunits; i++)
2747 {
2748 /* Create: new_name_i = new_name + step_expr */
2749 enum tree_code code = POINTER_TYPE_P (scalar_type)
2750 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2751 init_stmt = gimple_build_assign_with_ops (code, new_var,
2752 new_name, step_expr);
2753 new_name = make_ssa_name (new_var, init_stmt);
2754 gimple_assign_set_lhs (init_stmt, new_name);
2755
2756 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2757 gcc_assert (!new_bb);
2758
2759 if (vect_print_dump_info (REPORT_DETAILS))
2760 {
2761 fprintf (vect_dump, "created new init_stmt: ");
2762 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2763 }
2764 t = tree_cons (NULL_TREE, new_name, t);
2765 }
2766 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
2767 vec = build_constructor_from_list (vectype, nreverse (t));
2768 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2769 }
2770
2771
2772 /* Create the vector that holds the step of the induction. */
2773 if (nested_in_vect_loop)
2774 /* iv_loop is nested in the loop to be vectorized. Generate:
2775 vec_step = [S, S, S, S] */
2776 new_name = step_expr;
2777 else
2778 {
2779 /* iv_loop is the loop to be vectorized. Generate:
2780 vec_step = [VF*S, VF*S, VF*S, VF*S] */
2781 expr = build_int_cst (TREE_TYPE (step_expr), vf);
2782 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2783 expr, step_expr);
2784 }
2785
2786 t = unshare_expr (new_name);
2787 gcc_assert (CONSTANT_CLASS_P (new_name));
2788 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2789 gcc_assert (stepvectype);
2790 vec = build_vector_from_val (stepvectype, t);
2791 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2792
2793
2794 /* Create the following def-use cycle:
2795 loop prolog:
2796 vec_init = ...
2797 vec_step = ...
2798 loop:
2799 vec_iv = PHI <vec_init, vec_loop>
2800 ...
2801 STMT
2802 ...
2803 vec_loop = vec_iv + vec_step; */
2804
2805 /* Create the induction-phi that defines the induction-operand. */
2806 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2807 add_referenced_var (vec_dest);
2808 induction_phi = create_phi_node (vec_dest, iv_loop->header);
2809 set_vinfo_for_stmt (induction_phi,
2810 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2811 induc_def = PHI_RESULT (induction_phi);
2812
2813 /* Create the iv update inside the loop */
2814 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2815 induc_def, vec_step);
2816 vec_def = make_ssa_name (vec_dest, new_stmt);
2817 gimple_assign_set_lhs (new_stmt, vec_def);
2818 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2819 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2820 NULL));
2821
2822 /* Set the arguments of the phi node: */
2823 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
2824 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
2825 UNKNOWN_LOCATION);
2826
2827
2828 /* In case that vectorization factor (VF) is bigger than the number
2829 of elements that we can fit in a vectype (nunits), we have to generate
2830 more than one vector stmt - i.e - we need to "unroll" the
2831 vector stmt by a factor VF/nunits. For more details see documentation
2832 in vectorizable_operation. */
2833
2834 if (ncopies > 1)
2835 {
2836 stmt_vec_info prev_stmt_vinfo;
2837 /* FORNOW. This restriction should be relaxed. */
2838 gcc_assert (!nested_in_vect_loop);
2839
2840 /* Create the vector that holds the step of the induction. */
2841 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2842 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2843 expr, step_expr);
2844 t = unshare_expr (new_name);
2845 gcc_assert (CONSTANT_CLASS_P (new_name));
2846 vec = build_vector_from_val (stepvectype, t);
2847 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2848
2849 vec_def = induc_def;
2850 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2851 for (i = 1; i < ncopies; i++)
2852 {
2853 /* vec_i = vec_prev + vec_step */
2854 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2855 vec_def, vec_step);
2856 vec_def = make_ssa_name (vec_dest, new_stmt);
2857 gimple_assign_set_lhs (new_stmt, vec_def);
2858
2859 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2860 if (!useless_type_conversion_p (resvectype, vectype))
2861 {
2862 new_stmt = gimple_build_assign_with_ops
2863 (VIEW_CONVERT_EXPR,
2864 vect_get_new_vect_var (resvectype, vect_simple_var,
2865 "vec_iv_"),
2866 build1 (VIEW_CONVERT_EXPR, resvectype,
2867 gimple_assign_lhs (new_stmt)), NULL_TREE);
2868 gimple_assign_set_lhs (new_stmt,
2869 make_ssa_name
2870 (gimple_assign_lhs (new_stmt), new_stmt));
2871 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2872 }
2873 set_vinfo_for_stmt (new_stmt,
2874 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2875 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2876 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2877 }
2878 }
2879
2880 if (nested_in_vect_loop)
2881 {
2882 /* Find the loop-closed exit-phi of the induction, and record
2883 the final vector of induction results: */
2884 exit_phi = NULL;
2885 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2886 {
2887 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
2888 {
2889 exit_phi = USE_STMT (use_p);
2890 break;
2891 }
2892 }
2893 if (exit_phi)
2894 {
2895 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2896 /* FORNOW. Currently not supporting the case that an inner-loop induction
2897 is not used in the outer-loop (i.e. only outside the outer-loop). */
2898 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2899 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2900
2901 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2902 if (vect_print_dump_info (REPORT_DETAILS))
2903 {
2904 fprintf (vect_dump, "vector of inductions after inner-loop:");
2905 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2906 }
2907 }
2908 }
2909
2910
2911 if (vect_print_dump_info (REPORT_DETAILS))
2912 {
2913 fprintf (vect_dump, "transform induction: created def-use cycle: ");
2914 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2915 fprintf (vect_dump, "\n");
2916 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2917 }
2918
2919 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2920 if (!useless_type_conversion_p (resvectype, vectype))
2921 {
2922 new_stmt = gimple_build_assign_with_ops
2923 (VIEW_CONVERT_EXPR,
2924 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
2925 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
2926 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
2927 gimple_assign_set_lhs (new_stmt, induc_def);
2928 si = gsi_start_bb (bb);
2929 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2930 set_vinfo_for_stmt (new_stmt,
2931 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2932 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
2933 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
2934 }
2935
2936 return induc_def;
2937 }
2938
2939
2940 /* Function get_initial_def_for_reduction
2941
2942 Input:
2943 STMT - a stmt that performs a reduction operation in the loop.
2944 INIT_VAL - the initial value of the reduction variable
2945
2946 Output:
2947 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2948 of the reduction (used for adjusting the epilog - see below).
2949 Return a vector variable, initialized according to the operation that STMT
2950 performs. This vector will be used as the initial value of the
2951 vector of partial results.
2952
2953 Option1 (adjust in epilog): Initialize the vector as follows:
2954 add/bit or/xor: [0,0,...,0,0]
2955 mult/bit and: [1,1,...,1,1]
2956 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
2957 and when necessary (e.g. add/mult case) let the caller know
2958 that it needs to adjust the result by init_val.
2959
2960 Option2: Initialize the vector as follows:
2961 add/bit or/xor: [init_val,0,0,...,0]
2962 mult/bit and: [init_val,1,1,...,1]
2963 min/max/cond_expr: [init_val,init_val,...,init_val]
2964 and no adjustments are needed.
2965
2966 For example, for the following code:
2967
2968 s = init_val;
2969 for (i=0;i<n;i++)
2970 s = s + a[i];
2971
2972 STMT is 's = s + a[i]', and the reduction variable is 's'.
2973 For a vector of 4 units, we want to return either [0,0,0,init_val],
2974 or [0,0,0,0] and let the caller know that it needs to adjust
2975 the result at the end by 'init_val'.
2976
2977 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2978 initialization vector is simpler (same element in all entries), if
2979 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
2980
2981 A cost model should help decide between these two schemes. */
2982
2983 tree
2984 get_initial_def_for_reduction (gimple stmt, tree init_val,
2985 tree *adjustment_def)
2986 {
2987 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2988 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2989 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2990 tree scalar_type = TREE_TYPE (init_val);
2991 tree vectype = get_vectype_for_scalar_type (scalar_type);
2992 int nunits;
2993 enum tree_code code = gimple_assign_rhs_code (stmt);
2994 tree def_for_init;
2995 tree init_def;
2996 tree t = NULL_TREE;
2997 int i;
2998 bool nested_in_vect_loop = false;
2999 tree init_value;
3000 REAL_VALUE_TYPE real_init_val = dconst0;
3001 int int_init_val = 0;
3002 gimple def_stmt = NULL;
3003
3004 gcc_assert (vectype);
3005 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3006
3007 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3008 || SCALAR_FLOAT_TYPE_P (scalar_type));
3009
3010 if (nested_in_vect_loop_p (loop, stmt))
3011 nested_in_vect_loop = true;
3012 else
3013 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3014
3015 /* In case of double reduction we only create a vector variable to be put
3016 in the reduction phi node. The actual statement creation is done in
3017 vect_create_epilog_for_reduction. */
3018 if (adjustment_def && nested_in_vect_loop
3019 && TREE_CODE (init_val) == SSA_NAME
3020 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3021 && gimple_code (def_stmt) == GIMPLE_PHI
3022 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3023 && vinfo_for_stmt (def_stmt)
3024 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3025 == vect_double_reduction_def)
3026 {
3027 *adjustment_def = NULL;
3028 return vect_create_destination_var (init_val, vectype);
3029 }
3030
3031 if (TREE_CONSTANT (init_val))
3032 {
3033 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3034 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3035 else
3036 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3037 }
3038 else
3039 init_value = init_val;
3040
3041 switch (code)
3042 {
3043 case WIDEN_SUM_EXPR:
3044 case DOT_PROD_EXPR:
3045 case PLUS_EXPR:
3046 case MINUS_EXPR:
3047 case BIT_IOR_EXPR:
3048 case BIT_XOR_EXPR:
3049 case MULT_EXPR:
3050 case BIT_AND_EXPR:
3051 /* ADJUSMENT_DEF is NULL when called from
3052 vect_create_epilog_for_reduction to vectorize double reduction. */
3053 if (adjustment_def)
3054 {
3055 if (nested_in_vect_loop)
3056 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3057 NULL);
3058 else
3059 *adjustment_def = init_val;
3060 }
3061
3062 if (code == MULT_EXPR)
3063 {
3064 real_init_val = dconst1;
3065 int_init_val = 1;
3066 }
3067
3068 if (code == BIT_AND_EXPR)
3069 int_init_val = -1;
3070
3071 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3072 def_for_init = build_real (scalar_type, real_init_val);
3073 else
3074 def_for_init = build_int_cst (scalar_type, int_init_val);
3075
3076 /* Create a vector of '0' or '1' except the first element. */
3077 for (i = nunits - 2; i >= 0; --i)
3078 t = tree_cons (NULL_TREE, def_for_init, t);
3079
3080 /* Option1: the first element is '0' or '1' as well. */
3081 if (adjustment_def)
3082 {
3083 t = tree_cons (NULL_TREE, def_for_init, t);
3084 init_def = build_vector (vectype, t);
3085 break;
3086 }
3087
3088 /* Option2: the first element is INIT_VAL. */
3089 t = tree_cons (NULL_TREE, init_value, t);
3090 if (TREE_CONSTANT (init_val))
3091 init_def = build_vector (vectype, t);
3092 else
3093 init_def = build_constructor_from_list (vectype, t);
3094
3095 break;
3096
3097 case MIN_EXPR:
3098 case MAX_EXPR:
3099 case COND_EXPR:
3100 if (adjustment_def)
3101 {
3102 *adjustment_def = NULL_TREE;
3103 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3104 break;
3105 }
3106
3107 init_def = build_vector_from_val (vectype, init_value);
3108 break;
3109
3110 default:
3111 gcc_unreachable ();
3112 }
3113
3114 return init_def;
3115 }
3116
3117
3118 /* Function vect_create_epilog_for_reduction
3119
3120 Create code at the loop-epilog to finalize the result of a reduction
3121 computation.
3122
3123 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3124 reduction statements.
3125 STMT is the scalar reduction stmt that is being vectorized.
3126 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3127 number of elements that we can fit in a vectype (nunits). In this case
3128 we have to generate more than one vector stmt - i.e - we need to "unroll"
3129 the vector stmt by a factor VF/nunits. For more details see documentation
3130 in vectorizable_operation.
3131 REDUC_CODE is the tree-code for the epilog reduction.
3132 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3133 computation.
3134 REDUC_INDEX is the index of the operand in the right hand side of the
3135 statement that is defined by REDUCTION_PHI.
3136 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3137 SLP_NODE is an SLP node containing a group of reduction statements. The
3138 first one in this group is STMT.
3139
3140 This function:
3141 1. Creates the reduction def-use cycles: sets the arguments for
3142 REDUCTION_PHIS:
3143 The loop-entry argument is the vectorized initial-value of the reduction.
3144 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3145 sums.
3146 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3147 by applying the operation specified by REDUC_CODE if available, or by
3148 other means (whole-vector shifts or a scalar loop).
3149 The function also creates a new phi node at the loop exit to preserve
3150 loop-closed form, as illustrated below.
3151
3152 The flow at the entry to this function:
3153
3154 loop:
3155 vec_def = phi <null, null> # REDUCTION_PHI
3156 VECT_DEF = vector_stmt # vectorized form of STMT
3157 s_loop = scalar_stmt # (scalar) STMT
3158 loop_exit:
3159 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3160 use <s_out0>
3161 use <s_out0>
3162
3163 The above is transformed by this function into:
3164
3165 loop:
3166 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3167 VECT_DEF = vector_stmt # vectorized form of STMT
3168 s_loop = scalar_stmt # (scalar) STMT
3169 loop_exit:
3170 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3171 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3172 v_out2 = reduce <v_out1>
3173 s_out3 = extract_field <v_out2, 0>
3174 s_out4 = adjust_result <s_out3>
3175 use <s_out4>
3176 use <s_out4>
3177 */
3178
3179 static void
3180 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3181 int ncopies, enum tree_code reduc_code,
3182 VEC (gimple, heap) *reduction_phis,
3183 int reduc_index, bool double_reduc,
3184 slp_tree slp_node)
3185 {
3186 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3187 stmt_vec_info prev_phi_info;
3188 tree vectype;
3189 enum machine_mode mode;
3190 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3191 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3192 basic_block exit_bb;
3193 tree scalar_dest;
3194 tree scalar_type;
3195 gimple new_phi = NULL, phi;
3196 gimple_stmt_iterator exit_gsi;
3197 tree vec_dest;
3198 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3199 gimple epilog_stmt = NULL;
3200 enum tree_code code = gimple_assign_rhs_code (stmt);
3201 gimple exit_phi;
3202 tree bitsize, bitpos;
3203 tree adjustment_def = NULL;
3204 tree vec_initial_def = NULL;
3205 tree reduction_op, expr, def;
3206 tree orig_name, scalar_result;
3207 imm_use_iterator imm_iter, phi_imm_iter;
3208 use_operand_p use_p, phi_use_p;
3209 bool extract_scalar_result = false;
3210 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3211 bool nested_in_vect_loop = false;
3212 VEC (gimple, heap) *new_phis = NULL;
3213 enum vect_def_type dt = vect_unknown_def_type;
3214 int j, i;
3215 VEC (tree, heap) *scalar_results = NULL;
3216 unsigned int group_size = 1, k, ratio;
3217 VEC (tree, heap) *vec_initial_defs = NULL;
3218 VEC (gimple, heap) *phis;
3219
3220 if (slp_node)
3221 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3222
3223 if (nested_in_vect_loop_p (loop, stmt))
3224 {
3225 outer_loop = loop;
3226 loop = loop->inner;
3227 nested_in_vect_loop = true;
3228 gcc_assert (!slp_node);
3229 }
3230
3231 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3232 {
3233 case GIMPLE_SINGLE_RHS:
3234 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3235 == ternary_op);
3236 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3237 break;
3238 case GIMPLE_UNARY_RHS:
3239 reduction_op = gimple_assign_rhs1 (stmt);
3240 break;
3241 case GIMPLE_BINARY_RHS:
3242 reduction_op = reduc_index ?
3243 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3244 break;
3245 case GIMPLE_TERNARY_RHS:
3246 reduction_op = gimple_op (stmt, reduc_index + 1);
3247 break;
3248 default:
3249 gcc_unreachable ();
3250 }
3251
3252 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3253 gcc_assert (vectype);
3254 mode = TYPE_MODE (vectype);
3255
3256 /* 1. Create the reduction def-use cycle:
3257 Set the arguments of REDUCTION_PHIS, i.e., transform
3258
3259 loop:
3260 vec_def = phi <null, null> # REDUCTION_PHI
3261 VECT_DEF = vector_stmt # vectorized form of STMT
3262 ...
3263
3264 into:
3265
3266 loop:
3267 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3268 VECT_DEF = vector_stmt # vectorized form of STMT
3269 ...
3270
3271 (in case of SLP, do it for all the phis). */
3272
3273 /* Get the loop-entry arguments. */
3274 if (slp_node)
3275 vect_get_slp_defs (reduction_op, NULL_TREE, slp_node, &vec_initial_defs,
3276 NULL, reduc_index);
3277 else
3278 {
3279 vec_initial_defs = VEC_alloc (tree, heap, 1);
3280 /* For the case of reduction, vect_get_vec_def_for_operand returns
3281 the scalar def before the loop, that defines the initial value
3282 of the reduction variable. */
3283 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3284 &adjustment_def);
3285 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3286 }
3287
3288 /* Set phi nodes arguments. */
3289 FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3290 {
3291 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3292 tree def = VEC_index (tree, vect_defs, i);
3293 for (j = 0; j < ncopies; j++)
3294 {
3295 /* Set the loop-entry arg of the reduction-phi. */
3296 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3297 UNKNOWN_LOCATION);
3298
3299 /* Set the loop-latch arg for the reduction-phi. */
3300 if (j > 0)
3301 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3302
3303 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3304
3305 if (vect_print_dump_info (REPORT_DETAILS))
3306 {
3307 fprintf (vect_dump, "transform reduction: created def-use"
3308 " cycle: ");
3309 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3310 fprintf (vect_dump, "\n");
3311 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3312 TDF_SLIM);
3313 }
3314
3315 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3316 }
3317 }
3318
3319 VEC_free (tree, heap, vec_initial_defs);
3320
3321 /* 2. Create epilog code.
3322 The reduction epilog code operates across the elements of the vector
3323 of partial results computed by the vectorized loop.
3324 The reduction epilog code consists of:
3325
3326 step 1: compute the scalar result in a vector (v_out2)
3327 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3328 step 3: adjust the scalar result (s_out3) if needed.
3329
3330 Step 1 can be accomplished using one the following three schemes:
3331 (scheme 1) using reduc_code, if available.
3332 (scheme 2) using whole-vector shifts, if available.
3333 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3334 combined.
3335
3336 The overall epilog code looks like this:
3337
3338 s_out0 = phi <s_loop> # original EXIT_PHI
3339 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3340 v_out2 = reduce <v_out1> # step 1
3341 s_out3 = extract_field <v_out2, 0> # step 2
3342 s_out4 = adjust_result <s_out3> # step 3
3343
3344 (step 3 is optional, and steps 1 and 2 may be combined).
3345 Lastly, the uses of s_out0 are replaced by s_out4. */
3346
3347
3348 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3349 v_out1 = phi <VECT_DEF>
3350 Store them in NEW_PHIS. */
3351
3352 exit_bb = single_exit (loop)->dest;
3353 prev_phi_info = NULL;
3354 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3355 FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3356 {
3357 for (j = 0; j < ncopies; j++)
3358 {
3359 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3360 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3361 if (j == 0)
3362 VEC_quick_push (gimple, new_phis, phi);
3363 else
3364 {
3365 def = vect_get_vec_def_for_stmt_copy (dt, def);
3366 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3367 }
3368
3369 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3370 prev_phi_info = vinfo_for_stmt (phi);
3371 }
3372 }
3373
3374 /* The epilogue is created for the outer-loop, i.e., for the loop being
3375 vectorized. */
3376 if (double_reduc)
3377 {
3378 loop = outer_loop;
3379 exit_bb = single_exit (loop)->dest;
3380 }
3381
3382 exit_gsi = gsi_after_labels (exit_bb);
3383
3384 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3385 (i.e. when reduc_code is not available) and in the final adjustment
3386 code (if needed). Also get the original scalar reduction variable as
3387 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3388 represents a reduction pattern), the tree-code and scalar-def are
3389 taken from the original stmt that the pattern-stmt (STMT) replaces.
3390 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3391 are taken from STMT. */
3392
3393 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3394 if (!orig_stmt)
3395 {
3396 /* Regular reduction */
3397 orig_stmt = stmt;
3398 }
3399 else
3400 {
3401 /* Reduction pattern */
3402 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3403 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3404 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3405 }
3406
3407 code = gimple_assign_rhs_code (orig_stmt);
3408 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3409 partial results are added and not subtracted. */
3410 if (code == MINUS_EXPR)
3411 code = PLUS_EXPR;
3412
3413 scalar_dest = gimple_assign_lhs (orig_stmt);
3414 scalar_type = TREE_TYPE (scalar_dest);
3415 scalar_results = VEC_alloc (tree, heap, group_size);
3416 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3417 bitsize = TYPE_SIZE (scalar_type);
3418
3419 /* In case this is a reduction in an inner-loop while vectorizing an outer
3420 loop - we don't need to extract a single scalar result at the end of the
3421 inner-loop (unless it is double reduction, i.e., the use of reduction is
3422 outside the outer-loop). The final vector of partial results will be used
3423 in the vectorized outer-loop, or reduced to a scalar result at the end of
3424 the outer-loop. */
3425 if (nested_in_vect_loop && !double_reduc)
3426 goto vect_finalize_reduction;
3427
3428 /* 2.3 Create the reduction code, using one of the three schemes described
3429 above. In SLP we simply need to extract all the elements from the
3430 vector (without reducing them), so we use scalar shifts. */
3431 if (reduc_code != ERROR_MARK && !slp_node)
3432 {
3433 tree tmp;
3434
3435 /*** Case 1: Create:
3436 v_out2 = reduc_expr <v_out1> */
3437
3438 if (vect_print_dump_info (REPORT_DETAILS))
3439 fprintf (vect_dump, "Reduce using direct vector reduction.");
3440
3441 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3442 new_phi = VEC_index (gimple, new_phis, 0);
3443 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
3444 epilog_stmt = gimple_build_assign (vec_dest, tmp);
3445 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3446 gimple_assign_set_lhs (epilog_stmt, new_temp);
3447 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3448
3449 extract_scalar_result = true;
3450 }
3451 else
3452 {
3453 enum tree_code shift_code = ERROR_MARK;
3454 bool have_whole_vector_shift = true;
3455 int bit_offset;
3456 int element_bitsize = tree_low_cst (bitsize, 1);
3457 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3458 tree vec_temp;
3459
3460 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3461 shift_code = VEC_RSHIFT_EXPR;
3462 else
3463 have_whole_vector_shift = false;
3464
3465 /* Regardless of whether we have a whole vector shift, if we're
3466 emulating the operation via tree-vect-generic, we don't want
3467 to use it. Only the first round of the reduction is likely
3468 to still be profitable via emulation. */
3469 /* ??? It might be better to emit a reduction tree code here, so that
3470 tree-vect-generic can expand the first round via bit tricks. */
3471 if (!VECTOR_MODE_P (mode))
3472 have_whole_vector_shift = false;
3473 else
3474 {
3475 optab optab = optab_for_tree_code (code, vectype, optab_default);
3476 if (optab_handler (optab, mode) == CODE_FOR_nothing)
3477 have_whole_vector_shift = false;
3478 }
3479
3480 if (have_whole_vector_shift && !slp_node)
3481 {
3482 /*** Case 2: Create:
3483 for (offset = VS/2; offset >= element_size; offset/=2)
3484 {
3485 Create: va' = vec_shift <va, offset>
3486 Create: va = vop <va, va'>
3487 } */
3488
3489 if (vect_print_dump_info (REPORT_DETAILS))
3490 fprintf (vect_dump, "Reduce using vector shifts");
3491
3492 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3493 new_phi = VEC_index (gimple, new_phis, 0);
3494 new_temp = PHI_RESULT (new_phi);
3495 for (bit_offset = vec_size_in_bits/2;
3496 bit_offset >= element_bitsize;
3497 bit_offset /= 2)
3498 {
3499 tree bitpos = size_int (bit_offset);
3500
3501 epilog_stmt = gimple_build_assign_with_ops (shift_code,
3502 vec_dest, new_temp, bitpos);
3503 new_name = make_ssa_name (vec_dest, epilog_stmt);
3504 gimple_assign_set_lhs (epilog_stmt, new_name);
3505 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3506
3507 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3508 new_name, new_temp);
3509 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3510 gimple_assign_set_lhs (epilog_stmt, new_temp);
3511 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3512 }
3513
3514 extract_scalar_result = true;
3515 }
3516 else
3517 {
3518 tree rhs;
3519
3520 /*** Case 3: Create:
3521 s = extract_field <v_out2, 0>
3522 for (offset = element_size;
3523 offset < vector_size;
3524 offset += element_size;)
3525 {
3526 Create: s' = extract_field <v_out2, offset>
3527 Create: s = op <s, s'> // For non SLP cases
3528 } */
3529
3530 if (vect_print_dump_info (REPORT_DETAILS))
3531 fprintf (vect_dump, "Reduce using scalar code. ");
3532
3533 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3534 FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
3535 {
3536 vec_temp = PHI_RESULT (new_phi);
3537 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3538 bitsize_zero_node);
3539 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3540 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3541 gimple_assign_set_lhs (epilog_stmt, new_temp);
3542 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3543
3544 /* In SLP we don't need to apply reduction operation, so we just
3545 collect s' values in SCALAR_RESULTS. */
3546 if (slp_node)
3547 VEC_safe_push (tree, heap, scalar_results, new_temp);
3548
3549 for (bit_offset = element_bitsize;
3550 bit_offset < vec_size_in_bits;
3551 bit_offset += element_bitsize)
3552 {
3553 tree bitpos = bitsize_int (bit_offset);
3554 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3555 bitsize, bitpos);
3556
3557 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3558 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3559 gimple_assign_set_lhs (epilog_stmt, new_name);
3560 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3561
3562 if (slp_node)
3563 {
3564 /* In SLP we don't need to apply reduction operation, so
3565 we just collect s' values in SCALAR_RESULTS. */
3566 new_temp = new_name;
3567 VEC_safe_push (tree, heap, scalar_results, new_name);
3568 }
3569 else
3570 {
3571 epilog_stmt = gimple_build_assign_with_ops (code,
3572 new_scalar_dest, new_name, new_temp);
3573 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3574 gimple_assign_set_lhs (epilog_stmt, new_temp);
3575 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3576 }
3577 }
3578 }
3579
3580 /* The only case where we need to reduce scalar results in SLP, is
3581 unrolling. If the size of SCALAR_RESULTS is greater than
3582 GROUP_SIZE, we reduce them combining elements modulo
3583 GROUP_SIZE. */
3584 if (slp_node)
3585 {
3586 tree res, first_res, new_res;
3587 gimple new_stmt;
3588
3589 /* Reduce multiple scalar results in case of SLP unrolling. */
3590 for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3591 j++)
3592 {
3593 first_res = VEC_index (tree, scalar_results, j % group_size);
3594 new_stmt = gimple_build_assign_with_ops (code,
3595 new_scalar_dest, first_res, res);
3596 new_res = make_ssa_name (new_scalar_dest, new_stmt);
3597 gimple_assign_set_lhs (new_stmt, new_res);
3598 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
3599 VEC_replace (tree, scalar_results, j % group_size, new_res);
3600 }
3601 }
3602 else
3603 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
3604 VEC_safe_push (tree, heap, scalar_results, new_temp);
3605
3606 extract_scalar_result = false;
3607 }
3608 }
3609
3610 /* 2.4 Extract the final scalar result. Create:
3611 s_out3 = extract_field <v_out2, bitpos> */
3612
3613 if (extract_scalar_result)
3614 {
3615 tree rhs;
3616
3617 if (vect_print_dump_info (REPORT_DETAILS))
3618 fprintf (vect_dump, "extract scalar result");
3619
3620 if (BYTES_BIG_ENDIAN)
3621 bitpos = size_binop (MULT_EXPR,
3622 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3623 TYPE_SIZE (scalar_type));
3624 else
3625 bitpos = bitsize_zero_node;
3626
3627 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3628 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3629 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3630 gimple_assign_set_lhs (epilog_stmt, new_temp);
3631 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3632 VEC_safe_push (tree, heap, scalar_results, new_temp);
3633 }
3634
3635 vect_finalize_reduction:
3636
3637 if (double_reduc)
3638 loop = loop->inner;
3639
3640 /* 2.5 Adjust the final result by the initial value of the reduction
3641 variable. (When such adjustment is not needed, then
3642 'adjustment_def' is zero). For example, if code is PLUS we create:
3643 new_temp = loop_exit_def + adjustment_def */
3644
3645 if (adjustment_def)
3646 {
3647 gcc_assert (!slp_node);
3648 if (nested_in_vect_loop)
3649 {
3650 new_phi = VEC_index (gimple, new_phis, 0);
3651 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3652 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3653 new_dest = vect_create_destination_var (scalar_dest, vectype);
3654 }
3655 else
3656 {
3657 new_temp = VEC_index (tree, scalar_results, 0);
3658 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3659 expr = build2 (code, scalar_type, new_temp, adjustment_def);
3660 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3661 }
3662
3663 epilog_stmt = gimple_build_assign (new_dest, expr);
3664 new_temp = make_ssa_name (new_dest, epilog_stmt);
3665 gimple_assign_set_lhs (epilog_stmt, new_temp);
3666 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3667 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3668 if (nested_in_vect_loop)
3669 {
3670 set_vinfo_for_stmt (epilog_stmt,
3671 new_stmt_vec_info (epilog_stmt, loop_vinfo,
3672 NULL));
3673 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3674 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3675
3676 if (!double_reduc)
3677 VEC_quick_push (tree, scalar_results, new_temp);
3678 else
3679 VEC_replace (tree, scalar_results, 0, new_temp);
3680 }
3681 else
3682 VEC_replace (tree, scalar_results, 0, new_temp);
3683
3684 VEC_replace (gimple, new_phis, 0, epilog_stmt);
3685 }
3686
3687 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
3688 phis with new adjusted scalar results, i.e., replace use <s_out0>
3689 with use <s_out4>.
3690
3691 Transform:
3692 loop_exit:
3693 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3694 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3695 v_out2 = reduce <v_out1>
3696 s_out3 = extract_field <v_out2, 0>
3697 s_out4 = adjust_result <s_out3>
3698 use <s_out0>
3699 use <s_out0>
3700
3701 into:
3702
3703 loop_exit:
3704 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3705 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3706 v_out2 = reduce <v_out1>
3707 s_out3 = extract_field <v_out2, 0>
3708 s_out4 = adjust_result <s_out3>
3709 use <s_out4>
3710 use <s_out4> */
3711
3712 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
3713 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
3714 need to match SCALAR_RESULTS with corresponding statements. The first
3715 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
3716 the first vector stmt, etc.
3717 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
3718 if (group_size > VEC_length (gimple, new_phis))
3719 {
3720 ratio = group_size / VEC_length (gimple, new_phis);
3721 gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
3722 }
3723 else
3724 ratio = 1;
3725
3726 for (k = 0; k < group_size; k++)
3727 {
3728 if (k % ratio == 0)
3729 {
3730 epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
3731 reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
3732 }
3733
3734 if (slp_node)
3735 {
3736 gimple current_stmt = VEC_index (gimple,
3737 SLP_TREE_SCALAR_STMTS (slp_node), k);
3738
3739 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
3740 /* SLP statements can't participate in patterns. */
3741 gcc_assert (!orig_stmt);
3742 scalar_dest = gimple_assign_lhs (current_stmt);
3743 }
3744
3745 phis = VEC_alloc (gimple, heap, 3);
3746 /* Find the loop-closed-use at the loop exit of the original scalar
3747 result. (The reduction result is expected to have two immediate uses -
3748 one at the latch block, and one at the loop exit). */
3749 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3750 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3751 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3752
3753 /* We expect to have found an exit_phi because of loop-closed-ssa
3754 form. */
3755 gcc_assert (!VEC_empty (gimple, phis));
3756
3757 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3758 {
3759 if (outer_loop)
3760 {
3761 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
3762 gimple vect_phi;
3763
3764 /* FORNOW. Currently not supporting the case that an inner-loop
3765 reduction is not used in the outer-loop (but only outside the
3766 outer-loop), unless it is double reduction. */
3767 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
3768 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
3769 || double_reduc);
3770
3771 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
3772 if (!double_reduc
3773 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
3774 != vect_double_reduction_def)
3775 continue;
3776
3777 /* Handle double reduction:
3778
3779 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
3780 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
3781 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
3782 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
3783
3784 At that point the regular reduction (stmt2 and stmt3) is
3785 already vectorized, as well as the exit phi node, stmt4.
3786 Here we vectorize the phi node of double reduction, stmt1, and
3787 update all relevant statements. */
3788
3789 /* Go through all the uses of s2 to find double reduction phi
3790 node, i.e., stmt1 above. */
3791 orig_name = PHI_RESULT (exit_phi);
3792 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3793 {
3794 stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3795 stmt_vec_info new_phi_vinfo;
3796 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3797 basic_block bb = gimple_bb (use_stmt);
3798 gimple use;
3799
3800 /* Check that USE_STMT is really double reduction phi
3801 node. */
3802 if (gimple_code (use_stmt) != GIMPLE_PHI
3803 || gimple_phi_num_args (use_stmt) != 2
3804 || !use_stmt_vinfo
3805 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
3806 != vect_double_reduction_def
3807 || bb->loop_father != outer_loop)
3808 continue;
3809
3810 /* Create vector phi node for double reduction:
3811 vs1 = phi <vs0, vs2>
3812 vs1 was created previously in this function by a call to
3813 vect_get_vec_def_for_operand and is stored in
3814 vec_initial_def;
3815 vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
3816 vs0 is created here. */
3817
3818 /* Create vector phi node. */
3819 vect_phi = create_phi_node (vec_initial_def, bb);
3820 new_phi_vinfo = new_stmt_vec_info (vect_phi,
3821 loop_vec_info_for_loop (outer_loop), NULL);
3822 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
3823
3824 /* Create vs0 - initial def of the double reduction phi. */
3825 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
3826 loop_preheader_edge (outer_loop));
3827 init_def = get_initial_def_for_reduction (stmt,
3828 preheader_arg, NULL);
3829 vect_phi_init = vect_init_vector (use_stmt, init_def,
3830 vectype, NULL);
3831
3832 /* Update phi node arguments with vs0 and vs2. */
3833 add_phi_arg (vect_phi, vect_phi_init,
3834 loop_preheader_edge (outer_loop),
3835 UNKNOWN_LOCATION);
3836 add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
3837 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
3838 if (vect_print_dump_info (REPORT_DETAILS))
3839 {
3840 fprintf (vect_dump, "created double reduction phi "
3841 "node: ");
3842 print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
3843 }
3844
3845 vect_phi_res = PHI_RESULT (vect_phi);
3846
3847 /* Replace the use, i.e., set the correct vs1 in the regular
3848 reduction phi node. FORNOW, NCOPIES is always 1, so the
3849 loop is redundant. */
3850 use = reduction_phi;
3851 for (j = 0; j < ncopies; j++)
3852 {
3853 edge pr_edge = loop_preheader_edge (loop);
3854 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3855 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3856 }
3857 }
3858 }
3859 }
3860
3861 VEC_free (gimple, heap, phis);
3862 if (nested_in_vect_loop)
3863 {
3864 if (double_reduc)
3865 loop = outer_loop;
3866 else
3867 continue;
3868 }
3869
3870 phis = VEC_alloc (gimple, heap, 3);
3871 /* Find the loop-closed-use at the loop exit of the original scalar
3872 result. (The reduction result is expected to have two immediate uses,
3873 one at the latch block, and one at the loop exit). For double
3874 reductions we are looking for exit phis of the outer loop. */
3875 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3876 {
3877 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3878 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3879 else
3880 {
3881 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
3882 {
3883 tree phi_res = PHI_RESULT (USE_STMT (use_p));
3884
3885 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
3886 {
3887 if (!flow_bb_inside_loop_p (loop,
3888 gimple_bb (USE_STMT (phi_use_p))))
3889 VEC_safe_push (gimple, heap, phis,
3890 USE_STMT (phi_use_p));
3891 }
3892 }
3893 }
3894 }
3895
3896 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3897 {
3898 /* Replace the uses: */
3899 orig_name = PHI_RESULT (exit_phi);
3900 scalar_result = VEC_index (tree, scalar_results, k);
3901 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3902 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3903 SET_USE (use_p, scalar_result);
3904 }
3905
3906 VEC_free (gimple, heap, phis);
3907 }
3908
3909 VEC_free (tree, heap, scalar_results);
3910 VEC_free (gimple, heap, new_phis);
3911 }
3912
3913
3914 /* Function vectorizable_reduction.
3915
3916 Check if STMT performs a reduction operation that can be vectorized.
3917 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3918 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3919 Return FALSE if not a vectorizable STMT, TRUE otherwise.
3920
3921 This function also handles reduction idioms (patterns) that have been
3922 recognized in advance during vect_pattern_recog. In this case, STMT may be
3923 of this form:
3924 X = pattern_expr (arg0, arg1, ..., X)
3925 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
3926 sequence that had been detected and replaced by the pattern-stmt (STMT).
3927
3928 In some cases of reduction patterns, the type of the reduction variable X is
3929 different than the type of the other arguments of STMT.
3930 In such cases, the vectype that is used when transforming STMT into a vector
3931 stmt is different than the vectype that is used to determine the
3932 vectorization factor, because it consists of a different number of elements
3933 than the actual number of elements that are being operated upon in parallel.
3934
3935 For example, consider an accumulation of shorts into an int accumulator.
3936 On some targets it's possible to vectorize this pattern operating on 8
3937 shorts at a time (hence, the vectype for purposes of determining the
3938 vectorization factor should be V8HI); on the other hand, the vectype that
3939 is used to create the vector form is actually V4SI (the type of the result).
3940
3941 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3942 indicates what is the actual level of parallelism (V8HI in the example), so
3943 that the right vectorization factor would be derived. This vectype
3944 corresponds to the type of arguments to the reduction stmt, and should *NOT*
3945 be used to create the vectorized stmt. The right vectype for the vectorized
3946 stmt is obtained from the type of the result X:
3947 get_vectype_for_scalar_type (TREE_TYPE (X))
3948
3949 This means that, contrary to "regular" reductions (or "regular" stmts in
3950 general), the following equation:
3951 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3952 does *NOT* necessarily hold for reduction patterns. */
3953
3954 bool
3955 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
3956 gimple *vec_stmt, slp_tree slp_node)
3957 {
3958 tree vec_dest;
3959 tree scalar_dest;
3960 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
3961 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3962 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
3963 tree vectype_in = NULL_TREE;
3964 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3965 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3966 enum tree_code code, orig_code, epilog_reduc_code;
3967 enum machine_mode vec_mode;
3968 int op_type;
3969 optab optab, reduc_optab;
3970 tree new_temp = NULL_TREE;
3971 tree def;
3972 gimple def_stmt;
3973 enum vect_def_type dt;
3974 gimple new_phi = NULL;
3975 tree scalar_type;
3976 bool is_simple_use;
3977 gimple orig_stmt;
3978 stmt_vec_info orig_stmt_info;
3979 tree expr = NULL_TREE;
3980 int i;
3981 int ncopies;
3982 int epilog_copies;
3983 stmt_vec_info prev_stmt_info, prev_phi_info;
3984 bool single_defuse_cycle = false;
3985 tree reduc_def = NULL_TREE;
3986 gimple new_stmt = NULL;
3987 int j;
3988 tree ops[3];
3989 bool nested_cycle = false, found_nested_cycle_def = false;
3990 gimple reduc_def_stmt = NULL;
3991 /* The default is that the reduction variable is the last in statement. */
3992 int reduc_index = 2;
3993 bool double_reduc = false, dummy;
3994 basic_block def_bb;
3995 struct loop * def_stmt_loop, *outer_loop = NULL;
3996 tree def_arg;
3997 gimple def_arg_stmt;
3998 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
3999 VEC (gimple, heap) *phis = NULL;
4000 int vec_num;
4001 tree def0, def1, tem;
4002
4003 if (nested_in_vect_loop_p (loop, stmt))
4004 {
4005 outer_loop = loop;
4006 loop = loop->inner;
4007 nested_cycle = true;
4008 }
4009
4010 /* 1. Is vectorizable reduction? */
4011 /* Not supportable if the reduction variable is used in the loop. */
4012 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
4013 return false;
4014
4015 /* Reductions that are not used even in an enclosing outer-loop,
4016 are expected to be "live" (used out of the loop). */
4017 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4018 && !STMT_VINFO_LIVE_P (stmt_info))
4019 return false;
4020
4021 /* Make sure it was already recognized as a reduction computation. */
4022 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4023 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4024 return false;
4025
4026 /* 2. Has this been recognized as a reduction pattern?
4027
4028 Check if STMT represents a pattern that has been recognized
4029 in earlier analysis stages. For stmts that represent a pattern,
4030 the STMT_VINFO_RELATED_STMT field records the last stmt in
4031 the original sequence that constitutes the pattern. */
4032
4033 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4034 if (orig_stmt)
4035 {
4036 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4037 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
4038 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4039 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4040 }
4041
4042 /* 3. Check the operands of the operation. The first operands are defined
4043 inside the loop body. The last operand is the reduction variable,
4044 which is defined by the loop-header-phi. */
4045
4046 gcc_assert (is_gimple_assign (stmt));
4047
4048 /* Flatten RHS. */
4049 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4050 {
4051 case GIMPLE_SINGLE_RHS:
4052 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4053 if (op_type == ternary_op)
4054 {
4055 tree rhs = gimple_assign_rhs1 (stmt);
4056 ops[0] = TREE_OPERAND (rhs, 0);
4057 ops[1] = TREE_OPERAND (rhs, 1);
4058 ops[2] = TREE_OPERAND (rhs, 2);
4059 code = TREE_CODE (rhs);
4060 }
4061 else
4062 return false;
4063 break;
4064
4065 case GIMPLE_BINARY_RHS:
4066 code = gimple_assign_rhs_code (stmt);
4067 op_type = TREE_CODE_LENGTH (code);
4068 gcc_assert (op_type == binary_op);
4069 ops[0] = gimple_assign_rhs1 (stmt);
4070 ops[1] = gimple_assign_rhs2 (stmt);
4071 break;
4072
4073 case GIMPLE_TERNARY_RHS:
4074 code = gimple_assign_rhs_code (stmt);
4075 op_type = TREE_CODE_LENGTH (code);
4076 gcc_assert (op_type == ternary_op);
4077 ops[0] = gimple_assign_rhs1 (stmt);
4078 ops[1] = gimple_assign_rhs2 (stmt);
4079 ops[2] = gimple_assign_rhs3 (stmt);
4080 break;
4081
4082 case GIMPLE_UNARY_RHS:
4083 return false;
4084
4085 default:
4086 gcc_unreachable ();
4087 }
4088
4089 scalar_dest = gimple_assign_lhs (stmt);
4090 scalar_type = TREE_TYPE (scalar_dest);
4091 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4092 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4093 return false;
4094
4095 /* All uses but the last are expected to be defined in the loop.
4096 The last use is the reduction variable. In case of nested cycle this
4097 assumption is not true: we use reduc_index to record the index of the
4098 reduction variable. */
4099 for (i = 0; i < op_type-1; i++)
4100 {
4101 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4102 if (i == 0 && code == COND_EXPR)
4103 continue;
4104
4105 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
4106 &def_stmt, &def, &dt, &tem);
4107 if (!vectype_in)
4108 vectype_in = tem;
4109 gcc_assert (is_simple_use);
4110 if (dt != vect_internal_def
4111 && dt != vect_external_def
4112 && dt != vect_constant_def
4113 && dt != vect_induction_def
4114 && !(dt == vect_nested_cycle && nested_cycle))
4115 return false;
4116
4117 if (dt == vect_nested_cycle)
4118 {
4119 found_nested_cycle_def = true;
4120 reduc_def_stmt = def_stmt;
4121 reduc_index = i;
4122 }
4123 }
4124
4125 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL, &def_stmt,
4126 &def, &dt, &tem);
4127 if (!vectype_in)
4128 vectype_in = tem;
4129 gcc_assert (is_simple_use);
4130 gcc_assert (dt == vect_reduction_def
4131 || dt == vect_nested_cycle
4132 || ((dt == vect_internal_def || dt == vect_external_def
4133 || dt == vect_constant_def || dt == vect_induction_def)
4134 && nested_cycle && found_nested_cycle_def));
4135 if (!found_nested_cycle_def)
4136 reduc_def_stmt = def_stmt;
4137
4138 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4139 if (orig_stmt)
4140 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4141 reduc_def_stmt,
4142 !nested_cycle,
4143 &dummy));
4144 else
4145 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4146 !nested_cycle, &dummy));
4147
4148 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4149 return false;
4150
4151 if (slp_node || PURE_SLP_STMT (stmt_info))
4152 ncopies = 1;
4153 else
4154 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4155 / TYPE_VECTOR_SUBPARTS (vectype_in));
4156
4157 gcc_assert (ncopies >= 1);
4158
4159 vec_mode = TYPE_MODE (vectype_in);
4160
4161 if (code == COND_EXPR)
4162 {
4163 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
4164 {
4165 if (vect_print_dump_info (REPORT_DETAILS))
4166 fprintf (vect_dump, "unsupported condition in reduction");
4167
4168 return false;
4169 }
4170 }
4171 else
4172 {
4173 /* 4. Supportable by target? */
4174
4175 /* 4.1. check support for the operation in the loop */
4176 optab = optab_for_tree_code (code, vectype_in, optab_default);
4177 if (!optab)
4178 {
4179 if (vect_print_dump_info (REPORT_DETAILS))
4180 fprintf (vect_dump, "no optab.");
4181
4182 return false;
4183 }
4184
4185 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4186 {
4187 if (vect_print_dump_info (REPORT_DETAILS))
4188 fprintf (vect_dump, "op not supported by target.");
4189
4190 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4191 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4192 < vect_min_worthwhile_factor (code))
4193 return false;
4194
4195 if (vect_print_dump_info (REPORT_DETAILS))
4196 fprintf (vect_dump, "proceeding using word mode.");
4197 }
4198
4199 /* Worthwhile without SIMD support? */
4200 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4201 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4202 < vect_min_worthwhile_factor (code))
4203 {
4204 if (vect_print_dump_info (REPORT_DETAILS))
4205 fprintf (vect_dump, "not worthwhile without SIMD support.");
4206
4207 return false;
4208 }
4209 }
4210
4211 /* 4.2. Check support for the epilog operation.
4212
4213 If STMT represents a reduction pattern, then the type of the
4214 reduction variable may be different than the type of the rest
4215 of the arguments. For example, consider the case of accumulation
4216 of shorts into an int accumulator; The original code:
4217 S1: int_a = (int) short_a;
4218 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4219
4220 was replaced with:
4221 STMT: int_acc = widen_sum <short_a, int_acc>
4222
4223 This means that:
4224 1. The tree-code that is used to create the vector operation in the
4225 epilog code (that reduces the partial results) is not the
4226 tree-code of STMT, but is rather the tree-code of the original
4227 stmt from the pattern that STMT is replacing. I.e, in the example
4228 above we want to use 'widen_sum' in the loop, but 'plus' in the
4229 epilog.
4230 2. The type (mode) we use to check available target support
4231 for the vector operation to be created in the *epilog*, is
4232 determined by the type of the reduction variable (in the example
4233 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4234 However the type (mode) we use to check available target support
4235 for the vector operation to be created *inside the loop*, is
4236 determined by the type of the other arguments to STMT (in the
4237 example we'd check this: optab_handler (widen_sum_optab,
4238 vect_short_mode)).
4239
4240 This is contrary to "regular" reductions, in which the types of all
4241 the arguments are the same as the type of the reduction variable.
4242 For "regular" reductions we can therefore use the same vector type
4243 (and also the same tree-code) when generating the epilog code and
4244 when generating the code inside the loop. */
4245
4246 if (orig_stmt)
4247 {
4248 /* This is a reduction pattern: get the vectype from the type of the
4249 reduction variable, and get the tree-code from orig_stmt. */
4250 orig_code = gimple_assign_rhs_code (orig_stmt);
4251 gcc_assert (vectype_out);
4252 vec_mode = TYPE_MODE (vectype_out);
4253 }
4254 else
4255 {
4256 /* Regular reduction: use the same vectype and tree-code as used for
4257 the vector code inside the loop can be used for the epilog code. */
4258 orig_code = code;
4259 }
4260
4261 if (nested_cycle)
4262 {
4263 def_bb = gimple_bb (reduc_def_stmt);
4264 def_stmt_loop = def_bb->loop_father;
4265 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4266 loop_preheader_edge (def_stmt_loop));
4267 if (TREE_CODE (def_arg) == SSA_NAME
4268 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4269 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4270 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4271 && vinfo_for_stmt (def_arg_stmt)
4272 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4273 == vect_double_reduction_def)
4274 double_reduc = true;
4275 }
4276
4277 epilog_reduc_code = ERROR_MARK;
4278 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4279 {
4280 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4281 optab_default);
4282 if (!reduc_optab)
4283 {
4284 if (vect_print_dump_info (REPORT_DETAILS))
4285 fprintf (vect_dump, "no optab for reduction.");
4286
4287 epilog_reduc_code = ERROR_MARK;
4288 }
4289
4290 if (reduc_optab
4291 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4292 {
4293 if (vect_print_dump_info (REPORT_DETAILS))
4294 fprintf (vect_dump, "reduc op not supported by target.");
4295
4296 epilog_reduc_code = ERROR_MARK;
4297 }
4298 }
4299 else
4300 {
4301 if (!nested_cycle || double_reduc)
4302 {
4303 if (vect_print_dump_info (REPORT_DETAILS))
4304 fprintf (vect_dump, "no reduc code for scalar code.");
4305
4306 return false;
4307 }
4308 }
4309
4310 if (double_reduc && ncopies > 1)
4311 {
4312 if (vect_print_dump_info (REPORT_DETAILS))
4313 fprintf (vect_dump, "multiple types in double reduction");
4314
4315 return false;
4316 }
4317
4318 if (!vec_stmt) /* transformation not required. */
4319 {
4320 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4321 return false;
4322 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4323 return true;
4324 }
4325
4326 /** Transform. **/
4327
4328 if (vect_print_dump_info (REPORT_DETAILS))
4329 fprintf (vect_dump, "transform reduction.");
4330
4331 /* FORNOW: Multiple types are not supported for condition. */
4332 if (code == COND_EXPR)
4333 gcc_assert (ncopies == 1);
4334
4335 /* Create the destination vector */
4336 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4337
4338 /* In case the vectorization factor (VF) is bigger than the number
4339 of elements that we can fit in a vectype (nunits), we have to generate
4340 more than one vector stmt - i.e - we need to "unroll" the
4341 vector stmt by a factor VF/nunits. For more details see documentation
4342 in vectorizable_operation. */
4343
4344 /* If the reduction is used in an outer loop we need to generate
4345 VF intermediate results, like so (e.g. for ncopies=2):
4346 r0 = phi (init, r0)
4347 r1 = phi (init, r1)
4348 r0 = x0 + r0;
4349 r1 = x1 + r1;
4350 (i.e. we generate VF results in 2 registers).
4351 In this case we have a separate def-use cycle for each copy, and therefore
4352 for each copy we get the vector def for the reduction variable from the
4353 respective phi node created for this copy.
4354
4355 Otherwise (the reduction is unused in the loop nest), we can combine
4356 together intermediate results, like so (e.g. for ncopies=2):
4357 r = phi (init, r)
4358 r = x0 + r;
4359 r = x1 + r;
4360 (i.e. we generate VF/2 results in a single register).
4361 In this case for each copy we get the vector def for the reduction variable
4362 from the vectorized reduction operation generated in the previous iteration.
4363 */
4364
4365 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4366 {
4367 single_defuse_cycle = true;
4368 epilog_copies = 1;
4369 }
4370 else
4371 epilog_copies = ncopies;
4372
4373 prev_stmt_info = NULL;
4374 prev_phi_info = NULL;
4375 if (slp_node)
4376 {
4377 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4378 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4379 == TYPE_VECTOR_SUBPARTS (vectype_in));
4380 }
4381 else
4382 {
4383 vec_num = 1;
4384 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4385 if (op_type == ternary_op)
4386 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4387 }
4388
4389 phis = VEC_alloc (gimple, heap, vec_num);
4390 vect_defs = VEC_alloc (tree, heap, vec_num);
4391 if (!slp_node)
4392 VEC_quick_push (tree, vect_defs, NULL_TREE);
4393
4394 for (j = 0; j < ncopies; j++)
4395 {
4396 if (j == 0 || !single_defuse_cycle)
4397 {
4398 for (i = 0; i < vec_num; i++)
4399 {
4400 /* Create the reduction-phi that defines the reduction
4401 operand. */
4402 new_phi = create_phi_node (vec_dest, loop->header);
4403 set_vinfo_for_stmt (new_phi,
4404 new_stmt_vec_info (new_phi, loop_vinfo,
4405 NULL));
4406 if (j == 0 || slp_node)
4407 VEC_quick_push (gimple, phis, new_phi);
4408 }
4409 }
4410
4411 if (code == COND_EXPR)
4412 {
4413 gcc_assert (!slp_node);
4414 vectorizable_condition (stmt, gsi, vec_stmt,
4415 PHI_RESULT (VEC_index (gimple, phis, 0)),
4416 reduc_index);
4417 /* Multiple types are not supported for condition. */
4418 break;
4419 }
4420
4421 /* Handle uses. */
4422 if (j == 0)
4423 {
4424 tree op0, op1 = NULL_TREE;
4425
4426 op0 = ops[!reduc_index];
4427 if (op_type == ternary_op)
4428 {
4429 if (reduc_index == 0)
4430 op1 = ops[2];
4431 else
4432 op1 = ops[1];
4433 }
4434
4435 if (slp_node)
4436 vect_get_slp_defs (op0, op1, slp_node, &vec_oprnds0, &vec_oprnds1,
4437 -1);
4438 else
4439 {
4440 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4441 stmt, NULL);
4442 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4443 if (op_type == ternary_op)
4444 {
4445 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
4446 NULL);
4447 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4448 }
4449 }
4450 }
4451 else
4452 {
4453 if (!slp_node)
4454 {
4455 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
4456 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
4457 VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4458 if (op_type == ternary_op)
4459 {
4460 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4461 loop_vec_def1);
4462 VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4463 }
4464 }
4465
4466 if (single_defuse_cycle)
4467 reduc_def = gimple_assign_lhs (new_stmt);
4468
4469 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4470 }
4471
4472 FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
4473 {
4474 if (slp_node)
4475 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4476 else
4477 {
4478 if (!single_defuse_cycle || j == 0)
4479 reduc_def = PHI_RESULT (new_phi);
4480 }
4481
4482 def1 = ((op_type == ternary_op)
4483 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4484 if (op_type == binary_op)
4485 {
4486 if (reduc_index == 0)
4487 expr = build2 (code, vectype_out, reduc_def, def0);
4488 else
4489 expr = build2 (code, vectype_out, def0, reduc_def);
4490 }
4491 else
4492 {
4493 if (reduc_index == 0)
4494 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4495 else
4496 {
4497 if (reduc_index == 1)
4498 expr = build3 (code, vectype_out, def0, reduc_def, def1);
4499 else
4500 expr = build3 (code, vectype_out, def0, def1, reduc_def);
4501 }
4502 }
4503
4504 new_stmt = gimple_build_assign (vec_dest, expr);
4505 new_temp = make_ssa_name (vec_dest, new_stmt);
4506 gimple_assign_set_lhs (new_stmt, new_temp);
4507 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4508 if (slp_node)
4509 {
4510 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4511 VEC_quick_push (tree, vect_defs, new_temp);
4512 }
4513 else
4514 VEC_replace (tree, vect_defs, 0, new_temp);
4515 }
4516
4517 if (slp_node)
4518 continue;
4519
4520 if (j == 0)
4521 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4522 else
4523 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4524
4525 prev_stmt_info = vinfo_for_stmt (new_stmt);
4526 prev_phi_info = vinfo_for_stmt (new_phi);
4527 }
4528
4529 /* Finalize the reduction-phi (set its arguments) and create the
4530 epilog reduction code. */
4531 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
4532 {
4533 new_temp = gimple_assign_lhs (*vec_stmt);
4534 VEC_replace (tree, vect_defs, 0, new_temp);
4535 }
4536
4537 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
4538 epilog_reduc_code, phis, reduc_index,
4539 double_reduc, slp_node);
4540
4541 VEC_free (gimple, heap, phis);
4542 VEC_free (tree, heap, vec_oprnds0);
4543 if (vec_oprnds1)
4544 VEC_free (tree, heap, vec_oprnds1);
4545
4546 return true;
4547 }
4548
4549 /* Function vect_min_worthwhile_factor.
4550
4551 For a loop where we could vectorize the operation indicated by CODE,
4552 return the minimum vectorization factor that makes it worthwhile
4553 to use generic vectors. */
4554 int
4555 vect_min_worthwhile_factor (enum tree_code code)
4556 {
4557 switch (code)
4558 {
4559 case PLUS_EXPR:
4560 case MINUS_EXPR:
4561 case NEGATE_EXPR:
4562 return 4;
4563
4564 case BIT_AND_EXPR:
4565 case BIT_IOR_EXPR:
4566 case BIT_XOR_EXPR:
4567 case BIT_NOT_EXPR:
4568 return 2;
4569
4570 default:
4571 return INT_MAX;
4572 }
4573 }
4574
4575
4576 /* Function vectorizable_induction
4577
4578 Check if PHI performs an induction computation that can be vectorized.
4579 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
4580 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
4581 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
4582
4583 bool
4584 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4585 gimple *vec_stmt)
4586 {
4587 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4588 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4589 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4590 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4591 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4592 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4593 tree vec_def;
4594
4595 gcc_assert (ncopies >= 1);
4596 /* FORNOW. This restriction should be relaxed. */
4597 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4598 {
4599 if (vect_print_dump_info (REPORT_DETAILS))
4600 fprintf (vect_dump, "multiple types in nested loop.");
4601 return false;
4602 }
4603
4604 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4605 return false;
4606
4607 /* FORNOW: SLP not supported. */
4608 if (STMT_SLP_TYPE (stmt_info))
4609 return false;
4610
4611 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4612
4613 if (gimple_code (phi) != GIMPLE_PHI)
4614 return false;
4615
4616 if (!vec_stmt) /* transformation not required. */
4617 {
4618 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4619 if (vect_print_dump_info (REPORT_DETAILS))
4620 fprintf (vect_dump, "=== vectorizable_induction ===");
4621 vect_model_induction_cost (stmt_info, ncopies);
4622 return true;
4623 }
4624
4625 /** Transform. **/
4626
4627 if (vect_print_dump_info (REPORT_DETAILS))
4628 fprintf (vect_dump, "transform induction phi.");
4629
4630 vec_def = get_initial_def_for_induction (phi);
4631 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4632 return true;
4633 }
4634
4635 /* Function vectorizable_live_operation.
4636
4637 STMT computes a value that is used outside the loop. Check if
4638 it can be supported. */
4639
4640 bool
4641 vectorizable_live_operation (gimple stmt,
4642 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4643 gimple *vec_stmt ATTRIBUTE_UNUSED)
4644 {
4645 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4646 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4647 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4648 int i;
4649 int op_type;
4650 tree op;
4651 tree def;
4652 gimple def_stmt;
4653 enum vect_def_type dt;
4654 enum tree_code code;
4655 enum gimple_rhs_class rhs_class;
4656
4657 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4658
4659 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4660 return false;
4661
4662 if (!is_gimple_assign (stmt))
4663 return false;
4664
4665 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4666 return false;
4667
4668 /* FORNOW. CHECKME. */
4669 if (nested_in_vect_loop_p (loop, stmt))
4670 return false;
4671
4672 code = gimple_assign_rhs_code (stmt);
4673 op_type = TREE_CODE_LENGTH (code);
4674 rhs_class = get_gimple_rhs_class (code);
4675 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4676 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4677
4678 /* FORNOW: support only if all uses are invariant. This means
4679 that the scalar operations can remain in place, unvectorized.
4680 The original last scalar value that they compute will be used. */
4681
4682 for (i = 0; i < op_type; i++)
4683 {
4684 if (rhs_class == GIMPLE_SINGLE_RHS)
4685 op = TREE_OPERAND (gimple_op (stmt, 1), i);
4686 else
4687 op = gimple_op (stmt, i + 1);
4688 if (op
4689 && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4690 {
4691 if (vect_print_dump_info (REPORT_DETAILS))
4692 fprintf (vect_dump, "use not simple.");
4693 return false;
4694 }
4695
4696 if (dt != vect_external_def && dt != vect_constant_def)
4697 return false;
4698 }
4699
4700 /* No transformation is required for the cases we currently support. */
4701 return true;
4702 }
4703
4704 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
4705
4706 static void
4707 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
4708 {
4709 ssa_op_iter op_iter;
4710 imm_use_iterator imm_iter;
4711 def_operand_p def_p;
4712 gimple ustmt;
4713
4714 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
4715 {
4716 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
4717 {
4718 basic_block bb;
4719
4720 if (!is_gimple_debug (ustmt))
4721 continue;
4722
4723 bb = gimple_bb (ustmt);
4724
4725 if (!flow_bb_inside_loop_p (loop, bb))
4726 {
4727 if (gimple_debug_bind_p (ustmt))
4728 {
4729 if (vect_print_dump_info (REPORT_DETAILS))
4730 fprintf (vect_dump, "killing debug use");
4731
4732 gimple_debug_bind_reset_value (ustmt);
4733 update_stmt (ustmt);
4734 }
4735 else
4736 gcc_unreachable ();
4737 }
4738 }
4739 }
4740 }
4741
4742 /* Function vect_transform_loop.
4743
4744 The analysis phase has determined that the loop is vectorizable.
4745 Vectorize the loop - created vectorized stmts to replace the scalar
4746 stmts in the loop, and update the loop exit condition. */
4747
4748 void
4749 vect_transform_loop (loop_vec_info loop_vinfo)
4750 {
4751 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4752 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4753 int nbbs = loop->num_nodes;
4754 gimple_stmt_iterator si;
4755 int i;
4756 tree ratio = NULL;
4757 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4758 bool strided_store;
4759 bool slp_scheduled = false;
4760 unsigned int nunits;
4761 tree cond_expr = NULL_TREE;
4762 gimple_seq cond_expr_stmt_list = NULL;
4763 bool do_peeling_for_loop_bound;
4764
4765 if (vect_print_dump_info (REPORT_DETAILS))
4766 fprintf (vect_dump, "=== vec_transform_loop ===");
4767
4768 /* Peel the loop if there are data refs with unknown alignment.
4769 Only one data ref with unknown store is allowed. */
4770
4771 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4772 vect_do_peeling_for_alignment (loop_vinfo);
4773
4774 do_peeling_for_loop_bound
4775 = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4776 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4777 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0));
4778
4779 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4780 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4781 vect_loop_versioning (loop_vinfo,
4782 !do_peeling_for_loop_bound,
4783 &cond_expr, &cond_expr_stmt_list);
4784
4785 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4786 compile time constant), or it is a constant that doesn't divide by the
4787 vectorization factor, then an epilog loop needs to be created.
4788 We therefore duplicate the loop: the original loop will be vectorized,
4789 and will compute the first (n/VF) iterations. The second copy of the loop
4790 will remain scalar and will compute the remaining (n%VF) iterations.
4791 (VF is the vectorization factor). */
4792
4793 if (do_peeling_for_loop_bound)
4794 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
4795 cond_expr, cond_expr_stmt_list);
4796 else
4797 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4798 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4799
4800 /* 1) Make sure the loop header has exactly two entries
4801 2) Make sure we have a preheader basic block. */
4802
4803 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4804
4805 split_edge (loop_preheader_edge (loop));
4806
4807 /* FORNOW: the vectorizer supports only loops which body consist
4808 of one basic block (header + empty latch). When the vectorizer will
4809 support more involved loop forms, the order by which the BBs are
4810 traversed need to be reconsidered. */
4811
4812 for (i = 0; i < nbbs; i++)
4813 {
4814 basic_block bb = bbs[i];
4815 stmt_vec_info stmt_info;
4816 gimple phi;
4817
4818 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
4819 {
4820 phi = gsi_stmt (si);
4821 if (vect_print_dump_info (REPORT_DETAILS))
4822 {
4823 fprintf (vect_dump, "------>vectorizing phi: ");
4824 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
4825 }
4826 stmt_info = vinfo_for_stmt (phi);
4827 if (!stmt_info)
4828 continue;
4829
4830 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4831 vect_loop_kill_debug_uses (loop, phi);
4832
4833 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4834 && !STMT_VINFO_LIVE_P (stmt_info))
4835 continue;
4836
4837 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4838 != (unsigned HOST_WIDE_INT) vectorization_factor)
4839 && vect_print_dump_info (REPORT_DETAILS))
4840 fprintf (vect_dump, "multiple-types.");
4841
4842 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
4843 {
4844 if (vect_print_dump_info (REPORT_DETAILS))
4845 fprintf (vect_dump, "transform phi.");
4846 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
4847 }
4848 }
4849
4850 for (si = gsi_start_bb (bb); !gsi_end_p (si);)
4851 {
4852 gimple stmt = gsi_stmt (si);
4853 bool is_store;
4854
4855 if (vect_print_dump_info (REPORT_DETAILS))
4856 {
4857 fprintf (vect_dump, "------>vectorizing statement: ");
4858 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
4859 }
4860
4861 stmt_info = vinfo_for_stmt (stmt);
4862
4863 /* vector stmts created in the outer-loop during vectorization of
4864 stmts in an inner-loop may not have a stmt_info, and do not
4865 need to be vectorized. */
4866 if (!stmt_info)
4867 {
4868 gsi_next (&si);
4869 continue;
4870 }
4871
4872 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4873 vect_loop_kill_debug_uses (loop, stmt);
4874
4875 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4876 && !STMT_VINFO_LIVE_P (stmt_info))
4877 {
4878 gsi_next (&si);
4879 continue;
4880 }
4881
4882 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
4883 nunits =
4884 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4885 if (!STMT_SLP_TYPE (stmt_info)
4886 && nunits != (unsigned int) vectorization_factor
4887 && vect_print_dump_info (REPORT_DETAILS))
4888 /* For SLP VF is set according to unrolling factor, and not to
4889 vector size, hence for SLP this print is not valid. */
4890 fprintf (vect_dump, "multiple-types.");
4891
4892 /* SLP. Schedule all the SLP instances when the first SLP stmt is
4893 reached. */
4894 if (STMT_SLP_TYPE (stmt_info))
4895 {
4896 if (!slp_scheduled)
4897 {
4898 slp_scheduled = true;
4899
4900 if (vect_print_dump_info (REPORT_DETAILS))
4901 fprintf (vect_dump, "=== scheduling SLP instances ===");
4902
4903 vect_schedule_slp (loop_vinfo, NULL);
4904 }
4905
4906 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
4907 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
4908 {
4909 gsi_next (&si);
4910 continue;
4911 }
4912 }
4913
4914 /* -------- vectorize statement ------------ */
4915 if (vect_print_dump_info (REPORT_DETAILS))
4916 fprintf (vect_dump, "transform statement.");
4917
4918 strided_store = false;
4919 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
4920 if (is_store)
4921 {
4922 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4923 {
4924 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4925 interleaving chain was completed - free all the stores in
4926 the chain. */
4927 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
4928 gsi_remove (&si, true);
4929 continue;
4930 }
4931 else
4932 {
4933 /* Free the attached stmt_vec_info and remove the stmt. */
4934 free_stmt_vec_info (stmt);
4935 gsi_remove (&si, true);
4936 continue;
4937 }
4938 }
4939 gsi_next (&si);
4940 } /* stmts in BB */
4941 } /* BBs in loop */
4942
4943 slpeel_make_loop_iterate_ntimes (loop, ratio);
4944
4945 /* The memory tags and pointers in vectorized statements need to
4946 have their SSA forms updated. FIXME, why can't this be delayed
4947 until all the loops have been transformed? */
4948 update_ssa (TODO_update_ssa);
4949
4950 if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4951 fprintf (vect_dump, "LOOP VECTORIZED.");
4952 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4953 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
4954 }