re PR middle-end/53408 (ICE in get_initial_def_for_induction, at tree-vect-loop.c...
[gcc.git] / gcc / tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
3 Free Software Foundation, Inc.
4 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
5 Ira Rosen <irar@il.ibm.com>
6
7 This file is part of GCC.
8
9 GCC is free software; you can redistribute it and/or modify it under
10 the terms of the GNU General Public License as published by the Free
11 Software Foundation; either version 3, or (at your option) any later
12 version.
13
14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with GCC; see the file COPYING3. If not see
21 <http://www.gnu.org/licenses/>. */
22
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "basic-block.h"
30 #include "tree-pretty-print.h"
31 #include "gimple-pretty-print.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
34 #include "cfgloop.h"
35 #include "cfglayout.h"
36 #include "expr.h"
37 #include "recog.h"
38 #include "optabs.h"
39 #include "params.h"
40 #include "diagnostic-core.h"
41 #include "tree-chrec.h"
42 #include "tree-scalar-evolution.h"
43 #include "tree-vectorizer.h"
44 #include "target.h"
45
46 /* Loop Vectorization Pass.
47
48 This pass tries to vectorize loops.
49
50 For example, the vectorizer transforms the following simple loop:
51
52 short a[N]; short b[N]; short c[N]; int i;
53
54 for (i=0; i<N; i++){
55 a[i] = b[i] + c[i];
56 }
57
58 as if it was manually vectorized by rewriting the source code into:
59
60 typedef int __attribute__((mode(V8HI))) v8hi;
61 short a[N]; short b[N]; short c[N]; int i;
62 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
63 v8hi va, vb, vc;
64
65 for (i=0; i<N/8; i++){
66 vb = pb[i];
67 vc = pc[i];
68 va = vb + vc;
69 pa[i] = va;
70 }
71
72 The main entry to this pass is vectorize_loops(), in which
73 the vectorizer applies a set of analyses on a given set of loops,
74 followed by the actual vectorization transformation for the loops that
75 had successfully passed the analysis phase.
76 Throughout this pass we make a distinction between two types of
77 data: scalars (which are represented by SSA_NAMES), and memory references
78 ("data-refs"). These two types of data require different handling both
79 during analysis and transformation. The types of data-refs that the
80 vectorizer currently supports are ARRAY_REFS which base is an array DECL
81 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
82 accesses are required to have a simple (consecutive) access pattern.
83
84 Analysis phase:
85 ===============
86 The driver for the analysis phase is vect_analyze_loop().
87 It applies a set of analyses, some of which rely on the scalar evolution
88 analyzer (scev) developed by Sebastian Pop.
89
90 During the analysis phase the vectorizer records some information
91 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
92 loop, as well as general information about the loop as a whole, which is
93 recorded in a "loop_vec_info" struct attached to each loop.
94
95 Transformation phase:
96 =====================
97 The loop transformation phase scans all the stmts in the loop, and
98 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
99 the loop that needs to be vectorized. It inserts the vector code sequence
100 just before the scalar stmt S, and records a pointer to the vector code
101 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
102 attached to S). This pointer will be used for the vectorization of following
103 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
104 otherwise, we rely on dead code elimination for removing it.
105
106 For example, say stmt S1 was vectorized into stmt VS1:
107
108 VS1: vb = px[i];
109 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
110 S2: a = b;
111
112 To vectorize stmt S2, the vectorizer first finds the stmt that defines
113 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
114 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
115 resulting sequence would be:
116
117 VS1: vb = px[i];
118 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
119 VS2: va = vb;
120 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
121
122 Operands that are not SSA_NAMEs, are data-refs that appear in
123 load/store operations (like 'x[i]' in S1), and are handled differently.
124
125 Target modeling:
126 =================
127 Currently the only target specific information that is used is the
128 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
129 Targets that can support different sizes of vectors, for now will need
130 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
131 flexibility will be added in the future.
132
133 Since we only vectorize operations which vector form can be
134 expressed using existing tree codes, to verify that an operation is
135 supported, the vectorizer checks the relevant optab at the relevant
136 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
137 the value found is CODE_FOR_nothing, then there's no target support, and
138 we can't vectorize the stmt.
139
140 For additional information on this project see:
141 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
142 */
143
144 /* Function vect_determine_vectorization_factor
145
146 Determine the vectorization factor (VF). VF is the number of data elements
147 that are operated upon in parallel in a single iteration of the vectorized
148 loop. For example, when vectorizing a loop that operates on 4byte elements,
149 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
150 elements can fit in a single vector register.
151
152 We currently support vectorization of loops in which all types operated upon
153 are of the same size. Therefore this function currently sets VF according to
154 the size of the types operated upon, and fails if there are multiple sizes
155 in the loop.
156
157 VF is also the factor by which the loop iterations are strip-mined, e.g.:
158 original loop:
159 for (i=0; i<N; i++){
160 a[i] = b[i] + c[i];
161 }
162
163 vectorized loop:
164 for (i=0; i<N; i+=VF){
165 a[i:VF] = b[i:VF] + c[i:VF];
166 }
167 */
168
169 static bool
170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
171 {
172 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
173 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
174 int nbbs = loop->num_nodes;
175 gimple_stmt_iterator si;
176 unsigned int vectorization_factor = 0;
177 tree scalar_type;
178 gimple phi;
179 tree vectype;
180 unsigned int nunits;
181 stmt_vec_info stmt_info;
182 int i;
183 HOST_WIDE_INT dummy;
184 gimple stmt, pattern_stmt = NULL;
185 gimple_seq pattern_def_seq = NULL;
186 gimple_stmt_iterator pattern_def_si = gsi_none ();
187 bool analyze_pattern_stmt = false;
188
189 if (vect_print_dump_info (REPORT_DETAILS))
190 fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
191
192 for (i = 0; i < nbbs; i++)
193 {
194 basic_block bb = bbs[i];
195
196 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
197 {
198 phi = gsi_stmt (si);
199 stmt_info = vinfo_for_stmt (phi);
200 if (vect_print_dump_info (REPORT_DETAILS))
201 {
202 fprintf (vect_dump, "==> examining phi: ");
203 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
204 }
205
206 gcc_assert (stmt_info);
207
208 if (STMT_VINFO_RELEVANT_P (stmt_info))
209 {
210 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
211 scalar_type = TREE_TYPE (PHI_RESULT (phi));
212
213 if (vect_print_dump_info (REPORT_DETAILS))
214 {
215 fprintf (vect_dump, "get vectype for scalar type: ");
216 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
217 }
218
219 vectype = get_vectype_for_scalar_type (scalar_type);
220 if (!vectype)
221 {
222 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
223 {
224 fprintf (vect_dump,
225 "not vectorized: unsupported data-type ");
226 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
227 }
228 return false;
229 }
230 STMT_VINFO_VECTYPE (stmt_info) = vectype;
231
232 if (vect_print_dump_info (REPORT_DETAILS))
233 {
234 fprintf (vect_dump, "vectype: ");
235 print_generic_expr (vect_dump, vectype, TDF_SLIM);
236 }
237
238 nunits = TYPE_VECTOR_SUBPARTS (vectype);
239 if (vect_print_dump_info (REPORT_DETAILS))
240 fprintf (vect_dump, "nunits = %d", nunits);
241
242 if (!vectorization_factor
243 || (nunits > vectorization_factor))
244 vectorization_factor = nunits;
245 }
246 }
247
248 for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
249 {
250 tree vf_vectype;
251
252 if (analyze_pattern_stmt)
253 stmt = pattern_stmt;
254 else
255 stmt = gsi_stmt (si);
256
257 stmt_info = vinfo_for_stmt (stmt);
258
259 if (vect_print_dump_info (REPORT_DETAILS))
260 {
261 fprintf (vect_dump, "==> examining statement: ");
262 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
263 }
264
265 gcc_assert (stmt_info);
266
267 /* Skip stmts which do not need to be vectorized. */
268 if (!STMT_VINFO_RELEVANT_P (stmt_info)
269 && !STMT_VINFO_LIVE_P (stmt_info))
270 {
271 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
272 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
273 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
274 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
275 {
276 stmt = pattern_stmt;
277 stmt_info = vinfo_for_stmt (pattern_stmt);
278 if (vect_print_dump_info (REPORT_DETAILS))
279 {
280 fprintf (vect_dump, "==> examining pattern statement: ");
281 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
282 }
283 }
284 else
285 {
286 if (vect_print_dump_info (REPORT_DETAILS))
287 fprintf (vect_dump, "skip.");
288 gsi_next (&si);
289 continue;
290 }
291 }
292 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
293 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
294 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
295 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
296 analyze_pattern_stmt = true;
297
298 /* If a pattern statement has def stmts, analyze them too. */
299 if (is_pattern_stmt_p (stmt_info))
300 {
301 if (pattern_def_seq == NULL)
302 {
303 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
304 pattern_def_si = gsi_start (pattern_def_seq);
305 }
306 else if (!gsi_end_p (pattern_def_si))
307 gsi_next (&pattern_def_si);
308 if (pattern_def_seq != NULL)
309 {
310 gimple pattern_def_stmt = NULL;
311 stmt_vec_info pattern_def_stmt_info = NULL;
312
313 while (!gsi_end_p (pattern_def_si))
314 {
315 pattern_def_stmt = gsi_stmt (pattern_def_si);
316 pattern_def_stmt_info
317 = vinfo_for_stmt (pattern_def_stmt);
318 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
319 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
320 break;
321 gsi_next (&pattern_def_si);
322 }
323
324 if (!gsi_end_p (pattern_def_si))
325 {
326 if (vect_print_dump_info (REPORT_DETAILS))
327 {
328 fprintf (vect_dump,
329 "==> examining pattern def stmt: ");
330 print_gimple_stmt (vect_dump, pattern_def_stmt, 0,
331 TDF_SLIM);
332 }
333
334 stmt = pattern_def_stmt;
335 stmt_info = pattern_def_stmt_info;
336 }
337 else
338 {
339 pattern_def_si = gsi_none ();
340 analyze_pattern_stmt = false;
341 }
342 }
343 else
344 analyze_pattern_stmt = false;
345 }
346
347 if (gimple_get_lhs (stmt) == NULL_TREE)
348 {
349 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
350 {
351 fprintf (vect_dump, "not vectorized: irregular stmt.");
352 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
353 }
354 return false;
355 }
356
357 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
358 {
359 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
360 {
361 fprintf (vect_dump, "not vectorized: vector stmt in loop:");
362 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
363 }
364 return false;
365 }
366
367 if (STMT_VINFO_VECTYPE (stmt_info))
368 {
369 /* The only case when a vectype had been already set is for stmts
370 that contain a dataref, or for "pattern-stmts" (stmts
371 generated by the vectorizer to represent/replace a certain
372 idiom). */
373 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
374 || is_pattern_stmt_p (stmt_info)
375 || !gsi_end_p (pattern_def_si));
376 vectype = STMT_VINFO_VECTYPE (stmt_info);
377 }
378 else
379 {
380 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
381 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
382 if (vect_print_dump_info (REPORT_DETAILS))
383 {
384 fprintf (vect_dump, "get vectype for scalar type: ");
385 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
386 }
387 vectype = get_vectype_for_scalar_type (scalar_type);
388 if (!vectype)
389 {
390 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
391 {
392 fprintf (vect_dump,
393 "not vectorized: unsupported data-type ");
394 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
395 }
396 return false;
397 }
398
399 STMT_VINFO_VECTYPE (stmt_info) = vectype;
400 }
401
402 /* The vectorization factor is according to the smallest
403 scalar type (or the largest vector size, but we only
404 support one vector size per loop). */
405 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
406 &dummy);
407 if (vect_print_dump_info (REPORT_DETAILS))
408 {
409 fprintf (vect_dump, "get vectype for scalar type: ");
410 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
411 }
412 vf_vectype = get_vectype_for_scalar_type (scalar_type);
413 if (!vf_vectype)
414 {
415 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
416 {
417 fprintf (vect_dump,
418 "not vectorized: unsupported data-type ");
419 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
420 }
421 return false;
422 }
423
424 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
425 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
426 {
427 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
428 {
429 fprintf (vect_dump,
430 "not vectorized: different sized vector "
431 "types in statement, ");
432 print_generic_expr (vect_dump, vectype, TDF_SLIM);
433 fprintf (vect_dump, " and ");
434 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
435 }
436 return false;
437 }
438
439 if (vect_print_dump_info (REPORT_DETAILS))
440 {
441 fprintf (vect_dump, "vectype: ");
442 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
443 }
444
445 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
446 if (vect_print_dump_info (REPORT_DETAILS))
447 fprintf (vect_dump, "nunits = %d", nunits);
448
449 if (!vectorization_factor
450 || (nunits > vectorization_factor))
451 vectorization_factor = nunits;
452
453 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
454 {
455 pattern_def_seq = NULL;
456 gsi_next (&si);
457 }
458 }
459 }
460
461 /* TODO: Analyze cost. Decide if worth while to vectorize. */
462 if (vect_print_dump_info (REPORT_DETAILS))
463 fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
464 if (vectorization_factor <= 1)
465 {
466 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
467 fprintf (vect_dump, "not vectorized: unsupported data-type");
468 return false;
469 }
470 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
471
472 return true;
473 }
474
475
476 /* Function vect_is_simple_iv_evolution.
477
478 FORNOW: A simple evolution of an induction variables in the loop is
479 considered a polynomial evolution with constant step. */
480
481 static bool
482 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
483 tree * step)
484 {
485 tree init_expr;
486 tree step_expr;
487 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
488
489 /* When there is no evolution in this loop, the evolution function
490 is not "simple". */
491 if (evolution_part == NULL_TREE)
492 return false;
493
494 /* When the evolution is a polynomial of degree >= 2
495 the evolution function is not "simple". */
496 if (tree_is_chrec (evolution_part))
497 return false;
498
499 step_expr = evolution_part;
500 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
501
502 if (vect_print_dump_info (REPORT_DETAILS))
503 {
504 fprintf (vect_dump, "step: ");
505 print_generic_expr (vect_dump, step_expr, TDF_SLIM);
506 fprintf (vect_dump, ", init: ");
507 print_generic_expr (vect_dump, init_expr, TDF_SLIM);
508 }
509
510 *init = init_expr;
511 *step = step_expr;
512
513 if (TREE_CODE (step_expr) != INTEGER_CST)
514 {
515 if (vect_print_dump_info (REPORT_DETAILS))
516 fprintf (vect_dump, "step unknown.");
517 return false;
518 }
519
520 return true;
521 }
522
523 /* Function vect_analyze_scalar_cycles_1.
524
525 Examine the cross iteration def-use cycles of scalar variables
526 in LOOP. LOOP_VINFO represents the loop that is now being
527 considered for vectorization (can be LOOP, or an outer-loop
528 enclosing LOOP). */
529
530 static void
531 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
532 {
533 basic_block bb = loop->header;
534 tree dumy;
535 VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
536 gimple_stmt_iterator gsi;
537 bool double_reduc;
538
539 if (vect_print_dump_info (REPORT_DETAILS))
540 fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
541
542 /* First - identify all inductions. Reduction detection assumes that all the
543 inductions have been identified, therefore, this order must not be
544 changed. */
545 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
546 {
547 gimple phi = gsi_stmt (gsi);
548 tree access_fn = NULL;
549 tree def = PHI_RESULT (phi);
550 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
551
552 if (vect_print_dump_info (REPORT_DETAILS))
553 {
554 fprintf (vect_dump, "Analyze phi: ");
555 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
556 }
557
558 /* Skip virtual phi's. The data dependences that are associated with
559 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
560 if (!is_gimple_reg (SSA_NAME_VAR (def)))
561 continue;
562
563 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
564
565 /* Analyze the evolution function. */
566 access_fn = analyze_scalar_evolution (loop, def);
567 if (access_fn)
568 {
569 STRIP_NOPS (access_fn);
570 if (vect_print_dump_info (REPORT_DETAILS))
571 {
572 fprintf (vect_dump, "Access function of PHI: ");
573 print_generic_expr (vect_dump, access_fn, TDF_SLIM);
574 }
575 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
576 = evolution_part_in_loop_num (access_fn, loop->num);
577 }
578
579 if (!access_fn
580 || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
581 {
582 VEC_safe_push (gimple, heap, worklist, phi);
583 continue;
584 }
585
586 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
587
588 if (vect_print_dump_info (REPORT_DETAILS))
589 fprintf (vect_dump, "Detected induction.");
590 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
591 }
592
593
594 /* Second - identify all reductions and nested cycles. */
595 while (VEC_length (gimple, worklist) > 0)
596 {
597 gimple phi = VEC_pop (gimple, worklist);
598 tree def = PHI_RESULT (phi);
599 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
600 gimple reduc_stmt;
601 bool nested_cycle;
602
603 if (vect_print_dump_info (REPORT_DETAILS))
604 {
605 fprintf (vect_dump, "Analyze phi: ");
606 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
607 }
608
609 gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
610 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
611
612 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
613 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
614 &double_reduc);
615 if (reduc_stmt)
616 {
617 if (double_reduc)
618 {
619 if (vect_print_dump_info (REPORT_DETAILS))
620 fprintf (vect_dump, "Detected double reduction.");
621
622 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
623 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
624 vect_double_reduction_def;
625 }
626 else
627 {
628 if (nested_cycle)
629 {
630 if (vect_print_dump_info (REPORT_DETAILS))
631 fprintf (vect_dump, "Detected vectorizable nested cycle.");
632
633 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
634 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
635 vect_nested_cycle;
636 }
637 else
638 {
639 if (vect_print_dump_info (REPORT_DETAILS))
640 fprintf (vect_dump, "Detected reduction.");
641
642 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
643 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
644 vect_reduction_def;
645 /* Store the reduction cycles for possible vectorization in
646 loop-aware SLP. */
647 VEC_safe_push (gimple, heap,
648 LOOP_VINFO_REDUCTIONS (loop_vinfo),
649 reduc_stmt);
650 }
651 }
652 }
653 else
654 if (vect_print_dump_info (REPORT_DETAILS))
655 fprintf (vect_dump, "Unknown def-use cycle pattern.");
656 }
657
658 VEC_free (gimple, heap, worklist);
659 }
660
661
662 /* Function vect_analyze_scalar_cycles.
663
664 Examine the cross iteration def-use cycles of scalar variables, by
665 analyzing the loop-header PHIs of scalar variables. Classify each
666 cycle as one of the following: invariant, induction, reduction, unknown.
667 We do that for the loop represented by LOOP_VINFO, and also to its
668 inner-loop, if exists.
669 Examples for scalar cycles:
670
671 Example1: reduction:
672
673 loop1:
674 for (i=0; i<N; i++)
675 sum += a[i];
676
677 Example2: induction:
678
679 loop2:
680 for (i=0; i<N; i++)
681 a[i] = i; */
682
683 static void
684 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
685 {
686 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
687
688 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
689
690 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
691 Reductions in such inner-loop therefore have different properties than
692 the reductions in the nest that gets vectorized:
693 1. When vectorized, they are executed in the same order as in the original
694 scalar loop, so we can't change the order of computation when
695 vectorizing them.
696 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
697 current checks are too strict. */
698
699 if (loop->inner)
700 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
701 }
702
703 /* Function vect_get_loop_niters.
704
705 Determine how many iterations the loop is executed.
706 If an expression that represents the number of iterations
707 can be constructed, place it in NUMBER_OF_ITERATIONS.
708 Return the loop exit condition. */
709
710 static gimple
711 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
712 {
713 tree niters;
714
715 if (vect_print_dump_info (REPORT_DETAILS))
716 fprintf (vect_dump, "=== get_loop_niters ===");
717
718 niters = number_of_exit_cond_executions (loop);
719
720 if (niters != NULL_TREE
721 && niters != chrec_dont_know)
722 {
723 *number_of_iterations = niters;
724
725 if (vect_print_dump_info (REPORT_DETAILS))
726 {
727 fprintf (vect_dump, "==> get_loop_niters:" );
728 print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
729 }
730 }
731
732 return get_loop_exit_condition (loop);
733 }
734
735
736 /* Function bb_in_loop_p
737
738 Used as predicate for dfs order traversal of the loop bbs. */
739
740 static bool
741 bb_in_loop_p (const_basic_block bb, const void *data)
742 {
743 const struct loop *const loop = (const struct loop *)data;
744 if (flow_bb_inside_loop_p (loop, bb))
745 return true;
746 return false;
747 }
748
749
750 /* Function new_loop_vec_info.
751
752 Create and initialize a new loop_vec_info struct for LOOP, as well as
753 stmt_vec_info structs for all the stmts in LOOP. */
754
755 static loop_vec_info
756 new_loop_vec_info (struct loop *loop)
757 {
758 loop_vec_info res;
759 basic_block *bbs;
760 gimple_stmt_iterator si;
761 unsigned int i, nbbs;
762
763 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
764 LOOP_VINFO_LOOP (res) = loop;
765
766 bbs = get_loop_body (loop);
767
768 /* Create/Update stmt_info for all stmts in the loop. */
769 for (i = 0; i < loop->num_nodes; i++)
770 {
771 basic_block bb = bbs[i];
772
773 /* BBs in a nested inner-loop will have been already processed (because
774 we will have called vect_analyze_loop_form for any nested inner-loop).
775 Therefore, for stmts in an inner-loop we just want to update the
776 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
777 loop_info of the outer-loop we are currently considering to vectorize
778 (instead of the loop_info of the inner-loop).
779 For stmts in other BBs we need to create a stmt_info from scratch. */
780 if (bb->loop_father != loop)
781 {
782 /* Inner-loop bb. */
783 gcc_assert (loop->inner && bb->loop_father == loop->inner);
784 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
785 {
786 gimple phi = gsi_stmt (si);
787 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
788 loop_vec_info inner_loop_vinfo =
789 STMT_VINFO_LOOP_VINFO (stmt_info);
790 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
791 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
792 }
793 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
794 {
795 gimple stmt = gsi_stmt (si);
796 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
797 loop_vec_info inner_loop_vinfo =
798 STMT_VINFO_LOOP_VINFO (stmt_info);
799 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
800 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
801 }
802 }
803 else
804 {
805 /* bb in current nest. */
806 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
807 {
808 gimple phi = gsi_stmt (si);
809 gimple_set_uid (phi, 0);
810 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
811 }
812
813 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
814 {
815 gimple stmt = gsi_stmt (si);
816 gimple_set_uid (stmt, 0);
817 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
818 }
819 }
820 }
821
822 /* CHECKME: We want to visit all BBs before their successors (except for
823 latch blocks, for which this assertion wouldn't hold). In the simple
824 case of the loop forms we allow, a dfs order of the BBs would the same
825 as reversed postorder traversal, so we are safe. */
826
827 free (bbs);
828 bbs = XCNEWVEC (basic_block, loop->num_nodes);
829 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
830 bbs, loop->num_nodes, loop);
831 gcc_assert (nbbs == loop->num_nodes);
832
833 LOOP_VINFO_BBS (res) = bbs;
834 LOOP_VINFO_NITERS (res) = NULL;
835 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
836 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
837 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
838 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
839 LOOP_VINFO_VECT_FACTOR (res) = 0;
840 LOOP_VINFO_LOOP_NEST (res) = VEC_alloc (loop_p, heap, 3);
841 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
842 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
843 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
844 LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
845 VEC_alloc (gimple, heap,
846 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
847 LOOP_VINFO_MAY_ALIAS_DDRS (res) =
848 VEC_alloc (ddr_p, heap,
849 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
850 LOOP_VINFO_GROUPED_STORES (res) = VEC_alloc (gimple, heap, 10);
851 LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
852 LOOP_VINFO_REDUCTION_CHAINS (res) = VEC_alloc (gimple, heap, 10);
853 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
854 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
855 LOOP_VINFO_PEELING_HTAB (res) = NULL;
856 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
857
858 return res;
859 }
860
861
862 /* Function destroy_loop_vec_info.
863
864 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
865 stmts in the loop. */
866
867 void
868 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
869 {
870 struct loop *loop;
871 basic_block *bbs;
872 int nbbs;
873 gimple_stmt_iterator si;
874 int j;
875 VEC (slp_instance, heap) *slp_instances;
876 slp_instance instance;
877
878 if (!loop_vinfo)
879 return;
880
881 loop = LOOP_VINFO_LOOP (loop_vinfo);
882
883 bbs = LOOP_VINFO_BBS (loop_vinfo);
884 nbbs = loop->num_nodes;
885
886 if (!clean_stmts)
887 {
888 free (LOOP_VINFO_BBS (loop_vinfo));
889 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
890 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
891 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
892 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
893 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
894
895 free (loop_vinfo);
896 loop->aux = NULL;
897 return;
898 }
899
900 for (j = 0; j < nbbs; j++)
901 {
902 basic_block bb = bbs[j];
903 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
904 free_stmt_vec_info (gsi_stmt (si));
905
906 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
907 {
908 gimple stmt = gsi_stmt (si);
909 /* Free stmt_vec_info. */
910 free_stmt_vec_info (stmt);
911 gsi_next (&si);
912 }
913 }
914
915 free (LOOP_VINFO_BBS (loop_vinfo));
916 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
917 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
918 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
919 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
920 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
921 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
922 FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
923 vect_free_slp_instance (instance);
924
925 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
926 VEC_free (gimple, heap, LOOP_VINFO_GROUPED_STORES (loop_vinfo));
927 VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
928 VEC_free (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo));
929
930 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
931 htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
932
933 free (loop_vinfo);
934 loop->aux = NULL;
935 }
936
937
938 /* Function vect_analyze_loop_1.
939
940 Apply a set of analyses on LOOP, and create a loop_vec_info struct
941 for it. The different analyses will record information in the
942 loop_vec_info struct. This is a subset of the analyses applied in
943 vect_analyze_loop, to be applied on an inner-loop nested in the loop
944 that is now considered for (outer-loop) vectorization. */
945
946 static loop_vec_info
947 vect_analyze_loop_1 (struct loop *loop)
948 {
949 loop_vec_info loop_vinfo;
950
951 if (vect_print_dump_info (REPORT_DETAILS))
952 fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
953
954 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc. */
955
956 loop_vinfo = vect_analyze_loop_form (loop);
957 if (!loop_vinfo)
958 {
959 if (vect_print_dump_info (REPORT_DETAILS))
960 fprintf (vect_dump, "bad inner-loop form.");
961 return NULL;
962 }
963
964 return loop_vinfo;
965 }
966
967
968 /* Function vect_analyze_loop_form.
969
970 Verify that certain CFG restrictions hold, including:
971 - the loop has a pre-header
972 - the loop has a single entry and exit
973 - the loop exit condition is simple enough, and the number of iterations
974 can be analyzed (a countable loop). */
975
976 loop_vec_info
977 vect_analyze_loop_form (struct loop *loop)
978 {
979 loop_vec_info loop_vinfo;
980 gimple loop_cond;
981 tree number_of_iterations = NULL;
982 loop_vec_info inner_loop_vinfo = NULL;
983
984 if (vect_print_dump_info (REPORT_DETAILS))
985 fprintf (vect_dump, "=== vect_analyze_loop_form ===");
986
987 /* Different restrictions apply when we are considering an inner-most loop,
988 vs. an outer (nested) loop.
989 (FORNOW. May want to relax some of these restrictions in the future). */
990
991 if (!loop->inner)
992 {
993 /* Inner-most loop. We currently require that the number of BBs is
994 exactly 2 (the header and latch). Vectorizable inner-most loops
995 look like this:
996
997 (pre-header)
998 |
999 header <--------+
1000 | | |
1001 | +--> latch --+
1002 |
1003 (exit-bb) */
1004
1005 if (loop->num_nodes != 2)
1006 {
1007 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1008 fprintf (vect_dump, "not vectorized: control flow in loop.");
1009 return NULL;
1010 }
1011
1012 if (empty_block_p (loop->header))
1013 {
1014 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1015 fprintf (vect_dump, "not vectorized: empty loop.");
1016 return NULL;
1017 }
1018 }
1019 else
1020 {
1021 struct loop *innerloop = loop->inner;
1022 edge entryedge;
1023
1024 /* Nested loop. We currently require that the loop is doubly-nested,
1025 contains a single inner loop, and the number of BBs is exactly 5.
1026 Vectorizable outer-loops look like this:
1027
1028 (pre-header)
1029 |
1030 header <---+
1031 | |
1032 inner-loop |
1033 | |
1034 tail ------+
1035 |
1036 (exit-bb)
1037
1038 The inner-loop has the properties expected of inner-most loops
1039 as described above. */
1040
1041 if ((loop->inner)->inner || (loop->inner)->next)
1042 {
1043 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1044 fprintf (vect_dump, "not vectorized: multiple nested loops.");
1045 return NULL;
1046 }
1047
1048 /* Analyze the inner-loop. */
1049 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1050 if (!inner_loop_vinfo)
1051 {
1052 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1053 fprintf (vect_dump, "not vectorized: Bad inner loop.");
1054 return NULL;
1055 }
1056
1057 if (!expr_invariant_in_loop_p (loop,
1058 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1059 {
1060 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1061 fprintf (vect_dump,
1062 "not vectorized: inner-loop count not invariant.");
1063 destroy_loop_vec_info (inner_loop_vinfo, true);
1064 return NULL;
1065 }
1066
1067 if (loop->num_nodes != 5)
1068 {
1069 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1070 fprintf (vect_dump, "not vectorized: control flow in loop.");
1071 destroy_loop_vec_info (inner_loop_vinfo, true);
1072 return NULL;
1073 }
1074
1075 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1076 entryedge = EDGE_PRED (innerloop->header, 0);
1077 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1078 entryedge = EDGE_PRED (innerloop->header, 1);
1079
1080 if (entryedge->src != loop->header
1081 || !single_exit (innerloop)
1082 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1083 {
1084 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1085 fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
1086 destroy_loop_vec_info (inner_loop_vinfo, true);
1087 return NULL;
1088 }
1089
1090 if (vect_print_dump_info (REPORT_DETAILS))
1091 fprintf (vect_dump, "Considering outer-loop vectorization.");
1092 }
1093
1094 if (!single_exit (loop)
1095 || EDGE_COUNT (loop->header->preds) != 2)
1096 {
1097 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1098 {
1099 if (!single_exit (loop))
1100 fprintf (vect_dump, "not vectorized: multiple exits.");
1101 else if (EDGE_COUNT (loop->header->preds) != 2)
1102 fprintf (vect_dump, "not vectorized: too many incoming edges.");
1103 }
1104 if (inner_loop_vinfo)
1105 destroy_loop_vec_info (inner_loop_vinfo, true);
1106 return NULL;
1107 }
1108
1109 /* We assume that the loop exit condition is at the end of the loop. i.e,
1110 that the loop is represented as a do-while (with a proper if-guard
1111 before the loop if needed), where the loop header contains all the
1112 executable statements, and the latch is empty. */
1113 if (!empty_block_p (loop->latch)
1114 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1115 {
1116 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1117 fprintf (vect_dump, "not vectorized: unexpected loop form.");
1118 if (inner_loop_vinfo)
1119 destroy_loop_vec_info (inner_loop_vinfo, true);
1120 return NULL;
1121 }
1122
1123 /* Make sure there exists a single-predecessor exit bb: */
1124 if (!single_pred_p (single_exit (loop)->dest))
1125 {
1126 edge e = single_exit (loop);
1127 if (!(e->flags & EDGE_ABNORMAL))
1128 {
1129 split_loop_exit_edge (e);
1130 if (vect_print_dump_info (REPORT_DETAILS))
1131 fprintf (vect_dump, "split exit edge.");
1132 }
1133 else
1134 {
1135 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1136 fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1137 if (inner_loop_vinfo)
1138 destroy_loop_vec_info (inner_loop_vinfo, true);
1139 return NULL;
1140 }
1141 }
1142
1143 loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1144 if (!loop_cond)
1145 {
1146 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1147 fprintf (vect_dump, "not vectorized: complicated exit condition.");
1148 if (inner_loop_vinfo)
1149 destroy_loop_vec_info (inner_loop_vinfo, true);
1150 return NULL;
1151 }
1152
1153 if (!number_of_iterations)
1154 {
1155 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1156 fprintf (vect_dump,
1157 "not vectorized: number of iterations cannot be computed.");
1158 if (inner_loop_vinfo)
1159 destroy_loop_vec_info (inner_loop_vinfo, true);
1160 return NULL;
1161 }
1162
1163 if (chrec_contains_undetermined (number_of_iterations))
1164 {
1165 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1166 fprintf (vect_dump, "Infinite number of iterations.");
1167 if (inner_loop_vinfo)
1168 destroy_loop_vec_info (inner_loop_vinfo, true);
1169 return NULL;
1170 }
1171
1172 if (!NITERS_KNOWN_P (number_of_iterations))
1173 {
1174 if (vect_print_dump_info (REPORT_DETAILS))
1175 {
1176 fprintf (vect_dump, "Symbolic number of iterations is ");
1177 print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1178 }
1179 }
1180 else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1181 {
1182 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1183 fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1184 if (inner_loop_vinfo)
1185 destroy_loop_vec_info (inner_loop_vinfo, false);
1186 return NULL;
1187 }
1188
1189 loop_vinfo = new_loop_vec_info (loop);
1190 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1191 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1192
1193 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1194
1195 /* CHECKME: May want to keep it around it in the future. */
1196 if (inner_loop_vinfo)
1197 destroy_loop_vec_info (inner_loop_vinfo, false);
1198
1199 gcc_assert (!loop->aux);
1200 loop->aux = loop_vinfo;
1201 return loop_vinfo;
1202 }
1203
1204
1205 /* Get cost by calling cost target builtin. */
1206
1207 static inline int
1208 vect_get_cost (enum vect_cost_for_stmt type_of_cost)
1209 {
1210 tree dummy_type = NULL;
1211 int dummy = 0;
1212
1213 return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
1214 dummy_type, dummy);
1215 }
1216
1217
1218 /* Function vect_analyze_loop_operations.
1219
1220 Scan the loop stmts and make sure they are all vectorizable. */
1221
1222 static bool
1223 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1224 {
1225 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1226 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1227 int nbbs = loop->num_nodes;
1228 gimple_stmt_iterator si;
1229 unsigned int vectorization_factor = 0;
1230 int i;
1231 gimple phi;
1232 stmt_vec_info stmt_info;
1233 bool need_to_vectorize = false;
1234 int min_profitable_iters;
1235 int min_scalar_loop_bound;
1236 unsigned int th;
1237 bool only_slp_in_loop = true, ok;
1238 HOST_WIDE_INT max_niter;
1239
1240 if (vect_print_dump_info (REPORT_DETAILS))
1241 fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
1242
1243 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1244 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1245 if (slp)
1246 {
1247 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1248 vectorization factor of the loop is the unrolling factor required by
1249 the SLP instances. If that unrolling factor is 1, we say, that we
1250 perform pure SLP on loop - cross iteration parallelism is not
1251 exploited. */
1252 for (i = 0; i < nbbs; i++)
1253 {
1254 basic_block bb = bbs[i];
1255 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1256 {
1257 gimple stmt = gsi_stmt (si);
1258 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1259 gcc_assert (stmt_info);
1260 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1261 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1262 && !PURE_SLP_STMT (stmt_info))
1263 /* STMT needs both SLP and loop-based vectorization. */
1264 only_slp_in_loop = false;
1265 }
1266 }
1267
1268 if (only_slp_in_loop)
1269 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1270 else
1271 vectorization_factor = least_common_multiple (vectorization_factor,
1272 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1273
1274 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1275 if (vect_print_dump_info (REPORT_DETAILS))
1276 fprintf (vect_dump, "Updating vectorization factor to %d ",
1277 vectorization_factor);
1278 }
1279
1280 for (i = 0; i < nbbs; i++)
1281 {
1282 basic_block bb = bbs[i];
1283
1284 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1285 {
1286 phi = gsi_stmt (si);
1287 ok = true;
1288
1289 stmt_info = vinfo_for_stmt (phi);
1290 if (vect_print_dump_info (REPORT_DETAILS))
1291 {
1292 fprintf (vect_dump, "examining phi: ");
1293 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1294 }
1295
1296 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1297 (i.e., a phi in the tail of the outer-loop). */
1298 if (! is_loop_header_bb_p (bb))
1299 {
1300 /* FORNOW: we currently don't support the case that these phis
1301 are not used in the outerloop (unless it is double reduction,
1302 i.e., this phi is vect_reduction_def), cause this case
1303 requires to actually do something here. */
1304 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1305 || STMT_VINFO_LIVE_P (stmt_info))
1306 && STMT_VINFO_DEF_TYPE (stmt_info)
1307 != vect_double_reduction_def)
1308 {
1309 if (vect_print_dump_info (REPORT_DETAILS))
1310 fprintf (vect_dump,
1311 "Unsupported loop-closed phi in outer-loop.");
1312 return false;
1313 }
1314
1315 /* If PHI is used in the outer loop, we check that its operand
1316 is defined in the inner loop. */
1317 if (STMT_VINFO_RELEVANT_P (stmt_info))
1318 {
1319 tree phi_op;
1320 gimple op_def_stmt;
1321
1322 if (gimple_phi_num_args (phi) != 1)
1323 return false;
1324
1325 phi_op = PHI_ARG_DEF (phi, 0);
1326 if (TREE_CODE (phi_op) != SSA_NAME)
1327 return false;
1328
1329 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1330 if (!op_def_stmt || !vinfo_for_stmt (op_def_stmt))
1331 return false;
1332
1333 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1334 != vect_used_in_outer
1335 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1336 != vect_used_in_outer_by_reduction)
1337 return false;
1338 }
1339
1340 continue;
1341 }
1342
1343 gcc_assert (stmt_info);
1344
1345 if (STMT_VINFO_LIVE_P (stmt_info))
1346 {
1347 /* FORNOW: not yet supported. */
1348 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1349 fprintf (vect_dump, "not vectorized: value used after loop.");
1350 return false;
1351 }
1352
1353 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1354 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1355 {
1356 /* A scalar-dependence cycle that we don't support. */
1357 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1358 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1359 return false;
1360 }
1361
1362 if (STMT_VINFO_RELEVANT_P (stmt_info))
1363 {
1364 need_to_vectorize = true;
1365 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1366 ok = vectorizable_induction (phi, NULL, NULL);
1367 }
1368
1369 if (!ok)
1370 {
1371 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1372 {
1373 fprintf (vect_dump,
1374 "not vectorized: relevant phi not supported: ");
1375 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1376 }
1377 return false;
1378 }
1379 }
1380
1381 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1382 {
1383 gimple stmt = gsi_stmt (si);
1384 if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1385 return false;
1386 }
1387 } /* bbs */
1388
1389 /* All operations in the loop are either irrelevant (deal with loop
1390 control, or dead), or only used outside the loop and can be moved
1391 out of the loop (e.g. invariants, inductions). The loop can be
1392 optimized away by scalar optimizations. We're better off not
1393 touching this loop. */
1394 if (!need_to_vectorize)
1395 {
1396 if (vect_print_dump_info (REPORT_DETAILS))
1397 fprintf (vect_dump,
1398 "All the computation can be taken out of the loop.");
1399 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1400 fprintf (vect_dump,
1401 "not vectorized: redundant loop. no profit to vectorize.");
1402 return false;
1403 }
1404
1405 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1406 && vect_print_dump_info (REPORT_DETAILS))
1407 fprintf (vect_dump,
1408 "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1409 vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
1410
1411 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1412 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1413 || ((max_niter = max_stmt_executions_int (loop)) != -1
1414 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1415 {
1416 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1417 fprintf (vect_dump, "not vectorized: iteration count too small.");
1418 if (vect_print_dump_info (REPORT_DETAILS))
1419 fprintf (vect_dump,"not vectorized: iteration count smaller than "
1420 "vectorization factor.");
1421 return false;
1422 }
1423
1424 /* Analyze cost. Decide if worth while to vectorize. */
1425
1426 /* Once VF is set, SLP costs should be updated since the number of created
1427 vector stmts depends on VF. */
1428 vect_update_slp_costs_according_to_vf (loop_vinfo);
1429
1430 min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1431 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1432
1433 if (min_profitable_iters < 0)
1434 {
1435 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1436 fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1437 if (vect_print_dump_info (REPORT_DETAILS))
1438 fprintf (vect_dump, "not vectorized: vector version will never be "
1439 "profitable.");
1440 return false;
1441 }
1442
1443 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1444 * vectorization_factor) - 1);
1445
1446 /* Use the cost model only if it is more conservative than user specified
1447 threshold. */
1448
1449 th = (unsigned) min_scalar_loop_bound;
1450 if (min_profitable_iters
1451 && (!min_scalar_loop_bound
1452 || min_profitable_iters > min_scalar_loop_bound))
1453 th = (unsigned) min_profitable_iters;
1454
1455 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1456 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1457 {
1458 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1459 fprintf (vect_dump, "not vectorized: vectorization not "
1460 "profitable.");
1461 if (vect_print_dump_info (REPORT_DETAILS))
1462 fprintf (vect_dump, "not vectorized: iteration count smaller than "
1463 "user specified loop bound parameter or minimum "
1464 "profitable iterations (whichever is more conservative).");
1465 return false;
1466 }
1467
1468 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1469 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1470 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1471 {
1472 if (vect_print_dump_info (REPORT_DETAILS))
1473 fprintf (vect_dump, "epilog loop required.");
1474 if (!vect_can_advance_ivs_p (loop_vinfo))
1475 {
1476 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1477 fprintf (vect_dump,
1478 "not vectorized: can't create epilog loop 1.");
1479 return false;
1480 }
1481 if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1482 {
1483 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1484 fprintf (vect_dump,
1485 "not vectorized: can't create epilog loop 2.");
1486 return false;
1487 }
1488 }
1489
1490 return true;
1491 }
1492
1493
1494 /* Function vect_analyze_loop_2.
1495
1496 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1497 for it. The different analyses will record information in the
1498 loop_vec_info struct. */
1499 static bool
1500 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1501 {
1502 bool ok, slp = false;
1503 int max_vf = MAX_VECTORIZATION_FACTOR;
1504 int min_vf = 2;
1505
1506 /* Find all data references in the loop (which correspond to vdefs/vuses)
1507 and analyze their evolution in the loop. Also adjust the minimal
1508 vectorization factor according to the loads and stores.
1509
1510 FORNOW: Handle only simple, array references, which
1511 alignment can be forced, and aligned pointer-references. */
1512
1513 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1514 if (!ok)
1515 {
1516 if (vect_print_dump_info (REPORT_DETAILS))
1517 fprintf (vect_dump, "bad data references.");
1518 return false;
1519 }
1520
1521 /* Classify all cross-iteration scalar data-flow cycles.
1522 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1523
1524 vect_analyze_scalar_cycles (loop_vinfo);
1525
1526 vect_pattern_recog (loop_vinfo, NULL);
1527
1528 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1529
1530 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1531 if (!ok)
1532 {
1533 if (vect_print_dump_info (REPORT_DETAILS))
1534 fprintf (vect_dump, "unexpected pattern.");
1535 return false;
1536 }
1537
1538 /* Analyze data dependences between the data-refs in the loop
1539 and adjust the maximum vectorization factor according to
1540 the dependences.
1541 FORNOW: fail at the first data dependence that we encounter. */
1542
1543 ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
1544 if (!ok
1545 || max_vf < min_vf)
1546 {
1547 if (vect_print_dump_info (REPORT_DETAILS))
1548 fprintf (vect_dump, "bad data dependence.");
1549 return false;
1550 }
1551
1552 ok = vect_determine_vectorization_factor (loop_vinfo);
1553 if (!ok)
1554 {
1555 if (vect_print_dump_info (REPORT_DETAILS))
1556 fprintf (vect_dump, "can't determine vectorization factor.");
1557 return false;
1558 }
1559 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1560 {
1561 if (vect_print_dump_info (REPORT_DETAILS))
1562 fprintf (vect_dump, "bad data dependence.");
1563 return false;
1564 }
1565
1566 /* Analyze the alignment of the data-refs in the loop.
1567 Fail if a data reference is found that cannot be vectorized. */
1568
1569 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1570 if (!ok)
1571 {
1572 if (vect_print_dump_info (REPORT_DETAILS))
1573 fprintf (vect_dump, "bad data alignment.");
1574 return false;
1575 }
1576
1577 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1578 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1579
1580 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1581 if (!ok)
1582 {
1583 if (vect_print_dump_info (REPORT_DETAILS))
1584 fprintf (vect_dump, "bad data access.");
1585 return false;
1586 }
1587
1588 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1589 It is important to call pruning after vect_analyze_data_ref_accesses,
1590 since we use grouping information gathered by interleaving analysis. */
1591 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1592 if (!ok)
1593 {
1594 if (vect_print_dump_info (REPORT_DETAILS))
1595 fprintf (vect_dump, "too long list of versioning for alias "
1596 "run-time tests.");
1597 return false;
1598 }
1599
1600 /* This pass will decide on using loop versioning and/or loop peeling in
1601 order to enhance the alignment of data references in the loop. */
1602
1603 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1604 if (!ok)
1605 {
1606 if (vect_print_dump_info (REPORT_DETAILS))
1607 fprintf (vect_dump, "bad data alignment.");
1608 return false;
1609 }
1610
1611 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1612 ok = vect_analyze_slp (loop_vinfo, NULL);
1613 if (ok)
1614 {
1615 /* Decide which possible SLP instances to SLP. */
1616 slp = vect_make_slp_decision (loop_vinfo);
1617
1618 /* Find stmts that need to be both vectorized and SLPed. */
1619 vect_detect_hybrid_slp (loop_vinfo);
1620 }
1621 else
1622 return false;
1623
1624 /* Scan all the operations in the loop and make sure they are
1625 vectorizable. */
1626
1627 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1628 if (!ok)
1629 {
1630 if (vect_print_dump_info (REPORT_DETAILS))
1631 fprintf (vect_dump, "bad operation or unsupported loop bound.");
1632 return false;
1633 }
1634
1635 return true;
1636 }
1637
1638 /* Function vect_analyze_loop.
1639
1640 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1641 for it. The different analyses will record information in the
1642 loop_vec_info struct. */
1643 loop_vec_info
1644 vect_analyze_loop (struct loop *loop)
1645 {
1646 loop_vec_info loop_vinfo;
1647 unsigned int vector_sizes;
1648
1649 /* Autodetect first vector size we try. */
1650 current_vector_size = 0;
1651 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1652
1653 if (vect_print_dump_info (REPORT_DETAILS))
1654 fprintf (vect_dump, "===== analyze_loop_nest =====");
1655
1656 if (loop_outer (loop)
1657 && loop_vec_info_for_loop (loop_outer (loop))
1658 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1659 {
1660 if (vect_print_dump_info (REPORT_DETAILS))
1661 fprintf (vect_dump, "outer-loop already vectorized.");
1662 return NULL;
1663 }
1664
1665 while (1)
1666 {
1667 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1668 loop_vinfo = vect_analyze_loop_form (loop);
1669 if (!loop_vinfo)
1670 {
1671 if (vect_print_dump_info (REPORT_DETAILS))
1672 fprintf (vect_dump, "bad loop form.");
1673 return NULL;
1674 }
1675
1676 if (vect_analyze_loop_2 (loop_vinfo))
1677 {
1678 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1679
1680 return loop_vinfo;
1681 }
1682
1683 destroy_loop_vec_info (loop_vinfo, true);
1684
1685 vector_sizes &= ~current_vector_size;
1686 if (vector_sizes == 0
1687 || current_vector_size == 0)
1688 return NULL;
1689
1690 /* Try the next biggest vector size. */
1691 current_vector_size = 1 << floor_log2 (vector_sizes);
1692 if (vect_print_dump_info (REPORT_DETAILS))
1693 fprintf (vect_dump, "***** Re-trying analysis with "
1694 "vector size %d\n", current_vector_size);
1695 }
1696 }
1697
1698
1699 /* Function reduction_code_for_scalar_code
1700
1701 Input:
1702 CODE - tree_code of a reduction operations.
1703
1704 Output:
1705 REDUC_CODE - the corresponding tree-code to be used to reduce the
1706 vector of partial results into a single scalar result (which
1707 will also reside in a vector) or ERROR_MARK if the operation is
1708 a supported reduction operation, but does not have such tree-code.
1709
1710 Return FALSE if CODE currently cannot be vectorized as reduction. */
1711
1712 static bool
1713 reduction_code_for_scalar_code (enum tree_code code,
1714 enum tree_code *reduc_code)
1715 {
1716 switch (code)
1717 {
1718 case MAX_EXPR:
1719 *reduc_code = REDUC_MAX_EXPR;
1720 return true;
1721
1722 case MIN_EXPR:
1723 *reduc_code = REDUC_MIN_EXPR;
1724 return true;
1725
1726 case PLUS_EXPR:
1727 *reduc_code = REDUC_PLUS_EXPR;
1728 return true;
1729
1730 case MULT_EXPR:
1731 case MINUS_EXPR:
1732 case BIT_IOR_EXPR:
1733 case BIT_XOR_EXPR:
1734 case BIT_AND_EXPR:
1735 *reduc_code = ERROR_MARK;
1736 return true;
1737
1738 default:
1739 return false;
1740 }
1741 }
1742
1743
1744 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1745 STMT is printed with a message MSG. */
1746
1747 static void
1748 report_vect_op (gimple stmt, const char *msg)
1749 {
1750 fprintf (vect_dump, "%s", msg);
1751 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1752 }
1753
1754
1755 /* Detect SLP reduction of the form:
1756
1757 #a1 = phi <a5, a0>
1758 a2 = operation (a1)
1759 a3 = operation (a2)
1760 a4 = operation (a3)
1761 a5 = operation (a4)
1762
1763 #a = phi <a5>
1764
1765 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1766 FIRST_STMT is the first reduction stmt in the chain
1767 (a2 = operation (a1)).
1768
1769 Return TRUE if a reduction chain was detected. */
1770
1771 static bool
1772 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1773 {
1774 struct loop *loop = (gimple_bb (phi))->loop_father;
1775 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1776 enum tree_code code;
1777 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1778 stmt_vec_info use_stmt_info, current_stmt_info;
1779 tree lhs;
1780 imm_use_iterator imm_iter;
1781 use_operand_p use_p;
1782 int nloop_uses, size = 0, n_out_of_loop_uses;
1783 bool found = false;
1784
1785 if (loop != vect_loop)
1786 return false;
1787
1788 lhs = PHI_RESULT (phi);
1789 code = gimple_assign_rhs_code (first_stmt);
1790 while (1)
1791 {
1792 nloop_uses = 0;
1793 n_out_of_loop_uses = 0;
1794 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1795 {
1796 gimple use_stmt = USE_STMT (use_p);
1797 if (is_gimple_debug (use_stmt))
1798 continue;
1799
1800 use_stmt = USE_STMT (use_p);
1801
1802 /* Check if we got back to the reduction phi. */
1803 if (use_stmt == phi)
1804 {
1805 loop_use_stmt = use_stmt;
1806 found = true;
1807 break;
1808 }
1809
1810 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1811 {
1812 if (vinfo_for_stmt (use_stmt)
1813 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1814 {
1815 loop_use_stmt = use_stmt;
1816 nloop_uses++;
1817 }
1818 }
1819 else
1820 n_out_of_loop_uses++;
1821
1822 /* There are can be either a single use in the loop or two uses in
1823 phi nodes. */
1824 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1825 return false;
1826 }
1827
1828 if (found)
1829 break;
1830
1831 /* We reached a statement with no loop uses. */
1832 if (nloop_uses == 0)
1833 return false;
1834
1835 /* This is a loop exit phi, and we haven't reached the reduction phi. */
1836 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1837 return false;
1838
1839 if (!is_gimple_assign (loop_use_stmt)
1840 || code != gimple_assign_rhs_code (loop_use_stmt)
1841 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1842 return false;
1843
1844 /* Insert USE_STMT into reduction chain. */
1845 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1846 if (current_stmt)
1847 {
1848 current_stmt_info = vinfo_for_stmt (current_stmt);
1849 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1850 GROUP_FIRST_ELEMENT (use_stmt_info)
1851 = GROUP_FIRST_ELEMENT (current_stmt_info);
1852 }
1853 else
1854 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1855
1856 lhs = gimple_assign_lhs (loop_use_stmt);
1857 current_stmt = loop_use_stmt;
1858 size++;
1859 }
1860
1861 if (!found || loop_use_stmt != phi || size < 2)
1862 return false;
1863
1864 /* Swap the operands, if needed, to make the reduction operand be the second
1865 operand. */
1866 lhs = PHI_RESULT (phi);
1867 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1868 while (next_stmt)
1869 {
1870 if (gimple_assign_rhs2 (next_stmt) == lhs)
1871 {
1872 tree op = gimple_assign_rhs1 (next_stmt);
1873 gimple def_stmt = NULL;
1874
1875 if (TREE_CODE (op) == SSA_NAME)
1876 def_stmt = SSA_NAME_DEF_STMT (op);
1877
1878 /* Check that the other def is either defined in the loop
1879 ("vect_internal_def"), or it's an induction (defined by a
1880 loop-header phi-node). */
1881 if (def_stmt
1882 && gimple_bb (def_stmt)
1883 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1884 && (is_gimple_assign (def_stmt)
1885 || is_gimple_call (def_stmt)
1886 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1887 == vect_induction_def
1888 || (gimple_code (def_stmt) == GIMPLE_PHI
1889 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1890 == vect_internal_def
1891 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1892 {
1893 lhs = gimple_assign_lhs (next_stmt);
1894 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1895 continue;
1896 }
1897
1898 return false;
1899 }
1900 else
1901 {
1902 tree op = gimple_assign_rhs2 (next_stmt);
1903 gimple def_stmt = NULL;
1904
1905 if (TREE_CODE (op) == SSA_NAME)
1906 def_stmt = SSA_NAME_DEF_STMT (op);
1907
1908 /* Check that the other def is either defined in the loop
1909 ("vect_internal_def"), or it's an induction (defined by a
1910 loop-header phi-node). */
1911 if (def_stmt
1912 && gimple_bb (def_stmt)
1913 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1914 && (is_gimple_assign (def_stmt)
1915 || is_gimple_call (def_stmt)
1916 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1917 == vect_induction_def
1918 || (gimple_code (def_stmt) == GIMPLE_PHI
1919 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1920 == vect_internal_def
1921 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1922 {
1923 if (vect_print_dump_info (REPORT_DETAILS))
1924 {
1925 fprintf (vect_dump, "swapping oprnds: ");
1926 print_gimple_stmt (vect_dump, next_stmt, 0, TDF_SLIM);
1927 }
1928
1929 swap_tree_operands (next_stmt,
1930 gimple_assign_rhs1_ptr (next_stmt),
1931 gimple_assign_rhs2_ptr (next_stmt));
1932 update_stmt (next_stmt);
1933 }
1934 else
1935 return false;
1936 }
1937
1938 lhs = gimple_assign_lhs (next_stmt);
1939 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1940 }
1941
1942 /* Save the chain for further analysis in SLP detection. */
1943 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1944 VEC_safe_push (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_info), first);
1945 GROUP_SIZE (vinfo_for_stmt (first)) = size;
1946
1947 return true;
1948 }
1949
1950
1951 /* Function vect_is_simple_reduction_1
1952
1953 (1) Detect a cross-iteration def-use cycle that represents a simple
1954 reduction computation. We look for the following pattern:
1955
1956 loop_header:
1957 a1 = phi < a0, a2 >
1958 a3 = ...
1959 a2 = operation (a3, a1)
1960
1961 such that:
1962 1. operation is commutative and associative and it is safe to
1963 change the order of the computation (if CHECK_REDUCTION is true)
1964 2. no uses for a2 in the loop (a2 is used out of the loop)
1965 3. no uses of a1 in the loop besides the reduction operation
1966 4. no uses of a1 outside the loop.
1967
1968 Conditions 1,4 are tested here.
1969 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1970
1971 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1972 nested cycles, if CHECK_REDUCTION is false.
1973
1974 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1975 reductions:
1976
1977 a1 = phi < a0, a2 >
1978 inner loop (def of a3)
1979 a2 = phi < a3 >
1980
1981 If MODIFY is true it tries also to rework the code in-place to enable
1982 detection of more reduction patterns. For the time being we rewrite
1983 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
1984 */
1985
1986 static gimple
1987 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
1988 bool check_reduction, bool *double_reduc,
1989 bool modify)
1990 {
1991 struct loop *loop = (gimple_bb (phi))->loop_father;
1992 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1993 edge latch_e = loop_latch_edge (loop);
1994 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1995 gimple def_stmt, def1 = NULL, def2 = NULL;
1996 enum tree_code orig_code, code;
1997 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1998 tree type;
1999 int nloop_uses;
2000 tree name;
2001 imm_use_iterator imm_iter;
2002 use_operand_p use_p;
2003 bool phi_def;
2004
2005 *double_reduc = false;
2006
2007 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2008 otherwise, we assume outer loop vectorization. */
2009 gcc_assert ((check_reduction && loop == vect_loop)
2010 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2011
2012 name = PHI_RESULT (phi);
2013 nloop_uses = 0;
2014 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2015 {
2016 gimple use_stmt = USE_STMT (use_p);
2017 if (is_gimple_debug (use_stmt))
2018 continue;
2019
2020 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2021 {
2022 if (vect_print_dump_info (REPORT_DETAILS))
2023 fprintf (vect_dump, "intermediate value used outside loop.");
2024
2025 return NULL;
2026 }
2027
2028 if (vinfo_for_stmt (use_stmt)
2029 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2030 nloop_uses++;
2031 if (nloop_uses > 1)
2032 {
2033 if (vect_print_dump_info (REPORT_DETAILS))
2034 fprintf (vect_dump, "reduction used in loop.");
2035 return NULL;
2036 }
2037 }
2038
2039 if (TREE_CODE (loop_arg) != SSA_NAME)
2040 {
2041 if (vect_print_dump_info (REPORT_DETAILS))
2042 {
2043 fprintf (vect_dump, "reduction: not ssa_name: ");
2044 print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
2045 }
2046 return NULL;
2047 }
2048
2049 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2050 if (!def_stmt)
2051 {
2052 if (vect_print_dump_info (REPORT_DETAILS))
2053 fprintf (vect_dump, "reduction: no def_stmt.");
2054 return NULL;
2055 }
2056
2057 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2058 {
2059 if (vect_print_dump_info (REPORT_DETAILS))
2060 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
2061 return NULL;
2062 }
2063
2064 if (is_gimple_assign (def_stmt))
2065 {
2066 name = gimple_assign_lhs (def_stmt);
2067 phi_def = false;
2068 }
2069 else
2070 {
2071 name = PHI_RESULT (def_stmt);
2072 phi_def = true;
2073 }
2074
2075 nloop_uses = 0;
2076 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2077 {
2078 gimple use_stmt = USE_STMT (use_p);
2079 if (is_gimple_debug (use_stmt))
2080 continue;
2081 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2082 && vinfo_for_stmt (use_stmt)
2083 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2084 nloop_uses++;
2085 if (nloop_uses > 1)
2086 {
2087 if (vect_print_dump_info (REPORT_DETAILS))
2088 fprintf (vect_dump, "reduction used in loop.");
2089 return NULL;
2090 }
2091 }
2092
2093 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2094 defined in the inner loop. */
2095 if (phi_def)
2096 {
2097 op1 = PHI_ARG_DEF (def_stmt, 0);
2098
2099 if (gimple_phi_num_args (def_stmt) != 1
2100 || TREE_CODE (op1) != SSA_NAME)
2101 {
2102 if (vect_print_dump_info (REPORT_DETAILS))
2103 fprintf (vect_dump, "unsupported phi node definition.");
2104
2105 return NULL;
2106 }
2107
2108 def1 = SSA_NAME_DEF_STMT (op1);
2109 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2110 && loop->inner
2111 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2112 && is_gimple_assign (def1))
2113 {
2114 if (vect_print_dump_info (REPORT_DETAILS))
2115 report_vect_op (def_stmt, "detected double reduction: ");
2116
2117 *double_reduc = true;
2118 return def_stmt;
2119 }
2120
2121 return NULL;
2122 }
2123
2124 code = orig_code = gimple_assign_rhs_code (def_stmt);
2125
2126 /* We can handle "res -= x[i]", which is non-associative by
2127 simply rewriting this into "res += -x[i]". Avoid changing
2128 gimple instruction for the first simple tests and only do this
2129 if we're allowed to change code at all. */
2130 if (code == MINUS_EXPR
2131 && modify
2132 && (op1 = gimple_assign_rhs1 (def_stmt))
2133 && TREE_CODE (op1) == SSA_NAME
2134 && SSA_NAME_DEF_STMT (op1) == phi)
2135 code = PLUS_EXPR;
2136
2137 if (check_reduction
2138 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2139 {
2140 if (vect_print_dump_info (REPORT_DETAILS))
2141 report_vect_op (def_stmt, "reduction: not commutative/associative: ");
2142 return NULL;
2143 }
2144
2145 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2146 {
2147 if (code != COND_EXPR)
2148 {
2149 if (vect_print_dump_info (REPORT_DETAILS))
2150 report_vect_op (def_stmt, "reduction: not binary operation: ");
2151
2152 return NULL;
2153 }
2154
2155 op3 = gimple_assign_rhs1 (def_stmt);
2156 if (COMPARISON_CLASS_P (op3))
2157 {
2158 op4 = TREE_OPERAND (op3, 1);
2159 op3 = TREE_OPERAND (op3, 0);
2160 }
2161
2162 op1 = gimple_assign_rhs2 (def_stmt);
2163 op2 = gimple_assign_rhs3 (def_stmt);
2164
2165 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2166 {
2167 if (vect_print_dump_info (REPORT_DETAILS))
2168 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
2169
2170 return NULL;
2171 }
2172 }
2173 else
2174 {
2175 op1 = gimple_assign_rhs1 (def_stmt);
2176 op2 = gimple_assign_rhs2 (def_stmt);
2177
2178 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2179 {
2180 if (vect_print_dump_info (REPORT_DETAILS))
2181 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
2182
2183 return NULL;
2184 }
2185 }
2186
2187 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2188 if ((TREE_CODE (op1) == SSA_NAME
2189 && !types_compatible_p (type,TREE_TYPE (op1)))
2190 || (TREE_CODE (op2) == SSA_NAME
2191 && !types_compatible_p (type, TREE_TYPE (op2)))
2192 || (op3 && TREE_CODE (op3) == SSA_NAME
2193 && !types_compatible_p (type, TREE_TYPE (op3)))
2194 || (op4 && TREE_CODE (op4) == SSA_NAME
2195 && !types_compatible_p (type, TREE_TYPE (op4))))
2196 {
2197 if (vect_print_dump_info (REPORT_DETAILS))
2198 {
2199 fprintf (vect_dump, "reduction: multiple types: operation type: ");
2200 print_generic_expr (vect_dump, type, TDF_SLIM);
2201 fprintf (vect_dump, ", operands types: ");
2202 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
2203 fprintf (vect_dump, ",");
2204 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
2205 if (op3)
2206 {
2207 fprintf (vect_dump, ",");
2208 print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
2209 }
2210
2211 if (op4)
2212 {
2213 fprintf (vect_dump, ",");
2214 print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
2215 }
2216 }
2217
2218 return NULL;
2219 }
2220
2221 /* Check that it's ok to change the order of the computation.
2222 Generally, when vectorizing a reduction we change the order of the
2223 computation. This may change the behavior of the program in some
2224 cases, so we need to check that this is ok. One exception is when
2225 vectorizing an outer-loop: the inner-loop is executed sequentially,
2226 and therefore vectorizing reductions in the inner-loop during
2227 outer-loop vectorization is safe. */
2228
2229 /* CHECKME: check for !flag_finite_math_only too? */
2230 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2231 && check_reduction)
2232 {
2233 /* Changing the order of operations changes the semantics. */
2234 if (vect_print_dump_info (REPORT_DETAILS))
2235 report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
2236 return NULL;
2237 }
2238 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2239 && check_reduction)
2240 {
2241 /* Changing the order of operations changes the semantics. */
2242 if (vect_print_dump_info (REPORT_DETAILS))
2243 report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
2244 return NULL;
2245 }
2246 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2247 {
2248 /* Changing the order of operations changes the semantics. */
2249 if (vect_print_dump_info (REPORT_DETAILS))
2250 report_vect_op (def_stmt,
2251 "reduction: unsafe fixed-point math optimization: ");
2252 return NULL;
2253 }
2254
2255 /* If we detected "res -= x[i]" earlier, rewrite it into
2256 "res += -x[i]" now. If this turns out to be useless reassoc
2257 will clean it up again. */
2258 if (orig_code == MINUS_EXPR)
2259 {
2260 tree rhs = gimple_assign_rhs2 (def_stmt);
2261 tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
2262 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2263 rhs, NULL);
2264 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2265 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2266 loop_info, NULL));
2267 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2268 gimple_assign_set_rhs2 (def_stmt, negrhs);
2269 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2270 update_stmt (def_stmt);
2271 }
2272
2273 /* Reduction is safe. We're dealing with one of the following:
2274 1) integer arithmetic and no trapv
2275 2) floating point arithmetic, and special flags permit this optimization
2276 3) nested cycle (i.e., outer loop vectorization). */
2277 if (TREE_CODE (op1) == SSA_NAME)
2278 def1 = SSA_NAME_DEF_STMT (op1);
2279
2280 if (TREE_CODE (op2) == SSA_NAME)
2281 def2 = SSA_NAME_DEF_STMT (op2);
2282
2283 if (code != COND_EXPR
2284 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2285 {
2286 if (vect_print_dump_info (REPORT_DETAILS))
2287 report_vect_op (def_stmt, "reduction: no defs for operands: ");
2288 return NULL;
2289 }
2290
2291 /* Check that one def is the reduction def, defined by PHI,
2292 the other def is either defined in the loop ("vect_internal_def"),
2293 or it's an induction (defined by a loop-header phi-node). */
2294
2295 if (def2 && def2 == phi
2296 && (code == COND_EXPR
2297 || !def1 || gimple_nop_p (def1)
2298 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2299 && (is_gimple_assign (def1)
2300 || is_gimple_call (def1)
2301 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2302 == vect_induction_def
2303 || (gimple_code (def1) == GIMPLE_PHI
2304 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2305 == vect_internal_def
2306 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2307 {
2308 if (vect_print_dump_info (REPORT_DETAILS))
2309 report_vect_op (def_stmt, "detected reduction: ");
2310 return def_stmt;
2311 }
2312
2313 if (def1 && def1 == phi
2314 && (code == COND_EXPR
2315 || !def2 || gimple_nop_p (def2)
2316 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2317 && (is_gimple_assign (def2)
2318 || is_gimple_call (def2)
2319 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2320 == vect_induction_def
2321 || (gimple_code (def2) == GIMPLE_PHI
2322 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2323 == vect_internal_def
2324 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2325 {
2326 if (check_reduction)
2327 {
2328 /* Swap operands (just for simplicity - so that the rest of the code
2329 can assume that the reduction variable is always the last (second)
2330 argument). */
2331 if (vect_print_dump_info (REPORT_DETAILS))
2332 report_vect_op (def_stmt,
2333 "detected reduction: need to swap operands: ");
2334
2335 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2336 gimple_assign_rhs2_ptr (def_stmt));
2337 }
2338 else
2339 {
2340 if (vect_print_dump_info (REPORT_DETAILS))
2341 report_vect_op (def_stmt, "detected reduction: ");
2342 }
2343
2344 return def_stmt;
2345 }
2346
2347 /* Try to find SLP reduction chain. */
2348 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2349 {
2350 if (vect_print_dump_info (REPORT_DETAILS))
2351 report_vect_op (def_stmt, "reduction: detected reduction chain: ");
2352
2353 return def_stmt;
2354 }
2355
2356 if (vect_print_dump_info (REPORT_DETAILS))
2357 report_vect_op (def_stmt, "reduction: unknown pattern: ");
2358
2359 return NULL;
2360 }
2361
2362 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2363 in-place. Arguments as there. */
2364
2365 static gimple
2366 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2367 bool check_reduction, bool *double_reduc)
2368 {
2369 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2370 double_reduc, false);
2371 }
2372
2373 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2374 in-place if it enables detection of more reductions. Arguments
2375 as there. */
2376
2377 gimple
2378 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2379 bool check_reduction, bool *double_reduc)
2380 {
2381 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2382 double_reduc, true);
2383 }
2384
2385 /* Calculate the cost of one scalar iteration of the loop. */
2386 int
2387 vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo)
2388 {
2389 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2390 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2391 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2392 int innerloop_iters, i, stmt_cost;
2393
2394 /* Count statements in scalar loop. Using this as scalar cost for a single
2395 iteration for now.
2396
2397 TODO: Add outer loop support.
2398
2399 TODO: Consider assigning different costs to different scalar
2400 statements. */
2401
2402 /* FORNOW. */
2403 innerloop_iters = 1;
2404 if (loop->inner)
2405 innerloop_iters = 50; /* FIXME */
2406
2407 for (i = 0; i < nbbs; i++)
2408 {
2409 gimple_stmt_iterator si;
2410 basic_block bb = bbs[i];
2411
2412 if (bb->loop_father == loop->inner)
2413 factor = innerloop_iters;
2414 else
2415 factor = 1;
2416
2417 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2418 {
2419 gimple stmt = gsi_stmt (si);
2420 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2421
2422 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2423 continue;
2424
2425 /* Skip stmts that are not vectorized inside the loop. */
2426 if (stmt_info
2427 && !STMT_VINFO_RELEVANT_P (stmt_info)
2428 && (!STMT_VINFO_LIVE_P (stmt_info)
2429 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2430 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2431 continue;
2432
2433 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2434 {
2435 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2436 stmt_cost = vect_get_cost (scalar_load);
2437 else
2438 stmt_cost = vect_get_cost (scalar_store);
2439 }
2440 else
2441 stmt_cost = vect_get_cost (scalar_stmt);
2442
2443 scalar_single_iter_cost += stmt_cost * factor;
2444 }
2445 }
2446 return scalar_single_iter_cost;
2447 }
2448
2449 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2450 int
2451 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2452 int *peel_iters_epilogue,
2453 int scalar_single_iter_cost)
2454 {
2455 int peel_guard_costs = 0;
2456 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2457
2458 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2459 {
2460 *peel_iters_epilogue = vf/2;
2461 if (vect_print_dump_info (REPORT_COST))
2462 fprintf (vect_dump, "cost model: "
2463 "epilogue peel iters set to vf/2 because "
2464 "loop iterations are unknown .");
2465
2466 /* If peeled iterations are known but number of scalar loop
2467 iterations are unknown, count a taken branch per peeled loop. */
2468 peel_guard_costs = 2 * vect_get_cost (cond_branch_taken);
2469 }
2470 else
2471 {
2472 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2473 peel_iters_prologue = niters < peel_iters_prologue ?
2474 niters : peel_iters_prologue;
2475 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2476 /* If we need to peel for gaps, but no peeling is required, we have to
2477 peel VF iterations. */
2478 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2479 *peel_iters_epilogue = vf;
2480 }
2481
2482 return (peel_iters_prologue * scalar_single_iter_cost)
2483 + (*peel_iters_epilogue * scalar_single_iter_cost)
2484 + peel_guard_costs;
2485 }
2486
2487 /* Function vect_estimate_min_profitable_iters
2488
2489 Return the number of iterations required for the vector version of the
2490 loop to be profitable relative to the cost of the scalar version of the
2491 loop.
2492
2493 TODO: Take profile info into account before making vectorization
2494 decisions, if available. */
2495
2496 int
2497 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
2498 {
2499 int i;
2500 int min_profitable_iters;
2501 int peel_iters_prologue;
2502 int peel_iters_epilogue;
2503 int vec_inside_cost = 0;
2504 int vec_outside_cost = 0;
2505 int scalar_single_iter_cost = 0;
2506 int scalar_outside_cost = 0;
2507 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2508 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2509 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2510 int nbbs = loop->num_nodes;
2511 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2512 int peel_guard_costs = 0;
2513 int innerloop_iters = 0, factor;
2514 VEC (slp_instance, heap) *slp_instances;
2515 slp_instance instance;
2516
2517 /* Cost model disabled. */
2518 if (!flag_vect_cost_model)
2519 {
2520 if (vect_print_dump_info (REPORT_COST))
2521 fprintf (vect_dump, "cost model disabled.");
2522 return 0;
2523 }
2524
2525 /* Requires loop versioning tests to handle misalignment. */
2526 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2527 {
2528 /* FIXME: Make cost depend on complexity of individual check. */
2529 vec_outside_cost +=
2530 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2531 if (vect_print_dump_info (REPORT_COST))
2532 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2533 "versioning to treat misalignment.\n");
2534 }
2535
2536 /* Requires loop versioning with alias checks. */
2537 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2538 {
2539 /* FIXME: Make cost depend on complexity of individual check. */
2540 vec_outside_cost +=
2541 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2542 if (vect_print_dump_info (REPORT_COST))
2543 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2544 "versioning aliasing.\n");
2545 }
2546
2547 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2548 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2549 vec_outside_cost += vect_get_cost (cond_branch_taken);
2550
2551 /* Count statements in scalar loop. Using this as scalar cost for a single
2552 iteration for now.
2553
2554 TODO: Add outer loop support.
2555
2556 TODO: Consider assigning different costs to different scalar
2557 statements. */
2558
2559 /* FORNOW. */
2560 if (loop->inner)
2561 innerloop_iters = 50; /* FIXME */
2562
2563 for (i = 0; i < nbbs; i++)
2564 {
2565 gimple_stmt_iterator si;
2566 basic_block bb = bbs[i];
2567
2568 if (bb->loop_father == loop->inner)
2569 factor = innerloop_iters;
2570 else
2571 factor = 1;
2572
2573 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2574 {
2575 gimple stmt = gsi_stmt (si);
2576 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2577
2578 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2579 {
2580 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2581 stmt_info = vinfo_for_stmt (stmt);
2582 }
2583
2584 /* Skip stmts that are not vectorized inside the loop. */
2585 if (!STMT_VINFO_RELEVANT_P (stmt_info)
2586 && (!STMT_VINFO_LIVE_P (stmt_info)
2587 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))))
2588 continue;
2589
2590 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2591 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2592 some of the "outside" costs are generated inside the outer-loop. */
2593 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2594 if (is_pattern_stmt_p (stmt_info)
2595 && STMT_VINFO_PATTERN_DEF_SEQ (stmt_info))
2596 {
2597 gimple_stmt_iterator gsi;
2598
2599 for (gsi = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2600 !gsi_end_p (gsi); gsi_next (&gsi))
2601 {
2602 gimple pattern_def_stmt = gsi_stmt (gsi);
2603 stmt_vec_info pattern_def_stmt_info
2604 = vinfo_for_stmt (pattern_def_stmt);
2605 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
2606 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
2607 {
2608 vec_inside_cost
2609 += STMT_VINFO_INSIDE_OF_LOOP_COST
2610 (pattern_def_stmt_info) * factor;
2611 vec_outside_cost
2612 += STMT_VINFO_OUTSIDE_OF_LOOP_COST
2613 (pattern_def_stmt_info);
2614 }
2615 }
2616 }
2617 }
2618 }
2619
2620 scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
2621
2622 /* Add additional cost for the peeled instructions in prologue and epilogue
2623 loop.
2624
2625 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2626 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2627
2628 TODO: Build an expression that represents peel_iters for prologue and
2629 epilogue to be used in a run-time test. */
2630
2631 if (npeel < 0)
2632 {
2633 peel_iters_prologue = vf/2;
2634 if (vect_print_dump_info (REPORT_COST))
2635 fprintf (vect_dump, "cost model: "
2636 "prologue peel iters set to vf/2.");
2637
2638 /* If peeling for alignment is unknown, loop bound of main loop becomes
2639 unknown. */
2640 peel_iters_epilogue = vf/2;
2641 if (vect_print_dump_info (REPORT_COST))
2642 fprintf (vect_dump, "cost model: "
2643 "epilogue peel iters set to vf/2 because "
2644 "peeling for alignment is unknown .");
2645
2646 /* If peeled iterations are unknown, count a taken branch and a not taken
2647 branch per peeled loop. Even if scalar loop iterations are known,
2648 vector iterations are not known since peeled prologue iterations are
2649 not known. Hence guards remain the same. */
2650 peel_guard_costs += 2 * (vect_get_cost (cond_branch_taken)
2651 + vect_get_cost (cond_branch_not_taken));
2652 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2653 + (peel_iters_epilogue * scalar_single_iter_cost)
2654 + peel_guard_costs;
2655 }
2656 else
2657 {
2658 peel_iters_prologue = npeel;
2659 vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2660 peel_iters_prologue, &peel_iters_epilogue,
2661 scalar_single_iter_cost);
2662 }
2663
2664 /* FORNOW: The scalar outside cost is incremented in one of the
2665 following ways:
2666
2667 1. The vectorizer checks for alignment and aliasing and generates
2668 a condition that allows dynamic vectorization. A cost model
2669 check is ANDED with the versioning condition. Hence scalar code
2670 path now has the added cost of the versioning check.
2671
2672 if (cost > th & versioning_check)
2673 jmp to vector code
2674
2675 Hence run-time scalar is incremented by not-taken branch cost.
2676
2677 2. The vectorizer then checks if a prologue is required. If the
2678 cost model check was not done before during versioning, it has to
2679 be done before the prologue check.
2680
2681 if (cost <= th)
2682 prologue = scalar_iters
2683 if (prologue == 0)
2684 jmp to vector code
2685 else
2686 execute prologue
2687 if (prologue == num_iters)
2688 go to exit
2689
2690 Hence the run-time scalar cost is incremented by a taken branch,
2691 plus a not-taken branch, plus a taken branch cost.
2692
2693 3. The vectorizer then checks if an epilogue is required. If the
2694 cost model check was not done before during prologue check, it
2695 has to be done with the epilogue check.
2696
2697 if (prologue == 0)
2698 jmp to vector code
2699 else
2700 execute prologue
2701 if (prologue == num_iters)
2702 go to exit
2703 vector code:
2704 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2705 jmp to epilogue
2706
2707 Hence the run-time scalar cost should be incremented by 2 taken
2708 branches.
2709
2710 TODO: The back end may reorder the BBS's differently and reverse
2711 conditions/branch directions. Change the estimates below to
2712 something more reasonable. */
2713
2714 /* If the number of iterations is known and we do not do versioning, we can
2715 decide whether to vectorize at compile time. Hence the scalar version
2716 do not carry cost model guard costs. */
2717 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2718 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2719 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2720 {
2721 /* Cost model check occurs at versioning. */
2722 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2723 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2724 scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2725 else
2726 {
2727 /* Cost model check occurs at prologue generation. */
2728 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2729 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2730 + vect_get_cost (cond_branch_not_taken);
2731 /* Cost model check occurs at epilogue generation. */
2732 else
2733 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2734 }
2735 }
2736
2737 /* Add SLP costs. */
2738 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2739 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance)
2740 {
2741 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2742 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2743 }
2744
2745 /* Calculate number of iterations required to make the vector version
2746 profitable, relative to the loop bodies only. The following condition
2747 must hold true:
2748 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2749 where
2750 SIC = scalar iteration cost, VIC = vector iteration cost,
2751 VOC = vector outside cost, VF = vectorization factor,
2752 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2753 SOC = scalar outside cost for run time cost model check. */
2754
2755 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2756 {
2757 if (vec_outside_cost <= 0)
2758 min_profitable_iters = 1;
2759 else
2760 {
2761 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2762 - vec_inside_cost * peel_iters_prologue
2763 - vec_inside_cost * peel_iters_epilogue)
2764 / ((scalar_single_iter_cost * vf)
2765 - vec_inside_cost);
2766
2767 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2768 <= ((vec_inside_cost * min_profitable_iters)
2769 + ((vec_outside_cost - scalar_outside_cost) * vf)))
2770 min_profitable_iters++;
2771 }
2772 }
2773 /* vector version will never be profitable. */
2774 else
2775 {
2776 if (vect_print_dump_info (REPORT_COST))
2777 fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2778 "divided by the scalar iteration cost = %d "
2779 "is greater or equal to the vectorization factor = %d.",
2780 vec_inside_cost, scalar_single_iter_cost, vf);
2781 return -1;
2782 }
2783
2784 if (vect_print_dump_info (REPORT_COST))
2785 {
2786 fprintf (vect_dump, "Cost model analysis: \n");
2787 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
2788 vec_inside_cost);
2789 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
2790 vec_outside_cost);
2791 fprintf (vect_dump, " Scalar iteration cost: %d\n",
2792 scalar_single_iter_cost);
2793 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
2794 fprintf (vect_dump, " prologue iterations: %d\n",
2795 peel_iters_prologue);
2796 fprintf (vect_dump, " epilogue iterations: %d\n",
2797 peel_iters_epilogue);
2798 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
2799 min_profitable_iters);
2800 }
2801
2802 min_profitable_iters =
2803 min_profitable_iters < vf ? vf : min_profitable_iters;
2804
2805 /* Because the condition we create is:
2806 if (niters <= min_profitable_iters)
2807 then skip the vectorized loop. */
2808 min_profitable_iters--;
2809
2810 if (vect_print_dump_info (REPORT_COST))
2811 fprintf (vect_dump, " Profitability threshold = %d\n",
2812 min_profitable_iters);
2813
2814 return min_profitable_iters;
2815 }
2816
2817
2818 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2819 functions. Design better to avoid maintenance issues. */
2820
2821 /* Function vect_model_reduction_cost.
2822
2823 Models cost for a reduction operation, including the vector ops
2824 generated within the strip-mine loop, the initial definition before
2825 the loop, and the epilogue code that must be generated. */
2826
2827 static bool
2828 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2829 int ncopies)
2830 {
2831 int outer_cost = 0;
2832 enum tree_code code;
2833 optab optab;
2834 tree vectype;
2835 gimple stmt, orig_stmt;
2836 tree reduction_op;
2837 enum machine_mode mode;
2838 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2839 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2840
2841
2842 /* Cost of reduction op inside loop. */
2843 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2844 += ncopies * vect_get_cost (vector_stmt);
2845
2846 stmt = STMT_VINFO_STMT (stmt_info);
2847
2848 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2849 {
2850 case GIMPLE_SINGLE_RHS:
2851 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2852 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2853 break;
2854 case GIMPLE_UNARY_RHS:
2855 reduction_op = gimple_assign_rhs1 (stmt);
2856 break;
2857 case GIMPLE_BINARY_RHS:
2858 reduction_op = gimple_assign_rhs2 (stmt);
2859 break;
2860 case GIMPLE_TERNARY_RHS:
2861 reduction_op = gimple_assign_rhs3 (stmt);
2862 break;
2863 default:
2864 gcc_unreachable ();
2865 }
2866
2867 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2868 if (!vectype)
2869 {
2870 if (vect_print_dump_info (REPORT_COST))
2871 {
2872 fprintf (vect_dump, "unsupported data-type ");
2873 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2874 }
2875 return false;
2876 }
2877
2878 mode = TYPE_MODE (vectype);
2879 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2880
2881 if (!orig_stmt)
2882 orig_stmt = STMT_VINFO_STMT (stmt_info);
2883
2884 code = gimple_assign_rhs_code (orig_stmt);
2885
2886 /* Add in cost for initial definition. */
2887 outer_cost += vect_get_cost (scalar_to_vec);
2888
2889 /* Determine cost of epilogue code.
2890
2891 We have a reduction operator that will reduce the vector in one statement.
2892 Also requires scalar extract. */
2893
2894 if (!nested_in_vect_loop_p (loop, orig_stmt))
2895 {
2896 if (reduc_code != ERROR_MARK)
2897 outer_cost += vect_get_cost (vector_stmt)
2898 + vect_get_cost (vec_to_scalar);
2899 else
2900 {
2901 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2902 tree bitsize =
2903 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2904 int element_bitsize = tree_low_cst (bitsize, 1);
2905 int nelements = vec_size_in_bits / element_bitsize;
2906
2907 optab = optab_for_tree_code (code, vectype, optab_default);
2908
2909 /* We have a whole vector shift available. */
2910 if (VECTOR_MODE_P (mode)
2911 && optab_handler (optab, mode) != CODE_FOR_nothing
2912 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2913 /* Final reduction via vector shifts and the reduction operator. Also
2914 requires scalar extract. */
2915 outer_cost += ((exact_log2(nelements) * 2)
2916 * vect_get_cost (vector_stmt)
2917 + vect_get_cost (vec_to_scalar));
2918 else
2919 /* Use extracts and reduction op for final reduction. For N elements,
2920 we have N extracts and N-1 reduction ops. */
2921 outer_cost += ((nelements + nelements - 1)
2922 * vect_get_cost (vector_stmt));
2923 }
2924 }
2925
2926 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2927
2928 if (vect_print_dump_info (REPORT_COST))
2929 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2930 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2931 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2932
2933 return true;
2934 }
2935
2936
2937 /* Function vect_model_induction_cost.
2938
2939 Models cost for induction operations. */
2940
2941 static void
2942 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2943 {
2944 /* loop cost for vec_loop. */
2945 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2946 = ncopies * vect_get_cost (vector_stmt);
2947 /* prologue cost for vec_init and vec_step. */
2948 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2949 = 2 * vect_get_cost (scalar_to_vec);
2950
2951 if (vect_print_dump_info (REPORT_COST))
2952 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2953 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2954 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2955 }
2956
2957
2958 /* Function get_initial_def_for_induction
2959
2960 Input:
2961 STMT - a stmt that performs an induction operation in the loop.
2962 IV_PHI - the initial value of the induction variable
2963
2964 Output:
2965 Return a vector variable, initialized with the first VF values of
2966 the induction variable. E.g., for an iv with IV_PHI='X' and
2967 evolution S, for a vector of 4 units, we want to return:
2968 [X, X + S, X + 2*S, X + 3*S]. */
2969
2970 static tree
2971 get_initial_def_for_induction (gimple iv_phi)
2972 {
2973 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2974 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2975 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2976 tree scalar_type;
2977 tree vectype;
2978 int nunits;
2979 edge pe = loop_preheader_edge (loop);
2980 struct loop *iv_loop;
2981 basic_block new_bb;
2982 tree vec, vec_init, vec_step, t;
2983 tree access_fn;
2984 tree new_var;
2985 tree new_name;
2986 gimple init_stmt, induction_phi, new_stmt;
2987 tree induc_def, vec_def, vec_dest;
2988 tree init_expr, step_expr;
2989 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2990 int i;
2991 bool ok;
2992 int ncopies;
2993 tree expr;
2994 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2995 bool nested_in_vect_loop = false;
2996 gimple_seq stmts = NULL;
2997 imm_use_iterator imm_iter;
2998 use_operand_p use_p;
2999 gimple exit_phi;
3000 edge latch_e;
3001 tree loop_arg;
3002 gimple_stmt_iterator si;
3003 basic_block bb = gimple_bb (iv_phi);
3004 tree stepvectype;
3005 tree resvectype;
3006
3007 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3008 if (nested_in_vect_loop_p (loop, iv_phi))
3009 {
3010 nested_in_vect_loop = true;
3011 iv_loop = loop->inner;
3012 }
3013 else
3014 iv_loop = loop;
3015 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3016
3017 latch_e = loop_latch_edge (iv_loop);
3018 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3019
3020 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
3021 gcc_assert (access_fn);
3022 STRIP_NOPS (access_fn);
3023 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
3024 &init_expr, &step_expr);
3025 gcc_assert (ok);
3026 pe = loop_preheader_edge (iv_loop);
3027
3028 scalar_type = TREE_TYPE (init_expr);
3029 vectype = get_vectype_for_scalar_type (scalar_type);
3030 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3031 gcc_assert (vectype);
3032 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3033 ncopies = vf / nunits;
3034
3035 gcc_assert (phi_info);
3036 gcc_assert (ncopies >= 1);
3037
3038 /* Find the first insertion point in the BB. */
3039 si = gsi_after_labels (bb);
3040
3041 /* Create the vector that holds the initial_value of the induction. */
3042 if (nested_in_vect_loop)
3043 {
3044 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3045 been created during vectorization of previous stmts. We obtain it
3046 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3047 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3048 loop_preheader_edge (iv_loop));
3049 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
3050 }
3051 else
3052 {
3053 VEC(constructor_elt,gc) *v;
3054
3055 /* iv_loop is the loop to be vectorized. Create:
3056 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3057 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
3058 add_referenced_var (new_var);
3059
3060 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
3061 if (stmts)
3062 {
3063 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3064 gcc_assert (!new_bb);
3065 }
3066
3067 v = VEC_alloc (constructor_elt, gc, nunits);
3068 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3069 for (i = 1; i < nunits; i++)
3070 {
3071 /* Create: new_name_i = new_name + step_expr */
3072 enum tree_code code = POINTER_TYPE_P (scalar_type)
3073 ? POINTER_PLUS_EXPR : PLUS_EXPR;
3074 init_stmt = gimple_build_assign_with_ops (code, new_var,
3075 new_name, step_expr);
3076 new_name = make_ssa_name (new_var, init_stmt);
3077 gimple_assign_set_lhs (init_stmt, new_name);
3078
3079 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3080 gcc_assert (!new_bb);
3081
3082 if (vect_print_dump_info (REPORT_DETAILS))
3083 {
3084 fprintf (vect_dump, "created new init_stmt: ");
3085 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
3086 }
3087 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3088 }
3089 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3090 vec = build_constructor (vectype, v);
3091 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
3092 }
3093
3094
3095 /* Create the vector that holds the step of the induction. */
3096 if (nested_in_vect_loop)
3097 /* iv_loop is nested in the loop to be vectorized. Generate:
3098 vec_step = [S, S, S, S] */
3099 new_name = step_expr;
3100 else
3101 {
3102 /* iv_loop is the loop to be vectorized. Generate:
3103 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3104 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3105 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3106 expr, step_expr);
3107 }
3108
3109 t = unshare_expr (new_name);
3110 gcc_assert (CONSTANT_CLASS_P (new_name));
3111 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3112 gcc_assert (stepvectype);
3113 vec = build_vector_from_val (stepvectype, t);
3114 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
3115
3116
3117 /* Create the following def-use cycle:
3118 loop prolog:
3119 vec_init = ...
3120 vec_step = ...
3121 loop:
3122 vec_iv = PHI <vec_init, vec_loop>
3123 ...
3124 STMT
3125 ...
3126 vec_loop = vec_iv + vec_step; */
3127
3128 /* Create the induction-phi that defines the induction-operand. */
3129 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3130 add_referenced_var (vec_dest);
3131 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3132 set_vinfo_for_stmt (induction_phi,
3133 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3134 induc_def = PHI_RESULT (induction_phi);
3135
3136 /* Create the iv update inside the loop */
3137 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3138 induc_def, vec_step);
3139 vec_def = make_ssa_name (vec_dest, new_stmt);
3140 gimple_assign_set_lhs (new_stmt, vec_def);
3141 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3142 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3143 NULL));
3144
3145 /* Set the arguments of the phi node: */
3146 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3147 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3148 UNKNOWN_LOCATION);
3149
3150
3151 /* In case that vectorization factor (VF) is bigger than the number
3152 of elements that we can fit in a vectype (nunits), we have to generate
3153 more than one vector stmt - i.e - we need to "unroll" the
3154 vector stmt by a factor VF/nunits. For more details see documentation
3155 in vectorizable_operation. */
3156
3157 if (ncopies > 1)
3158 {
3159 stmt_vec_info prev_stmt_vinfo;
3160 /* FORNOW. This restriction should be relaxed. */
3161 gcc_assert (!nested_in_vect_loop);
3162
3163 /* Create the vector that holds the step of the induction. */
3164 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3165 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3166 expr, step_expr);
3167 t = unshare_expr (new_name);
3168 gcc_assert (CONSTANT_CLASS_P (new_name));
3169 vec = build_vector_from_val (stepvectype, t);
3170 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
3171
3172 vec_def = induc_def;
3173 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3174 for (i = 1; i < ncopies; i++)
3175 {
3176 /* vec_i = vec_prev + vec_step */
3177 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3178 vec_def, vec_step);
3179 vec_def = make_ssa_name (vec_dest, new_stmt);
3180 gimple_assign_set_lhs (new_stmt, vec_def);
3181
3182 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3183 if (!useless_type_conversion_p (resvectype, vectype))
3184 {
3185 new_stmt = gimple_build_assign_with_ops
3186 (VIEW_CONVERT_EXPR,
3187 vect_get_new_vect_var (resvectype, vect_simple_var,
3188 "vec_iv_"),
3189 build1 (VIEW_CONVERT_EXPR, resvectype,
3190 gimple_assign_lhs (new_stmt)), NULL_TREE);
3191 gimple_assign_set_lhs (new_stmt,
3192 make_ssa_name
3193 (gimple_assign_lhs (new_stmt), new_stmt));
3194 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3195 }
3196 set_vinfo_for_stmt (new_stmt,
3197 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3198 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3199 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3200 }
3201 }
3202
3203 if (nested_in_vect_loop)
3204 {
3205 /* Find the loop-closed exit-phi of the induction, and record
3206 the final vector of induction results: */
3207 exit_phi = NULL;
3208 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3209 {
3210 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3211 {
3212 exit_phi = USE_STMT (use_p);
3213 break;
3214 }
3215 }
3216 if (exit_phi)
3217 {
3218 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3219 /* FORNOW. Currently not supporting the case that an inner-loop induction
3220 is not used in the outer-loop (i.e. only outside the outer-loop). */
3221 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3222 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3223
3224 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3225 if (vect_print_dump_info (REPORT_DETAILS))
3226 {
3227 fprintf (vect_dump, "vector of inductions after inner-loop:");
3228 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
3229 }
3230 }
3231 }
3232
3233
3234 if (vect_print_dump_info (REPORT_DETAILS))
3235 {
3236 fprintf (vect_dump, "transform induction: created def-use cycle: ");
3237 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
3238 fprintf (vect_dump, "\n");
3239 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
3240 }
3241
3242 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3243 if (!useless_type_conversion_p (resvectype, vectype))
3244 {
3245 new_stmt = gimple_build_assign_with_ops
3246 (VIEW_CONVERT_EXPR,
3247 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3248 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3249 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3250 gimple_assign_set_lhs (new_stmt, induc_def);
3251 si = gsi_start_bb (bb);
3252 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3253 set_vinfo_for_stmt (new_stmt,
3254 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3255 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3256 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3257 }
3258
3259 return induc_def;
3260 }
3261
3262
3263 /* Function get_initial_def_for_reduction
3264
3265 Input:
3266 STMT - a stmt that performs a reduction operation in the loop.
3267 INIT_VAL - the initial value of the reduction variable
3268
3269 Output:
3270 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3271 of the reduction (used for adjusting the epilog - see below).
3272 Return a vector variable, initialized according to the operation that STMT
3273 performs. This vector will be used as the initial value of the
3274 vector of partial results.
3275
3276 Option1 (adjust in epilog): Initialize the vector as follows:
3277 add/bit or/xor: [0,0,...,0,0]
3278 mult/bit and: [1,1,...,1,1]
3279 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3280 and when necessary (e.g. add/mult case) let the caller know
3281 that it needs to adjust the result by init_val.
3282
3283 Option2: Initialize the vector as follows:
3284 add/bit or/xor: [init_val,0,0,...,0]
3285 mult/bit and: [init_val,1,1,...,1]
3286 min/max/cond_expr: [init_val,init_val,...,init_val]
3287 and no adjustments are needed.
3288
3289 For example, for the following code:
3290
3291 s = init_val;
3292 for (i=0;i<n;i++)
3293 s = s + a[i];
3294
3295 STMT is 's = s + a[i]', and the reduction variable is 's'.
3296 For a vector of 4 units, we want to return either [0,0,0,init_val],
3297 or [0,0,0,0] and let the caller know that it needs to adjust
3298 the result at the end by 'init_val'.
3299
3300 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3301 initialization vector is simpler (same element in all entries), if
3302 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3303
3304 A cost model should help decide between these two schemes. */
3305
3306 tree
3307 get_initial_def_for_reduction (gimple stmt, tree init_val,
3308 tree *adjustment_def)
3309 {
3310 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3311 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3312 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3313 tree scalar_type = TREE_TYPE (init_val);
3314 tree vectype = get_vectype_for_scalar_type (scalar_type);
3315 int nunits;
3316 enum tree_code code = gimple_assign_rhs_code (stmt);
3317 tree def_for_init;
3318 tree init_def;
3319 tree *elts;
3320 int i;
3321 bool nested_in_vect_loop = false;
3322 tree init_value;
3323 REAL_VALUE_TYPE real_init_val = dconst0;
3324 int int_init_val = 0;
3325 gimple def_stmt = NULL;
3326
3327 gcc_assert (vectype);
3328 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3329
3330 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3331 || SCALAR_FLOAT_TYPE_P (scalar_type));
3332
3333 if (nested_in_vect_loop_p (loop, stmt))
3334 nested_in_vect_loop = true;
3335 else
3336 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3337
3338 /* In case of double reduction we only create a vector variable to be put
3339 in the reduction phi node. The actual statement creation is done in
3340 vect_create_epilog_for_reduction. */
3341 if (adjustment_def && nested_in_vect_loop
3342 && TREE_CODE (init_val) == SSA_NAME
3343 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3344 && gimple_code (def_stmt) == GIMPLE_PHI
3345 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3346 && vinfo_for_stmt (def_stmt)
3347 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3348 == vect_double_reduction_def)
3349 {
3350 *adjustment_def = NULL;
3351 return vect_create_destination_var (init_val, vectype);
3352 }
3353
3354 if (TREE_CONSTANT (init_val))
3355 {
3356 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3357 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3358 else
3359 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3360 }
3361 else
3362 init_value = init_val;
3363
3364 switch (code)
3365 {
3366 case WIDEN_SUM_EXPR:
3367 case DOT_PROD_EXPR:
3368 case PLUS_EXPR:
3369 case MINUS_EXPR:
3370 case BIT_IOR_EXPR:
3371 case BIT_XOR_EXPR:
3372 case MULT_EXPR:
3373 case BIT_AND_EXPR:
3374 /* ADJUSMENT_DEF is NULL when called from
3375 vect_create_epilog_for_reduction to vectorize double reduction. */
3376 if (adjustment_def)
3377 {
3378 if (nested_in_vect_loop)
3379 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3380 NULL);
3381 else
3382 *adjustment_def = init_val;
3383 }
3384
3385 if (code == MULT_EXPR)
3386 {
3387 real_init_val = dconst1;
3388 int_init_val = 1;
3389 }
3390
3391 if (code == BIT_AND_EXPR)
3392 int_init_val = -1;
3393
3394 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3395 def_for_init = build_real (scalar_type, real_init_val);
3396 else
3397 def_for_init = build_int_cst (scalar_type, int_init_val);
3398
3399 /* Create a vector of '0' or '1' except the first element. */
3400 elts = XALLOCAVEC (tree, nunits);
3401 for (i = nunits - 2; i >= 0; --i)
3402 elts[i + 1] = def_for_init;
3403
3404 /* Option1: the first element is '0' or '1' as well. */
3405 if (adjustment_def)
3406 {
3407 elts[0] = def_for_init;
3408 init_def = build_vector (vectype, elts);
3409 break;
3410 }
3411
3412 /* Option2: the first element is INIT_VAL. */
3413 elts[0] = init_val;
3414 if (TREE_CONSTANT (init_val))
3415 init_def = build_vector (vectype, elts);
3416 else
3417 {
3418 VEC(constructor_elt,gc) *v;
3419 v = VEC_alloc (constructor_elt, gc, nunits);
3420 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3421 for (i = 1; i < nunits; ++i)
3422 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3423 init_def = build_constructor (vectype, v);
3424 }
3425
3426 break;
3427
3428 case MIN_EXPR:
3429 case MAX_EXPR:
3430 case COND_EXPR:
3431 if (adjustment_def)
3432 {
3433 *adjustment_def = NULL_TREE;
3434 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3435 break;
3436 }
3437
3438 init_def = build_vector_from_val (vectype, init_value);
3439 break;
3440
3441 default:
3442 gcc_unreachable ();
3443 }
3444
3445 return init_def;
3446 }
3447
3448
3449 /* Function vect_create_epilog_for_reduction
3450
3451 Create code at the loop-epilog to finalize the result of a reduction
3452 computation.
3453
3454 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3455 reduction statements.
3456 STMT is the scalar reduction stmt that is being vectorized.
3457 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3458 number of elements that we can fit in a vectype (nunits). In this case
3459 we have to generate more than one vector stmt - i.e - we need to "unroll"
3460 the vector stmt by a factor VF/nunits. For more details see documentation
3461 in vectorizable_operation.
3462 REDUC_CODE is the tree-code for the epilog reduction.
3463 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3464 computation.
3465 REDUC_INDEX is the index of the operand in the right hand side of the
3466 statement that is defined by REDUCTION_PHI.
3467 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3468 SLP_NODE is an SLP node containing a group of reduction statements. The
3469 first one in this group is STMT.
3470
3471 This function:
3472 1. Creates the reduction def-use cycles: sets the arguments for
3473 REDUCTION_PHIS:
3474 The loop-entry argument is the vectorized initial-value of the reduction.
3475 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3476 sums.
3477 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3478 by applying the operation specified by REDUC_CODE if available, or by
3479 other means (whole-vector shifts or a scalar loop).
3480 The function also creates a new phi node at the loop exit to preserve
3481 loop-closed form, as illustrated below.
3482
3483 The flow at the entry to this function:
3484
3485 loop:
3486 vec_def = phi <null, null> # REDUCTION_PHI
3487 VECT_DEF = vector_stmt # vectorized form of STMT
3488 s_loop = scalar_stmt # (scalar) STMT
3489 loop_exit:
3490 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3491 use <s_out0>
3492 use <s_out0>
3493
3494 The above is transformed by this function into:
3495
3496 loop:
3497 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3498 VECT_DEF = vector_stmt # vectorized form of STMT
3499 s_loop = scalar_stmt # (scalar) STMT
3500 loop_exit:
3501 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3502 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3503 v_out2 = reduce <v_out1>
3504 s_out3 = extract_field <v_out2, 0>
3505 s_out4 = adjust_result <s_out3>
3506 use <s_out4>
3507 use <s_out4>
3508 */
3509
3510 static void
3511 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3512 int ncopies, enum tree_code reduc_code,
3513 VEC (gimple, heap) *reduction_phis,
3514 int reduc_index, bool double_reduc,
3515 slp_tree slp_node)
3516 {
3517 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3518 stmt_vec_info prev_phi_info;
3519 tree vectype;
3520 enum machine_mode mode;
3521 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3522 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3523 basic_block exit_bb;
3524 tree scalar_dest;
3525 tree scalar_type;
3526 gimple new_phi = NULL, phi;
3527 gimple_stmt_iterator exit_gsi;
3528 tree vec_dest;
3529 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3530 gimple epilog_stmt = NULL;
3531 enum tree_code code = gimple_assign_rhs_code (stmt);
3532 gimple exit_phi;
3533 tree bitsize, bitpos;
3534 tree adjustment_def = NULL;
3535 tree vec_initial_def = NULL;
3536 tree reduction_op, expr, def;
3537 tree orig_name, scalar_result;
3538 imm_use_iterator imm_iter, phi_imm_iter;
3539 use_operand_p use_p, phi_use_p;
3540 bool extract_scalar_result = false;
3541 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3542 bool nested_in_vect_loop = false;
3543 VEC (gimple, heap) *new_phis = NULL;
3544 VEC (gimple, heap) *inner_phis = NULL;
3545 enum vect_def_type dt = vect_unknown_def_type;
3546 int j, i;
3547 VEC (tree, heap) *scalar_results = NULL;
3548 unsigned int group_size = 1, k, ratio;
3549 VEC (tree, heap) *vec_initial_defs = NULL;
3550 VEC (gimple, heap) *phis;
3551 bool slp_reduc = false;
3552 tree new_phi_result;
3553 gimple inner_phi = NULL;
3554
3555 if (slp_node)
3556 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3557
3558 if (nested_in_vect_loop_p (loop, stmt))
3559 {
3560 outer_loop = loop;
3561 loop = loop->inner;
3562 nested_in_vect_loop = true;
3563 gcc_assert (!slp_node);
3564 }
3565
3566 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3567 {
3568 case GIMPLE_SINGLE_RHS:
3569 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3570 == ternary_op);
3571 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3572 break;
3573 case GIMPLE_UNARY_RHS:
3574 reduction_op = gimple_assign_rhs1 (stmt);
3575 break;
3576 case GIMPLE_BINARY_RHS:
3577 reduction_op = reduc_index ?
3578 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3579 break;
3580 case GIMPLE_TERNARY_RHS:
3581 reduction_op = gimple_op (stmt, reduc_index + 1);
3582 break;
3583 default:
3584 gcc_unreachable ();
3585 }
3586
3587 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3588 gcc_assert (vectype);
3589 mode = TYPE_MODE (vectype);
3590
3591 /* 1. Create the reduction def-use cycle:
3592 Set the arguments of REDUCTION_PHIS, i.e., transform
3593
3594 loop:
3595 vec_def = phi <null, null> # REDUCTION_PHI
3596 VECT_DEF = vector_stmt # vectorized form of STMT
3597 ...
3598
3599 into:
3600
3601 loop:
3602 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3603 VECT_DEF = vector_stmt # vectorized form of STMT
3604 ...
3605
3606 (in case of SLP, do it for all the phis). */
3607
3608 /* Get the loop-entry arguments. */
3609 if (slp_node)
3610 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3611 NULL, slp_node, reduc_index);
3612 else
3613 {
3614 vec_initial_defs = VEC_alloc (tree, heap, 1);
3615 /* For the case of reduction, vect_get_vec_def_for_operand returns
3616 the scalar def before the loop, that defines the initial value
3617 of the reduction variable. */
3618 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3619 &adjustment_def);
3620 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3621 }
3622
3623 /* Set phi nodes arguments. */
3624 FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3625 {
3626 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3627 tree def = VEC_index (tree, vect_defs, i);
3628 for (j = 0; j < ncopies; j++)
3629 {
3630 /* Set the loop-entry arg of the reduction-phi. */
3631 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3632 UNKNOWN_LOCATION);
3633
3634 /* Set the loop-latch arg for the reduction-phi. */
3635 if (j > 0)
3636 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3637
3638 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3639
3640 if (vect_print_dump_info (REPORT_DETAILS))
3641 {
3642 fprintf (vect_dump, "transform reduction: created def-use"
3643 " cycle: ");
3644 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3645 fprintf (vect_dump, "\n");
3646 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3647 TDF_SLIM);
3648 }
3649
3650 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3651 }
3652 }
3653
3654 VEC_free (tree, heap, vec_initial_defs);
3655
3656 /* 2. Create epilog code.
3657 The reduction epilog code operates across the elements of the vector
3658 of partial results computed by the vectorized loop.
3659 The reduction epilog code consists of:
3660
3661 step 1: compute the scalar result in a vector (v_out2)
3662 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3663 step 3: adjust the scalar result (s_out3) if needed.
3664
3665 Step 1 can be accomplished using one the following three schemes:
3666 (scheme 1) using reduc_code, if available.
3667 (scheme 2) using whole-vector shifts, if available.
3668 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3669 combined.
3670
3671 The overall epilog code looks like this:
3672
3673 s_out0 = phi <s_loop> # original EXIT_PHI
3674 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3675 v_out2 = reduce <v_out1> # step 1
3676 s_out3 = extract_field <v_out2, 0> # step 2
3677 s_out4 = adjust_result <s_out3> # step 3
3678
3679 (step 3 is optional, and steps 1 and 2 may be combined).
3680 Lastly, the uses of s_out0 are replaced by s_out4. */
3681
3682
3683 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3684 v_out1 = phi <VECT_DEF>
3685 Store them in NEW_PHIS. */
3686
3687 exit_bb = single_exit (loop)->dest;
3688 prev_phi_info = NULL;
3689 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3690 FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3691 {
3692 for (j = 0; j < ncopies; j++)
3693 {
3694 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3695 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3696 if (j == 0)
3697 VEC_quick_push (gimple, new_phis, phi);
3698 else
3699 {
3700 def = vect_get_vec_def_for_stmt_copy (dt, def);
3701 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3702 }
3703
3704 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3705 prev_phi_info = vinfo_for_stmt (phi);
3706 }
3707 }
3708
3709 /* The epilogue is created for the outer-loop, i.e., for the loop being
3710 vectorized. Create exit phis for the outer loop. */
3711 if (double_reduc)
3712 {
3713 loop = outer_loop;
3714 exit_bb = single_exit (loop)->dest;
3715 inner_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3716 FOR_EACH_VEC_ELT (gimple, new_phis, i, phi)
3717 {
3718 gimple outer_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (phi)),
3719 exit_bb);
3720 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3721 PHI_RESULT (phi));
3722 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3723 loop_vinfo, NULL));
3724 VEC_quick_push (gimple, inner_phis, phi);
3725 VEC_replace (gimple, new_phis, i, outer_phi);
3726 prev_phi_info = vinfo_for_stmt (outer_phi);
3727 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3728 {
3729 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3730 outer_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (phi)),
3731 exit_bb);
3732 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3733 PHI_RESULT (phi));
3734 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3735 loop_vinfo, NULL));
3736 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3737 prev_phi_info = vinfo_for_stmt (outer_phi);
3738 }
3739 }
3740 }
3741
3742 exit_gsi = gsi_after_labels (exit_bb);
3743
3744 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3745 (i.e. when reduc_code is not available) and in the final adjustment
3746 code (if needed). Also get the original scalar reduction variable as
3747 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3748 represents a reduction pattern), the tree-code and scalar-def are
3749 taken from the original stmt that the pattern-stmt (STMT) replaces.
3750 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3751 are taken from STMT. */
3752
3753 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3754 if (!orig_stmt)
3755 {
3756 /* Regular reduction */
3757 orig_stmt = stmt;
3758 }
3759 else
3760 {
3761 /* Reduction pattern */
3762 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3763 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3764 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3765 }
3766
3767 code = gimple_assign_rhs_code (orig_stmt);
3768 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3769 partial results are added and not subtracted. */
3770 if (code == MINUS_EXPR)
3771 code = PLUS_EXPR;
3772
3773 scalar_dest = gimple_assign_lhs (orig_stmt);
3774 scalar_type = TREE_TYPE (scalar_dest);
3775 scalar_results = VEC_alloc (tree, heap, group_size);
3776 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3777 bitsize = TYPE_SIZE (scalar_type);
3778
3779 /* In case this is a reduction in an inner-loop while vectorizing an outer
3780 loop - we don't need to extract a single scalar result at the end of the
3781 inner-loop (unless it is double reduction, i.e., the use of reduction is
3782 outside the outer-loop). The final vector of partial results will be used
3783 in the vectorized outer-loop, or reduced to a scalar result at the end of
3784 the outer-loop. */
3785 if (nested_in_vect_loop && !double_reduc)
3786 goto vect_finalize_reduction;
3787
3788 /* SLP reduction without reduction chain, e.g.,
3789 # a1 = phi <a2, a0>
3790 # b1 = phi <b2, b0>
3791 a2 = operation (a1)
3792 b2 = operation (b1) */
3793 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3794
3795 /* In case of reduction chain, e.g.,
3796 # a1 = phi <a3, a0>
3797 a2 = operation (a1)
3798 a3 = operation (a2),
3799
3800 we may end up with more than one vector result. Here we reduce them to
3801 one vector. */
3802 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3803 {
3804 tree first_vect = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3805 tree tmp;
3806 gimple new_vec_stmt = NULL;
3807
3808 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3809 for (k = 1; k < VEC_length (gimple, new_phis); k++)
3810 {
3811 gimple next_phi = VEC_index (gimple, new_phis, k);
3812 tree second_vect = PHI_RESULT (next_phi);
3813
3814 tmp = build2 (code, vectype, first_vect, second_vect);
3815 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3816 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3817 gimple_assign_set_lhs (new_vec_stmt, first_vect);
3818 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3819 }
3820
3821 new_phi_result = first_vect;
3822 if (new_vec_stmt)
3823 {
3824 VEC_truncate (gimple, new_phis, 0);
3825 VEC_safe_push (gimple, heap, new_phis, new_vec_stmt);
3826 }
3827 }
3828 else
3829 new_phi_result = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3830
3831 /* 2.3 Create the reduction code, using one of the three schemes described
3832 above. In SLP we simply need to extract all the elements from the
3833 vector (without reducing them), so we use scalar shifts. */
3834 if (reduc_code != ERROR_MARK && !slp_reduc)
3835 {
3836 tree tmp;
3837
3838 /*** Case 1: Create:
3839 v_out2 = reduc_expr <v_out1> */
3840
3841 if (vect_print_dump_info (REPORT_DETAILS))
3842 fprintf (vect_dump, "Reduce using direct vector reduction.");
3843
3844 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3845 tmp = build1 (reduc_code, vectype, new_phi_result);
3846 epilog_stmt = gimple_build_assign (vec_dest, tmp);
3847 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3848 gimple_assign_set_lhs (epilog_stmt, new_temp);
3849 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3850
3851 extract_scalar_result = true;
3852 }
3853 else
3854 {
3855 enum tree_code shift_code = ERROR_MARK;
3856 bool have_whole_vector_shift = true;
3857 int bit_offset;
3858 int element_bitsize = tree_low_cst (bitsize, 1);
3859 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3860 tree vec_temp;
3861
3862 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3863 shift_code = VEC_RSHIFT_EXPR;
3864 else
3865 have_whole_vector_shift = false;
3866
3867 /* Regardless of whether we have a whole vector shift, if we're
3868 emulating the operation via tree-vect-generic, we don't want
3869 to use it. Only the first round of the reduction is likely
3870 to still be profitable via emulation. */
3871 /* ??? It might be better to emit a reduction tree code here, so that
3872 tree-vect-generic can expand the first round via bit tricks. */
3873 if (!VECTOR_MODE_P (mode))
3874 have_whole_vector_shift = false;
3875 else
3876 {
3877 optab optab = optab_for_tree_code (code, vectype, optab_default);
3878 if (optab_handler (optab, mode) == CODE_FOR_nothing)
3879 have_whole_vector_shift = false;
3880 }
3881
3882 if (have_whole_vector_shift && !slp_reduc)
3883 {
3884 /*** Case 2: Create:
3885 for (offset = VS/2; offset >= element_size; offset/=2)
3886 {
3887 Create: va' = vec_shift <va, offset>
3888 Create: va = vop <va, va'>
3889 } */
3890
3891 if (vect_print_dump_info (REPORT_DETAILS))
3892 fprintf (vect_dump, "Reduce using vector shifts");
3893
3894 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3895 new_temp = new_phi_result;
3896 for (bit_offset = vec_size_in_bits/2;
3897 bit_offset >= element_bitsize;
3898 bit_offset /= 2)
3899 {
3900 tree bitpos = size_int (bit_offset);
3901
3902 epilog_stmt = gimple_build_assign_with_ops (shift_code,
3903 vec_dest, new_temp, bitpos);
3904 new_name = make_ssa_name (vec_dest, epilog_stmt);
3905 gimple_assign_set_lhs (epilog_stmt, new_name);
3906 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3907
3908 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3909 new_name, new_temp);
3910 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3911 gimple_assign_set_lhs (epilog_stmt, new_temp);
3912 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3913 }
3914
3915 extract_scalar_result = true;
3916 }
3917 else
3918 {
3919 tree rhs;
3920
3921 /*** Case 3: Create:
3922 s = extract_field <v_out2, 0>
3923 for (offset = element_size;
3924 offset < vector_size;
3925 offset += element_size;)
3926 {
3927 Create: s' = extract_field <v_out2, offset>
3928 Create: s = op <s, s'> // For non SLP cases
3929 } */
3930
3931 if (vect_print_dump_info (REPORT_DETAILS))
3932 fprintf (vect_dump, "Reduce using scalar code. ");
3933
3934 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3935 FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
3936 {
3937 if (gimple_code (new_phi) == GIMPLE_PHI)
3938 vec_temp = PHI_RESULT (new_phi);
3939 else
3940 vec_temp = gimple_assign_lhs (new_phi);
3941 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3942 bitsize_zero_node);
3943 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3944 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3945 gimple_assign_set_lhs (epilog_stmt, new_temp);
3946 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3947
3948 /* In SLP we don't need to apply reduction operation, so we just
3949 collect s' values in SCALAR_RESULTS. */
3950 if (slp_reduc)
3951 VEC_safe_push (tree, heap, scalar_results, new_temp);
3952
3953 for (bit_offset = element_bitsize;
3954 bit_offset < vec_size_in_bits;
3955 bit_offset += element_bitsize)
3956 {
3957 tree bitpos = bitsize_int (bit_offset);
3958 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3959 bitsize, bitpos);
3960
3961 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3962 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3963 gimple_assign_set_lhs (epilog_stmt, new_name);
3964 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3965
3966 if (slp_reduc)
3967 {
3968 /* In SLP we don't need to apply reduction operation, so
3969 we just collect s' values in SCALAR_RESULTS. */
3970 new_temp = new_name;
3971 VEC_safe_push (tree, heap, scalar_results, new_name);
3972 }
3973 else
3974 {
3975 epilog_stmt = gimple_build_assign_with_ops (code,
3976 new_scalar_dest, new_name, new_temp);
3977 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3978 gimple_assign_set_lhs (epilog_stmt, new_temp);
3979 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3980 }
3981 }
3982 }
3983
3984 /* The only case where we need to reduce scalar results in SLP, is
3985 unrolling. If the size of SCALAR_RESULTS is greater than
3986 GROUP_SIZE, we reduce them combining elements modulo
3987 GROUP_SIZE. */
3988 if (slp_reduc)
3989 {
3990 tree res, first_res, new_res;
3991 gimple new_stmt;
3992
3993 /* Reduce multiple scalar results in case of SLP unrolling. */
3994 for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3995 j++)
3996 {
3997 first_res = VEC_index (tree, scalar_results, j % group_size);
3998 new_stmt = gimple_build_assign_with_ops (code,
3999 new_scalar_dest, first_res, res);
4000 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4001 gimple_assign_set_lhs (new_stmt, new_res);
4002 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4003 VEC_replace (tree, scalar_results, j % group_size, new_res);
4004 }
4005 }
4006 else
4007 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4008 VEC_safe_push (tree, heap, scalar_results, new_temp);
4009
4010 extract_scalar_result = false;
4011 }
4012 }
4013
4014 /* 2.4 Extract the final scalar result. Create:
4015 s_out3 = extract_field <v_out2, bitpos> */
4016
4017 if (extract_scalar_result)
4018 {
4019 tree rhs;
4020
4021 if (vect_print_dump_info (REPORT_DETAILS))
4022 fprintf (vect_dump, "extract scalar result");
4023
4024 if (BYTES_BIG_ENDIAN)
4025 bitpos = size_binop (MULT_EXPR,
4026 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4027 TYPE_SIZE (scalar_type));
4028 else
4029 bitpos = bitsize_zero_node;
4030
4031 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4032 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4033 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4034 gimple_assign_set_lhs (epilog_stmt, new_temp);
4035 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4036 VEC_safe_push (tree, heap, scalar_results, new_temp);
4037 }
4038
4039 vect_finalize_reduction:
4040
4041 if (double_reduc)
4042 loop = loop->inner;
4043
4044 /* 2.5 Adjust the final result by the initial value of the reduction
4045 variable. (When such adjustment is not needed, then
4046 'adjustment_def' is zero). For example, if code is PLUS we create:
4047 new_temp = loop_exit_def + adjustment_def */
4048
4049 if (adjustment_def)
4050 {
4051 gcc_assert (!slp_reduc);
4052 if (nested_in_vect_loop)
4053 {
4054 new_phi = VEC_index (gimple, new_phis, 0);
4055 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4056 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4057 new_dest = vect_create_destination_var (scalar_dest, vectype);
4058 }
4059 else
4060 {
4061 new_temp = VEC_index (tree, scalar_results, 0);
4062 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4063 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4064 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4065 }
4066
4067 epilog_stmt = gimple_build_assign (new_dest, expr);
4068 new_temp = make_ssa_name (new_dest, epilog_stmt);
4069 gimple_assign_set_lhs (epilog_stmt, new_temp);
4070 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
4071 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4072 if (nested_in_vect_loop)
4073 {
4074 set_vinfo_for_stmt (epilog_stmt,
4075 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4076 NULL));
4077 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4078 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4079
4080 if (!double_reduc)
4081 VEC_quick_push (tree, scalar_results, new_temp);
4082 else
4083 VEC_replace (tree, scalar_results, 0, new_temp);
4084 }
4085 else
4086 VEC_replace (tree, scalar_results, 0, new_temp);
4087
4088 VEC_replace (gimple, new_phis, 0, epilog_stmt);
4089 }
4090
4091 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4092 phis with new adjusted scalar results, i.e., replace use <s_out0>
4093 with use <s_out4>.
4094
4095 Transform:
4096 loop_exit:
4097 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4098 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4099 v_out2 = reduce <v_out1>
4100 s_out3 = extract_field <v_out2, 0>
4101 s_out4 = adjust_result <s_out3>
4102 use <s_out0>
4103 use <s_out0>
4104
4105 into:
4106
4107 loop_exit:
4108 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4109 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4110 v_out2 = reduce <v_out1>
4111 s_out3 = extract_field <v_out2, 0>
4112 s_out4 = adjust_result <s_out3>
4113 use <s_out4>
4114 use <s_out4> */
4115
4116
4117 /* In SLP reduction chain we reduce vector results into one vector if
4118 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
4119 the last stmt in the reduction chain, since we are looking for the loop
4120 exit phi node. */
4121 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4122 {
4123 scalar_dest = gimple_assign_lhs (VEC_index (gimple,
4124 SLP_TREE_SCALAR_STMTS (slp_node),
4125 group_size - 1));
4126 group_size = 1;
4127 }
4128
4129 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4130 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
4131 need to match SCALAR_RESULTS with corresponding statements. The first
4132 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4133 the first vector stmt, etc.
4134 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
4135 if (group_size > VEC_length (gimple, new_phis))
4136 {
4137 ratio = group_size / VEC_length (gimple, new_phis);
4138 gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
4139 }
4140 else
4141 ratio = 1;
4142
4143 for (k = 0; k < group_size; k++)
4144 {
4145 if (k % ratio == 0)
4146 {
4147 epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
4148 reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
4149 if (double_reduc)
4150 inner_phi = VEC_index (gimple, inner_phis, k / ratio);
4151 }
4152
4153 if (slp_reduc)
4154 {
4155 gimple current_stmt = VEC_index (gimple,
4156 SLP_TREE_SCALAR_STMTS (slp_node), k);
4157
4158 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4159 /* SLP statements can't participate in patterns. */
4160 gcc_assert (!orig_stmt);
4161 scalar_dest = gimple_assign_lhs (current_stmt);
4162 }
4163
4164 phis = VEC_alloc (gimple, heap, 3);
4165 /* Find the loop-closed-use at the loop exit of the original scalar
4166 result. (The reduction result is expected to have two immediate uses -
4167 one at the latch block, and one at the loop exit). */
4168 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4169 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4170 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4171
4172 /* We expect to have found an exit_phi because of loop-closed-ssa
4173 form. */
4174 gcc_assert (!VEC_empty (gimple, phis));
4175
4176 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4177 {
4178 if (outer_loop)
4179 {
4180 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4181 gimple vect_phi;
4182
4183 /* FORNOW. Currently not supporting the case that an inner-loop
4184 reduction is not used in the outer-loop (but only outside the
4185 outer-loop), unless it is double reduction. */
4186 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4187 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4188 || double_reduc);
4189
4190 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4191 if (!double_reduc
4192 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4193 != vect_double_reduction_def)
4194 continue;
4195
4196 /* Handle double reduction:
4197
4198 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4199 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4200 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4201 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4202
4203 At that point the regular reduction (stmt2 and stmt3) is
4204 already vectorized, as well as the exit phi node, stmt4.
4205 Here we vectorize the phi node of double reduction, stmt1, and
4206 update all relevant statements. */
4207
4208 /* Go through all the uses of s2 to find double reduction phi
4209 node, i.e., stmt1 above. */
4210 orig_name = PHI_RESULT (exit_phi);
4211 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4212 {
4213 stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4214 stmt_vec_info new_phi_vinfo;
4215 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4216 basic_block bb = gimple_bb (use_stmt);
4217 gimple use;
4218
4219 /* Check that USE_STMT is really double reduction phi
4220 node. */
4221 if (gimple_code (use_stmt) != GIMPLE_PHI
4222 || gimple_phi_num_args (use_stmt) != 2
4223 || !use_stmt_vinfo
4224 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4225 != vect_double_reduction_def
4226 || bb->loop_father != outer_loop)
4227 continue;
4228
4229 /* Create vector phi node for double reduction:
4230 vs1 = phi <vs0, vs2>
4231 vs1 was created previously in this function by a call to
4232 vect_get_vec_def_for_operand and is stored in
4233 vec_initial_def;
4234 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4235 vs0 is created here. */
4236
4237 /* Create vector phi node. */
4238 vect_phi = create_phi_node (vec_initial_def, bb);
4239 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4240 loop_vec_info_for_loop (outer_loop), NULL);
4241 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4242
4243 /* Create vs0 - initial def of the double reduction phi. */
4244 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4245 loop_preheader_edge (outer_loop));
4246 init_def = get_initial_def_for_reduction (stmt,
4247 preheader_arg, NULL);
4248 vect_phi_init = vect_init_vector (use_stmt, init_def,
4249 vectype, NULL);
4250
4251 /* Update phi node arguments with vs0 and vs2. */
4252 add_phi_arg (vect_phi, vect_phi_init,
4253 loop_preheader_edge (outer_loop),
4254 UNKNOWN_LOCATION);
4255 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4256 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4257 if (vect_print_dump_info (REPORT_DETAILS))
4258 {
4259 fprintf (vect_dump, "created double reduction phi "
4260 "node: ");
4261 print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
4262 }
4263
4264 vect_phi_res = PHI_RESULT (vect_phi);
4265
4266 /* Replace the use, i.e., set the correct vs1 in the regular
4267 reduction phi node. FORNOW, NCOPIES is always 1, so the
4268 loop is redundant. */
4269 use = reduction_phi;
4270 for (j = 0; j < ncopies; j++)
4271 {
4272 edge pr_edge = loop_preheader_edge (loop);
4273 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4274 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4275 }
4276 }
4277 }
4278 }
4279
4280 VEC_free (gimple, heap, phis);
4281 if (nested_in_vect_loop)
4282 {
4283 if (double_reduc)
4284 loop = outer_loop;
4285 else
4286 continue;
4287 }
4288
4289 phis = VEC_alloc (gimple, heap, 3);
4290 /* Find the loop-closed-use at the loop exit of the original scalar
4291 result. (The reduction result is expected to have two immediate uses,
4292 one at the latch block, and one at the loop exit). For double
4293 reductions we are looking for exit phis of the outer loop. */
4294 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4295 {
4296 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4297 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4298 else
4299 {
4300 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4301 {
4302 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4303
4304 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4305 {
4306 if (!flow_bb_inside_loop_p (loop,
4307 gimple_bb (USE_STMT (phi_use_p))))
4308 VEC_safe_push (gimple, heap, phis,
4309 USE_STMT (phi_use_p));
4310 }
4311 }
4312 }
4313 }
4314
4315 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4316 {
4317 /* Replace the uses: */
4318 orig_name = PHI_RESULT (exit_phi);
4319 scalar_result = VEC_index (tree, scalar_results, k);
4320 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4321 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4322 SET_USE (use_p, scalar_result);
4323 }
4324
4325 VEC_free (gimple, heap, phis);
4326 }
4327
4328 VEC_free (tree, heap, scalar_results);
4329 VEC_free (gimple, heap, new_phis);
4330 }
4331
4332
4333 /* Function vectorizable_reduction.
4334
4335 Check if STMT performs a reduction operation that can be vectorized.
4336 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4337 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4338 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4339
4340 This function also handles reduction idioms (patterns) that have been
4341 recognized in advance during vect_pattern_recog. In this case, STMT may be
4342 of this form:
4343 X = pattern_expr (arg0, arg1, ..., X)
4344 and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
4345 sequence that had been detected and replaced by the pattern-stmt (STMT).
4346
4347 In some cases of reduction patterns, the type of the reduction variable X is
4348 different than the type of the other arguments of STMT.
4349 In such cases, the vectype that is used when transforming STMT into a vector
4350 stmt is different than the vectype that is used to determine the
4351 vectorization factor, because it consists of a different number of elements
4352 than the actual number of elements that are being operated upon in parallel.
4353
4354 For example, consider an accumulation of shorts into an int accumulator.
4355 On some targets it's possible to vectorize this pattern operating on 8
4356 shorts at a time (hence, the vectype for purposes of determining the
4357 vectorization factor should be V8HI); on the other hand, the vectype that
4358 is used to create the vector form is actually V4SI (the type of the result).
4359
4360 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4361 indicates what is the actual level of parallelism (V8HI in the example), so
4362 that the right vectorization factor would be derived. This vectype
4363 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4364 be used to create the vectorized stmt. The right vectype for the vectorized
4365 stmt is obtained from the type of the result X:
4366 get_vectype_for_scalar_type (TREE_TYPE (X))
4367
4368 This means that, contrary to "regular" reductions (or "regular" stmts in
4369 general), the following equation:
4370 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4371 does *NOT* necessarily hold for reduction patterns. */
4372
4373 bool
4374 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4375 gimple *vec_stmt, slp_tree slp_node)
4376 {
4377 tree vec_dest;
4378 tree scalar_dest;
4379 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4380 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4381 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4382 tree vectype_in = NULL_TREE;
4383 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4384 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4385 enum tree_code code, orig_code, epilog_reduc_code;
4386 enum machine_mode vec_mode;
4387 int op_type;
4388 optab optab, reduc_optab;
4389 tree new_temp = NULL_TREE;
4390 tree def;
4391 gimple def_stmt;
4392 enum vect_def_type dt;
4393 gimple new_phi = NULL;
4394 tree scalar_type;
4395 bool is_simple_use;
4396 gimple orig_stmt;
4397 stmt_vec_info orig_stmt_info;
4398 tree expr = NULL_TREE;
4399 int i;
4400 int ncopies;
4401 int epilog_copies;
4402 stmt_vec_info prev_stmt_info, prev_phi_info;
4403 bool single_defuse_cycle = false;
4404 tree reduc_def = NULL_TREE;
4405 gimple new_stmt = NULL;
4406 int j;
4407 tree ops[3];
4408 bool nested_cycle = false, found_nested_cycle_def = false;
4409 gimple reduc_def_stmt = NULL;
4410 /* The default is that the reduction variable is the last in statement. */
4411 int reduc_index = 2;
4412 bool double_reduc = false, dummy;
4413 basic_block def_bb;
4414 struct loop * def_stmt_loop, *outer_loop = NULL;
4415 tree def_arg;
4416 gimple def_arg_stmt;
4417 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
4418 VEC (gimple, heap) *phis = NULL;
4419 int vec_num;
4420 tree def0, def1, tem, op0, op1 = NULL_TREE;
4421
4422 /* In case of reduction chain we switch to the first stmt in the chain, but
4423 we don't update STMT_INFO, since only the last stmt is marked as reduction
4424 and has reduction properties. */
4425 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4426 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4427
4428 if (nested_in_vect_loop_p (loop, stmt))
4429 {
4430 outer_loop = loop;
4431 loop = loop->inner;
4432 nested_cycle = true;
4433 }
4434
4435 /* 1. Is vectorizable reduction? */
4436 /* Not supportable if the reduction variable is used in the loop, unless
4437 it's a reduction chain. */
4438 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4439 && !GROUP_FIRST_ELEMENT (stmt_info))
4440 return false;
4441
4442 /* Reductions that are not used even in an enclosing outer-loop,
4443 are expected to be "live" (used out of the loop). */
4444 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4445 && !STMT_VINFO_LIVE_P (stmt_info))
4446 return false;
4447
4448 /* Make sure it was already recognized as a reduction computation. */
4449 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4450 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4451 return false;
4452
4453 /* 2. Has this been recognized as a reduction pattern?
4454
4455 Check if STMT represents a pattern that has been recognized
4456 in earlier analysis stages. For stmts that represent a pattern,
4457 the STMT_VINFO_RELATED_STMT field records the last stmt in
4458 the original sequence that constitutes the pattern. */
4459
4460 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4461 if (orig_stmt)
4462 {
4463 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4464 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
4465 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4466 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4467 }
4468
4469 /* 3. Check the operands of the operation. The first operands are defined
4470 inside the loop body. The last operand is the reduction variable,
4471 which is defined by the loop-header-phi. */
4472
4473 gcc_assert (is_gimple_assign (stmt));
4474
4475 /* Flatten RHS. */
4476 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4477 {
4478 case GIMPLE_SINGLE_RHS:
4479 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4480 if (op_type == ternary_op)
4481 {
4482 tree rhs = gimple_assign_rhs1 (stmt);
4483 ops[0] = TREE_OPERAND (rhs, 0);
4484 ops[1] = TREE_OPERAND (rhs, 1);
4485 ops[2] = TREE_OPERAND (rhs, 2);
4486 code = TREE_CODE (rhs);
4487 }
4488 else
4489 return false;
4490 break;
4491
4492 case GIMPLE_BINARY_RHS:
4493 code = gimple_assign_rhs_code (stmt);
4494 op_type = TREE_CODE_LENGTH (code);
4495 gcc_assert (op_type == binary_op);
4496 ops[0] = gimple_assign_rhs1 (stmt);
4497 ops[1] = gimple_assign_rhs2 (stmt);
4498 break;
4499
4500 case GIMPLE_TERNARY_RHS:
4501 code = gimple_assign_rhs_code (stmt);
4502 op_type = TREE_CODE_LENGTH (code);
4503 gcc_assert (op_type == ternary_op);
4504 ops[0] = gimple_assign_rhs1 (stmt);
4505 ops[1] = gimple_assign_rhs2 (stmt);
4506 ops[2] = gimple_assign_rhs3 (stmt);
4507 break;
4508
4509 case GIMPLE_UNARY_RHS:
4510 return false;
4511
4512 default:
4513 gcc_unreachable ();
4514 }
4515
4516 if (code == COND_EXPR && slp_node)
4517 return false;
4518
4519 scalar_dest = gimple_assign_lhs (stmt);
4520 scalar_type = TREE_TYPE (scalar_dest);
4521 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4522 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4523 return false;
4524
4525 /* Do not try to vectorize bit-precision reductions. */
4526 if ((TYPE_PRECISION (scalar_type)
4527 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4528 return false;
4529
4530 /* All uses but the last are expected to be defined in the loop.
4531 The last use is the reduction variable. In case of nested cycle this
4532 assumption is not true: we use reduc_index to record the index of the
4533 reduction variable. */
4534 for (i = 0; i < op_type-1; i++)
4535 {
4536 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4537 if (i == 0 && code == COND_EXPR)
4538 continue;
4539
4540 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4541 &def_stmt, &def, &dt, &tem);
4542 if (!vectype_in)
4543 vectype_in = tem;
4544 gcc_assert (is_simple_use);
4545
4546 if (dt != vect_internal_def
4547 && dt != vect_external_def
4548 && dt != vect_constant_def
4549 && dt != vect_induction_def
4550 && !(dt == vect_nested_cycle && nested_cycle))
4551 return false;
4552
4553 if (dt == vect_nested_cycle)
4554 {
4555 found_nested_cycle_def = true;
4556 reduc_def_stmt = def_stmt;
4557 reduc_index = i;
4558 }
4559 }
4560
4561 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4562 &def_stmt, &def, &dt, &tem);
4563 if (!vectype_in)
4564 vectype_in = tem;
4565 gcc_assert (is_simple_use);
4566 gcc_assert (dt == vect_reduction_def
4567 || dt == vect_nested_cycle
4568 || ((dt == vect_internal_def || dt == vect_external_def
4569 || dt == vect_constant_def || dt == vect_induction_def)
4570 && nested_cycle && found_nested_cycle_def));
4571 if (!found_nested_cycle_def)
4572 reduc_def_stmt = def_stmt;
4573
4574 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4575 if (orig_stmt)
4576 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4577 reduc_def_stmt,
4578 !nested_cycle,
4579 &dummy));
4580 else
4581 {
4582 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4583 !nested_cycle, &dummy);
4584 /* We changed STMT to be the first stmt in reduction chain, hence we
4585 check that in this case the first element in the chain is STMT. */
4586 gcc_assert (stmt == tmp
4587 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4588 }
4589
4590 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4591 return false;
4592
4593 if (slp_node || PURE_SLP_STMT (stmt_info))
4594 ncopies = 1;
4595 else
4596 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4597 / TYPE_VECTOR_SUBPARTS (vectype_in));
4598
4599 gcc_assert (ncopies >= 1);
4600
4601 vec_mode = TYPE_MODE (vectype_in);
4602
4603 if (code == COND_EXPR)
4604 {
4605 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4606 {
4607 if (vect_print_dump_info (REPORT_DETAILS))
4608 fprintf (vect_dump, "unsupported condition in reduction");
4609
4610 return false;
4611 }
4612 }
4613 else
4614 {
4615 /* 4. Supportable by target? */
4616
4617 /* 4.1. check support for the operation in the loop */
4618 optab = optab_for_tree_code (code, vectype_in, optab_default);
4619 if (!optab)
4620 {
4621 if (vect_print_dump_info (REPORT_DETAILS))
4622 fprintf (vect_dump, "no optab.");
4623
4624 return false;
4625 }
4626
4627 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4628 {
4629 if (vect_print_dump_info (REPORT_DETAILS))
4630 fprintf (vect_dump, "op not supported by target.");
4631
4632 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4633 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4634 < vect_min_worthwhile_factor (code))
4635 return false;
4636
4637 if (vect_print_dump_info (REPORT_DETAILS))
4638 fprintf (vect_dump, "proceeding using word mode.");
4639 }
4640
4641 /* Worthwhile without SIMD support? */
4642 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4643 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4644 < vect_min_worthwhile_factor (code))
4645 {
4646 if (vect_print_dump_info (REPORT_DETAILS))
4647 fprintf (vect_dump, "not worthwhile without SIMD support.");
4648
4649 return false;
4650 }
4651 }
4652
4653 /* 4.2. Check support for the epilog operation.
4654
4655 If STMT represents a reduction pattern, then the type of the
4656 reduction variable may be different than the type of the rest
4657 of the arguments. For example, consider the case of accumulation
4658 of shorts into an int accumulator; The original code:
4659 S1: int_a = (int) short_a;
4660 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4661
4662 was replaced with:
4663 STMT: int_acc = widen_sum <short_a, int_acc>
4664
4665 This means that:
4666 1. The tree-code that is used to create the vector operation in the
4667 epilog code (that reduces the partial results) is not the
4668 tree-code of STMT, but is rather the tree-code of the original
4669 stmt from the pattern that STMT is replacing. I.e, in the example
4670 above we want to use 'widen_sum' in the loop, but 'plus' in the
4671 epilog.
4672 2. The type (mode) we use to check available target support
4673 for the vector operation to be created in the *epilog*, is
4674 determined by the type of the reduction variable (in the example
4675 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4676 However the type (mode) we use to check available target support
4677 for the vector operation to be created *inside the loop*, is
4678 determined by the type of the other arguments to STMT (in the
4679 example we'd check this: optab_handler (widen_sum_optab,
4680 vect_short_mode)).
4681
4682 This is contrary to "regular" reductions, in which the types of all
4683 the arguments are the same as the type of the reduction variable.
4684 For "regular" reductions we can therefore use the same vector type
4685 (and also the same tree-code) when generating the epilog code and
4686 when generating the code inside the loop. */
4687
4688 if (orig_stmt)
4689 {
4690 /* This is a reduction pattern: get the vectype from the type of the
4691 reduction variable, and get the tree-code from orig_stmt. */
4692 orig_code = gimple_assign_rhs_code (orig_stmt);
4693 gcc_assert (vectype_out);
4694 vec_mode = TYPE_MODE (vectype_out);
4695 }
4696 else
4697 {
4698 /* Regular reduction: use the same vectype and tree-code as used for
4699 the vector code inside the loop can be used for the epilog code. */
4700 orig_code = code;
4701 }
4702
4703 if (nested_cycle)
4704 {
4705 def_bb = gimple_bb (reduc_def_stmt);
4706 def_stmt_loop = def_bb->loop_father;
4707 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4708 loop_preheader_edge (def_stmt_loop));
4709 if (TREE_CODE (def_arg) == SSA_NAME
4710 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4711 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4712 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4713 && vinfo_for_stmt (def_arg_stmt)
4714 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4715 == vect_double_reduction_def)
4716 double_reduc = true;
4717 }
4718
4719 epilog_reduc_code = ERROR_MARK;
4720 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4721 {
4722 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4723 optab_default);
4724 if (!reduc_optab)
4725 {
4726 if (vect_print_dump_info (REPORT_DETAILS))
4727 fprintf (vect_dump, "no optab for reduction.");
4728
4729 epilog_reduc_code = ERROR_MARK;
4730 }
4731
4732 if (reduc_optab
4733 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4734 {
4735 if (vect_print_dump_info (REPORT_DETAILS))
4736 fprintf (vect_dump, "reduc op not supported by target.");
4737
4738 epilog_reduc_code = ERROR_MARK;
4739 }
4740 }
4741 else
4742 {
4743 if (!nested_cycle || double_reduc)
4744 {
4745 if (vect_print_dump_info (REPORT_DETAILS))
4746 fprintf (vect_dump, "no reduc code for scalar code.");
4747
4748 return false;
4749 }
4750 }
4751
4752 if (double_reduc && ncopies > 1)
4753 {
4754 if (vect_print_dump_info (REPORT_DETAILS))
4755 fprintf (vect_dump, "multiple types in double reduction");
4756
4757 return false;
4758 }
4759
4760 /* In case of widenning multiplication by a constant, we update the type
4761 of the constant to be the type of the other operand. We check that the
4762 constant fits the type in the pattern recognition pass. */
4763 if (code == DOT_PROD_EXPR
4764 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4765 {
4766 if (TREE_CODE (ops[0]) == INTEGER_CST)
4767 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
4768 else if (TREE_CODE (ops[1]) == INTEGER_CST)
4769 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
4770 else
4771 {
4772 if (vect_print_dump_info (REPORT_DETAILS))
4773 fprintf (vect_dump, "invalid types in dot-prod");
4774
4775 return false;
4776 }
4777 }
4778
4779 if (!vec_stmt) /* transformation not required. */
4780 {
4781 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4782 return false;
4783 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4784 return true;
4785 }
4786
4787 /** Transform. **/
4788
4789 if (vect_print_dump_info (REPORT_DETAILS))
4790 fprintf (vect_dump, "transform reduction.");
4791
4792 /* FORNOW: Multiple types are not supported for condition. */
4793 if (code == COND_EXPR)
4794 gcc_assert (ncopies == 1);
4795
4796 /* Create the destination vector */
4797 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4798
4799 /* In case the vectorization factor (VF) is bigger than the number
4800 of elements that we can fit in a vectype (nunits), we have to generate
4801 more than one vector stmt - i.e - we need to "unroll" the
4802 vector stmt by a factor VF/nunits. For more details see documentation
4803 in vectorizable_operation. */
4804
4805 /* If the reduction is used in an outer loop we need to generate
4806 VF intermediate results, like so (e.g. for ncopies=2):
4807 r0 = phi (init, r0)
4808 r1 = phi (init, r1)
4809 r0 = x0 + r0;
4810 r1 = x1 + r1;
4811 (i.e. we generate VF results in 2 registers).
4812 In this case we have a separate def-use cycle for each copy, and therefore
4813 for each copy we get the vector def for the reduction variable from the
4814 respective phi node created for this copy.
4815
4816 Otherwise (the reduction is unused in the loop nest), we can combine
4817 together intermediate results, like so (e.g. for ncopies=2):
4818 r = phi (init, r)
4819 r = x0 + r;
4820 r = x1 + r;
4821 (i.e. we generate VF/2 results in a single register).
4822 In this case for each copy we get the vector def for the reduction variable
4823 from the vectorized reduction operation generated in the previous iteration.
4824 */
4825
4826 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4827 {
4828 single_defuse_cycle = true;
4829 epilog_copies = 1;
4830 }
4831 else
4832 epilog_copies = ncopies;
4833
4834 prev_stmt_info = NULL;
4835 prev_phi_info = NULL;
4836 if (slp_node)
4837 {
4838 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4839 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4840 == TYPE_VECTOR_SUBPARTS (vectype_in));
4841 }
4842 else
4843 {
4844 vec_num = 1;
4845 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4846 if (op_type == ternary_op)
4847 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4848 }
4849
4850 phis = VEC_alloc (gimple, heap, vec_num);
4851 vect_defs = VEC_alloc (tree, heap, vec_num);
4852 if (!slp_node)
4853 VEC_quick_push (tree, vect_defs, NULL_TREE);
4854
4855 for (j = 0; j < ncopies; j++)
4856 {
4857 if (j == 0 || !single_defuse_cycle)
4858 {
4859 for (i = 0; i < vec_num; i++)
4860 {
4861 /* Create the reduction-phi that defines the reduction
4862 operand. */
4863 new_phi = create_phi_node (vec_dest, loop->header);
4864 set_vinfo_for_stmt (new_phi,
4865 new_stmt_vec_info (new_phi, loop_vinfo,
4866 NULL));
4867 if (j == 0 || slp_node)
4868 VEC_quick_push (gimple, phis, new_phi);
4869 }
4870 }
4871
4872 if (code == COND_EXPR)
4873 {
4874 gcc_assert (!slp_node);
4875 vectorizable_condition (stmt, gsi, vec_stmt,
4876 PHI_RESULT (VEC_index (gimple, phis, 0)),
4877 reduc_index, NULL);
4878 /* Multiple types are not supported for condition. */
4879 break;
4880 }
4881
4882 /* Handle uses. */
4883 if (j == 0)
4884 {
4885 op0 = ops[!reduc_index];
4886 if (op_type == ternary_op)
4887 {
4888 if (reduc_index == 0)
4889 op1 = ops[2];
4890 else
4891 op1 = ops[1];
4892 }
4893
4894 if (slp_node)
4895 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4896 slp_node, -1);
4897 else
4898 {
4899 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4900 stmt, NULL);
4901 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4902 if (op_type == ternary_op)
4903 {
4904 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
4905 NULL);
4906 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4907 }
4908 }
4909 }
4910 else
4911 {
4912 if (!slp_node)
4913 {
4914 enum vect_def_type dt;
4915 gimple dummy_stmt;
4916 tree dummy;
4917
4918 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
4919 &dummy_stmt, &dummy, &dt);
4920 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
4921 loop_vec_def0);
4922 VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4923 if (op_type == ternary_op)
4924 {
4925 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
4926 &dummy, &dt);
4927 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4928 loop_vec_def1);
4929 VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4930 }
4931 }
4932
4933 if (single_defuse_cycle)
4934 reduc_def = gimple_assign_lhs (new_stmt);
4935
4936 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4937 }
4938
4939 FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
4940 {
4941 if (slp_node)
4942 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4943 else
4944 {
4945 if (!single_defuse_cycle || j == 0)
4946 reduc_def = PHI_RESULT (new_phi);
4947 }
4948
4949 def1 = ((op_type == ternary_op)
4950 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4951 if (op_type == binary_op)
4952 {
4953 if (reduc_index == 0)
4954 expr = build2 (code, vectype_out, reduc_def, def0);
4955 else
4956 expr = build2 (code, vectype_out, def0, reduc_def);
4957 }
4958 else
4959 {
4960 if (reduc_index == 0)
4961 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4962 else
4963 {
4964 if (reduc_index == 1)
4965 expr = build3 (code, vectype_out, def0, reduc_def, def1);
4966 else
4967 expr = build3 (code, vectype_out, def0, def1, reduc_def);
4968 }
4969 }
4970
4971 new_stmt = gimple_build_assign (vec_dest, expr);
4972 new_temp = make_ssa_name (vec_dest, new_stmt);
4973 gimple_assign_set_lhs (new_stmt, new_temp);
4974 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4975
4976 if (slp_node)
4977 {
4978 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4979 VEC_quick_push (tree, vect_defs, new_temp);
4980 }
4981 else
4982 VEC_replace (tree, vect_defs, 0, new_temp);
4983 }
4984
4985 if (slp_node)
4986 continue;
4987
4988 if (j == 0)
4989 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4990 else
4991 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4992
4993 prev_stmt_info = vinfo_for_stmt (new_stmt);
4994 prev_phi_info = vinfo_for_stmt (new_phi);
4995 }
4996
4997 /* Finalize the reduction-phi (set its arguments) and create the
4998 epilog reduction code. */
4999 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5000 {
5001 new_temp = gimple_assign_lhs (*vec_stmt);
5002 VEC_replace (tree, vect_defs, 0, new_temp);
5003 }
5004
5005 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5006 epilog_reduc_code, phis, reduc_index,
5007 double_reduc, slp_node);
5008
5009 VEC_free (gimple, heap, phis);
5010 VEC_free (tree, heap, vec_oprnds0);
5011 if (vec_oprnds1)
5012 VEC_free (tree, heap, vec_oprnds1);
5013
5014 return true;
5015 }
5016
5017 /* Function vect_min_worthwhile_factor.
5018
5019 For a loop where we could vectorize the operation indicated by CODE,
5020 return the minimum vectorization factor that makes it worthwhile
5021 to use generic vectors. */
5022 int
5023 vect_min_worthwhile_factor (enum tree_code code)
5024 {
5025 switch (code)
5026 {
5027 case PLUS_EXPR:
5028 case MINUS_EXPR:
5029 case NEGATE_EXPR:
5030 return 4;
5031
5032 case BIT_AND_EXPR:
5033 case BIT_IOR_EXPR:
5034 case BIT_XOR_EXPR:
5035 case BIT_NOT_EXPR:
5036 return 2;
5037
5038 default:
5039 return INT_MAX;
5040 }
5041 }
5042
5043
5044 /* Function vectorizable_induction
5045
5046 Check if PHI performs an induction computation that can be vectorized.
5047 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5048 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5049 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5050
5051 bool
5052 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5053 gimple *vec_stmt)
5054 {
5055 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5056 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5057 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5058 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5059 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5060 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5061 tree vec_def;
5062
5063 gcc_assert (ncopies >= 1);
5064 /* FORNOW. These restrictions should be relaxed. */
5065 if (nested_in_vect_loop_p (loop, phi))
5066 {
5067 imm_use_iterator imm_iter;
5068 use_operand_p use_p;
5069 gimple exit_phi;
5070 edge latch_e;
5071 tree loop_arg;
5072
5073 if (ncopies > 1)
5074 {
5075 if (vect_print_dump_info (REPORT_DETAILS))
5076 fprintf (vect_dump, "multiple types in nested loop.");
5077 return false;
5078 }
5079
5080 exit_phi = NULL;
5081 latch_e = loop_latch_edge (loop->inner);
5082 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5083 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5084 {
5085 if (!flow_bb_inside_loop_p (loop->inner,
5086 gimple_bb (USE_STMT (use_p))))
5087 {
5088 exit_phi = USE_STMT (use_p);
5089 break;
5090 }
5091 }
5092 if (exit_phi)
5093 {
5094 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5095 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5096 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5097 {
5098 if (vect_print_dump_info (REPORT_DETAILS))
5099 fprintf (vect_dump, "inner-loop induction only used outside "
5100 "of the outer vectorized loop.");
5101 return false;
5102 }
5103 }
5104 }
5105
5106 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5107 return false;
5108
5109 /* FORNOW: SLP not supported. */
5110 if (STMT_SLP_TYPE (stmt_info))
5111 return false;
5112
5113 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5114
5115 if (gimple_code (phi) != GIMPLE_PHI)
5116 return false;
5117
5118 if (!vec_stmt) /* transformation not required. */
5119 {
5120 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5121 if (vect_print_dump_info (REPORT_DETAILS))
5122 fprintf (vect_dump, "=== vectorizable_induction ===");
5123 vect_model_induction_cost (stmt_info, ncopies);
5124 return true;
5125 }
5126
5127 /** Transform. **/
5128
5129 if (vect_print_dump_info (REPORT_DETAILS))
5130 fprintf (vect_dump, "transform induction phi.");
5131
5132 vec_def = get_initial_def_for_induction (phi);
5133 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5134 return true;
5135 }
5136
5137 /* Function vectorizable_live_operation.
5138
5139 STMT computes a value that is used outside the loop. Check if
5140 it can be supported. */
5141
5142 bool
5143 vectorizable_live_operation (gimple stmt,
5144 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5145 gimple *vec_stmt ATTRIBUTE_UNUSED)
5146 {
5147 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5148 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5149 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5150 int i;
5151 int op_type;
5152 tree op;
5153 tree def;
5154 gimple def_stmt;
5155 enum vect_def_type dt;
5156 enum tree_code code;
5157 enum gimple_rhs_class rhs_class;
5158
5159 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5160
5161 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5162 return false;
5163
5164 if (!is_gimple_assign (stmt))
5165 return false;
5166
5167 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5168 return false;
5169
5170 /* FORNOW. CHECKME. */
5171 if (nested_in_vect_loop_p (loop, stmt))
5172 return false;
5173
5174 code = gimple_assign_rhs_code (stmt);
5175 op_type = TREE_CODE_LENGTH (code);
5176 rhs_class = get_gimple_rhs_class (code);
5177 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5178 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5179
5180 /* FORNOW: support only if all uses are invariant. This means
5181 that the scalar operations can remain in place, unvectorized.
5182 The original last scalar value that they compute will be used. */
5183
5184 for (i = 0; i < op_type; i++)
5185 {
5186 if (rhs_class == GIMPLE_SINGLE_RHS)
5187 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5188 else
5189 op = gimple_op (stmt, i + 1);
5190 if (op
5191 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5192 &dt))
5193 {
5194 if (vect_print_dump_info (REPORT_DETAILS))
5195 fprintf (vect_dump, "use not simple.");
5196 return false;
5197 }
5198
5199 if (dt != vect_external_def && dt != vect_constant_def)
5200 return false;
5201 }
5202
5203 /* No transformation is required for the cases we currently support. */
5204 return true;
5205 }
5206
5207 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5208
5209 static void
5210 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5211 {
5212 ssa_op_iter op_iter;
5213 imm_use_iterator imm_iter;
5214 def_operand_p def_p;
5215 gimple ustmt;
5216
5217 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5218 {
5219 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5220 {
5221 basic_block bb;
5222
5223 if (!is_gimple_debug (ustmt))
5224 continue;
5225
5226 bb = gimple_bb (ustmt);
5227
5228 if (!flow_bb_inside_loop_p (loop, bb))
5229 {
5230 if (gimple_debug_bind_p (ustmt))
5231 {
5232 if (vect_print_dump_info (REPORT_DETAILS))
5233 fprintf (vect_dump, "killing debug use");
5234
5235 gimple_debug_bind_reset_value (ustmt);
5236 update_stmt (ustmt);
5237 }
5238 else
5239 gcc_unreachable ();
5240 }
5241 }
5242 }
5243 }
5244
5245 /* Function vect_transform_loop.
5246
5247 The analysis phase has determined that the loop is vectorizable.
5248 Vectorize the loop - created vectorized stmts to replace the scalar
5249 stmts in the loop, and update the loop exit condition. */
5250
5251 void
5252 vect_transform_loop (loop_vec_info loop_vinfo)
5253 {
5254 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5255 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5256 int nbbs = loop->num_nodes;
5257 gimple_stmt_iterator si;
5258 int i;
5259 tree ratio = NULL;
5260 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5261 bool grouped_store;
5262 bool slp_scheduled = false;
5263 unsigned int nunits;
5264 gimple stmt, pattern_stmt;
5265 gimple_seq pattern_def_seq = NULL;
5266 gimple_stmt_iterator pattern_def_si = gsi_none ();
5267 bool transform_pattern_stmt = false;
5268 bool check_profitability;
5269 int th;
5270
5271 if (vect_print_dump_info (REPORT_DETAILS))
5272 fprintf (vect_dump, "=== vec_transform_loop ===");
5273
5274 /* Use the more conservative vectorization threshold. If the number
5275 of iterations is constant assume the cost check has been performed
5276 by our caller. If the threshold makes all loops profitable that
5277 run at least the vectorization factor number of times checking
5278 is pointless, too. */
5279 th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5280 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5281 th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5282 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5283 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5284 {
5285 if (vect_print_dump_info (REPORT_COST))
5286 fprintf (vect_dump,
5287 "Profitability threshold is %d loop iterations.", th);
5288 check_profitability = true;
5289 }
5290
5291 /* Peel the loop if there are data refs with unknown alignment.
5292 Only one data ref with unknown store is allowed. */
5293
5294 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5295 {
5296 vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
5297 check_profitability = false;
5298 }
5299
5300 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5301 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5302 {
5303 vect_loop_versioning (loop_vinfo, th, check_profitability);
5304 check_profitability = false;
5305 }
5306
5307 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5308 compile time constant), or it is a constant that doesn't divide by the
5309 vectorization factor, then an epilog loop needs to be created.
5310 We therefore duplicate the loop: the original loop will be vectorized,
5311 and will compute the first (n/VF) iterations. The second copy of the loop
5312 will remain scalar and will compute the remaining (n%VF) iterations.
5313 (VF is the vectorization factor). */
5314
5315 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5316 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5317 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5318 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5319 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5320 th, check_profitability);
5321 else
5322 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5323 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5324
5325 /* 1) Make sure the loop header has exactly two entries
5326 2) Make sure we have a preheader basic block. */
5327
5328 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5329
5330 split_edge (loop_preheader_edge (loop));
5331
5332 /* FORNOW: the vectorizer supports only loops which body consist
5333 of one basic block (header + empty latch). When the vectorizer will
5334 support more involved loop forms, the order by which the BBs are
5335 traversed need to be reconsidered. */
5336
5337 for (i = 0; i < nbbs; i++)
5338 {
5339 basic_block bb = bbs[i];
5340 stmt_vec_info stmt_info;
5341 gimple phi;
5342
5343 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5344 {
5345 phi = gsi_stmt (si);
5346 if (vect_print_dump_info (REPORT_DETAILS))
5347 {
5348 fprintf (vect_dump, "------>vectorizing phi: ");
5349 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
5350 }
5351 stmt_info = vinfo_for_stmt (phi);
5352 if (!stmt_info)
5353 continue;
5354
5355 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5356 vect_loop_kill_debug_uses (loop, phi);
5357
5358 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5359 && !STMT_VINFO_LIVE_P (stmt_info))
5360 continue;
5361
5362 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5363 != (unsigned HOST_WIDE_INT) vectorization_factor)
5364 && vect_print_dump_info (REPORT_DETAILS))
5365 fprintf (vect_dump, "multiple-types.");
5366
5367 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5368 {
5369 if (vect_print_dump_info (REPORT_DETAILS))
5370 fprintf (vect_dump, "transform phi.");
5371 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5372 }
5373 }
5374
5375 pattern_stmt = NULL;
5376 for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5377 {
5378 bool is_store;
5379
5380 if (transform_pattern_stmt)
5381 stmt = pattern_stmt;
5382 else
5383 stmt = gsi_stmt (si);
5384
5385 if (vect_print_dump_info (REPORT_DETAILS))
5386 {
5387 fprintf (vect_dump, "------>vectorizing statement: ");
5388 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
5389 }
5390
5391 stmt_info = vinfo_for_stmt (stmt);
5392
5393 /* vector stmts created in the outer-loop during vectorization of
5394 stmts in an inner-loop may not have a stmt_info, and do not
5395 need to be vectorized. */
5396 if (!stmt_info)
5397 {
5398 gsi_next (&si);
5399 continue;
5400 }
5401
5402 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5403 vect_loop_kill_debug_uses (loop, stmt);
5404
5405 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5406 && !STMT_VINFO_LIVE_P (stmt_info))
5407 {
5408 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5409 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5410 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5411 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5412 {
5413 stmt = pattern_stmt;
5414 stmt_info = vinfo_for_stmt (stmt);
5415 }
5416 else
5417 {
5418 gsi_next (&si);
5419 continue;
5420 }
5421 }
5422 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5423 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5424 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5425 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5426 transform_pattern_stmt = true;
5427
5428 /* If pattern statement has def stmts, vectorize them too. */
5429 if (is_pattern_stmt_p (stmt_info))
5430 {
5431 if (pattern_def_seq == NULL)
5432 {
5433 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5434 pattern_def_si = gsi_start (pattern_def_seq);
5435 }
5436 else if (!gsi_end_p (pattern_def_si))
5437 gsi_next (&pattern_def_si);
5438 if (pattern_def_seq != NULL)
5439 {
5440 gimple pattern_def_stmt = NULL;
5441 stmt_vec_info pattern_def_stmt_info = NULL;
5442
5443 while (!gsi_end_p (pattern_def_si))
5444 {
5445 pattern_def_stmt = gsi_stmt (pattern_def_si);
5446 pattern_def_stmt_info
5447 = vinfo_for_stmt (pattern_def_stmt);
5448 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5449 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5450 break;
5451 gsi_next (&pattern_def_si);
5452 }
5453
5454 if (!gsi_end_p (pattern_def_si))
5455 {
5456 if (vect_print_dump_info (REPORT_DETAILS))
5457 {
5458 fprintf (vect_dump, "==> vectorizing pattern def"
5459 " stmt: ");
5460 print_gimple_stmt (vect_dump, pattern_def_stmt, 0,
5461 TDF_SLIM);
5462 }
5463
5464 stmt = pattern_def_stmt;
5465 stmt_info = pattern_def_stmt_info;
5466 }
5467 else
5468 {
5469 pattern_def_si = gsi_none ();
5470 transform_pattern_stmt = false;
5471 }
5472 }
5473 else
5474 transform_pattern_stmt = false;
5475 }
5476
5477 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5478 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5479 STMT_VINFO_VECTYPE (stmt_info));
5480 if (!STMT_SLP_TYPE (stmt_info)
5481 && nunits != (unsigned int) vectorization_factor
5482 && vect_print_dump_info (REPORT_DETAILS))
5483 /* For SLP VF is set according to unrolling factor, and not to
5484 vector size, hence for SLP this print is not valid. */
5485 fprintf (vect_dump, "multiple-types.");
5486
5487 /* SLP. Schedule all the SLP instances when the first SLP stmt is
5488 reached. */
5489 if (STMT_SLP_TYPE (stmt_info))
5490 {
5491 if (!slp_scheduled)
5492 {
5493 slp_scheduled = true;
5494
5495 if (vect_print_dump_info (REPORT_DETAILS))
5496 fprintf (vect_dump, "=== scheduling SLP instances ===");
5497
5498 vect_schedule_slp (loop_vinfo, NULL);
5499 }
5500
5501 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
5502 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5503 {
5504 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5505 {
5506 pattern_def_seq = NULL;
5507 gsi_next (&si);
5508 }
5509 continue;
5510 }
5511 }
5512
5513 /* -------- vectorize statement ------------ */
5514 if (vect_print_dump_info (REPORT_DETAILS))
5515 fprintf (vect_dump, "transform statement.");
5516
5517 grouped_store = false;
5518 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
5519 if (is_store)
5520 {
5521 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5522 {
5523 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5524 interleaving chain was completed - free all the stores in
5525 the chain. */
5526 gsi_next (&si);
5527 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5528 continue;
5529 }
5530 else
5531 {
5532 /* Free the attached stmt_vec_info and remove the stmt. */
5533 gimple store = gsi_stmt (si);
5534 free_stmt_vec_info (store);
5535 unlink_stmt_vdef (store);
5536 gsi_remove (&si, true);
5537 release_defs (store);
5538 continue;
5539 }
5540 }
5541
5542 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5543 {
5544 pattern_def_seq = NULL;
5545 gsi_next (&si);
5546 }
5547 } /* stmts in BB */
5548 } /* BBs in loop */
5549
5550 slpeel_make_loop_iterate_ntimes (loop, ratio);
5551
5552 /* The memory tags and pointers in vectorized statements need to
5553 have their SSA forms updated. FIXME, why can't this be delayed
5554 until all the loops have been transformed? */
5555 update_ssa (TODO_update_ssa);
5556
5557 if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
5558 fprintf (vect_dump, "LOOP VECTORIZED.");
5559 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
5560 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
5561 }