/* Loop Vectorization
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Free Software
   Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "cfgloop.h"
#include "cfglayout.h"
#include "expr.h"
#include "recog.h"
#include "optabs.h"
#include "params.h"
#include "toplev.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it were manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   have successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMEs), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFs whose base is an array DECL
   (not a pointer), and INDIRECT_REFs through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.
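
   For example, "a[i]" (an ARRAY_REF whose base is the array DECL 'a') and
   "*p" (an INDIRECT_REF through a pointer that advances by one element per
   iteration) are both simple, consecutive data-refs; a strided access such
   as "a[2*i]" does not have a consecutive access pattern and is therefore
   not handled.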

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "UNITS_PER_SIMD_WORD".  Targets that can
   support different sizes of vectors, for now will need to specify one value
   for "UNITS_PER_SIMD_WORD".  More flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)->insn_code).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/

/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4-byte
   elements, on a target with a vector size (VS) of 16 bytes, the VF is set
   to 4, since 4 elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated
   upon are of the same size.  Therefore this function currently sets VF
   according to the size of the types operated upon, and fails if there are
   multiple sizes in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/

static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  tree scalar_type;
  gimple phi;
  tree vectype;
  unsigned int nunits;
  stmt_vec_info stmt_info;
  int i;
  HOST_WIDE_INT dummy;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
              scalar_type = TREE_TYPE (PHI_RESULT (phi));

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type: ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "vectype: ");
                  print_generic_expr (vect_dump, vectype, TDF_SLIM);
                }

              nunits = TYPE_VECTOR_SUBPARTS (vectype);
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "nunits = %d", nunits);

              if (!vectorization_factor
                  || (nunits > vectorization_factor))
                vectorization_factor = nunits;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (stmt);

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining statement: ");
              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          /* Skip stmts that do not need to be vectorized.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "skip.");
              continue;
            }

          if (gimple_get_lhs (stmt) == NULL_TREE)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: irregular stmt.");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: vector stmt in loop:");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (STMT_VINFO_VECTYPE (stmt_info))
            {
              /* The only case in which a vectype has already been set is
                 for stmts that contain a data-ref, or for "pattern stmts"
                 (stmts generated by the vectorizer to represent/replace
                 a certain idiom).  */
              gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
                          || is_pattern_stmt_p (stmt_info));
              vectype = STMT_VINFO_VECTYPE (stmt_info);
            }
          else
            {
              gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
                          && !is_pattern_stmt_p (stmt_info));

              scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
                                                           &dummy);
              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type: ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;
            }

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "vectype: ");
              print_generic_expr (vect_dump, vectype, TDF_SLIM);
            }

          nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "nunits = %d", nunits);

          if (!vectorization_factor
              || (nunits > vectorization_factor))
            vectorization_factor = nunits;
        }
    }

  /* TODO: Analyze cost.  Decide if it is worthwhile to vectorize.  */
  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
  if (vectorization_factor <= 1)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: unsupported data-type");
      return false;
    }
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  return true;
}


/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution with a constant step.  */
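
/* For illustration: an IV updated as "i = i + 4" in loop number 1 has the
   scalar-evolution (chrec) form {init, +, 4}_1, a degree-1 polynomial whose
   step is the INTEGER_CST 4, which the function below accepts.  A step that
   itself evolves in the loop yields a chrec of degree >= 2 and is
   rejected.  */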

static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree *init,
                             tree *step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "step: ");
      print_generic_expr (vect_dump, step_expr, TDF_SLIM);
      fprintf (vect_dump, ", init: ");
      print_generic_expr (vect_dump, init_expr, TDF_SLIM);
    }

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "step unknown.");
      return false;
    }

  return true;
}

/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
{
  basic_block bb = loop->header;
  tree dummy;
  VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
  gimple_stmt_iterator gsi;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");

  /* First - identify all inductions.  Reduction detection assumes that all
     the inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple phi = gsi_stmt (gsi);
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      /* Skip virtual PHIs.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (!is_gimple_reg (SSA_NAME_VAR (def)))
        continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn && vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Access function of PHI: ");
          print_generic_expr (vect_dump, access_fn, TDF_SLIM);
        }

      if (!access_fn
          || !vect_is_simple_iv_evolution (loop->num, access_fn, &dummy,
                                           &dummy))
        {
          VEC_safe_push (gimple, heap, worklist, phi);
          continue;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Detected induction.");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (VEC_length (gimple, worklist) > 0)
    {
      gimple phi = VEC_pop (gimple, worklist);
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
      gimple reduc_stmt;
      bool nested_cycle;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
      gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
      reduc_stmt = vect_is_simple_reduction (loop_vinfo, phi, !nested_cycle);
      if (reduc_stmt)
        {
          if (nested_cycle)
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "Detected vectorizable nested cycle.");

              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt))
                = vect_nested_cycle;
            }
          else
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "Detected reduction.");

              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt))
                = vect_reduction_def;
            }
        }
      else
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Unknown def-use cycle pattern.");
    }

  VEC_free (gimple, heap, worklist);
}


/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables; classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner-loop, if it exists.
   Examples of scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */

static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such an inner-loop therefore have different properties from
     the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the
        original scalar loop, so we can't change the order of computation
        when vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
        current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}

/* Function vect_get_loop_niters.

   Determine the number of iterations for which the loop is executed.
   If an expression that represents the number of iterations
   can be constructed, place it in NUMBER_OF_ITERATIONS.
   Return the loop exit condition.  */

static gimple
vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
{
  tree niters;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== get_loop_niters ===");

  niters = number_of_exit_cond_executions (loop);

  if (niters != NULL_TREE
      && niters != chrec_dont_know)
    {
      *number_of_iterations = niters;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "==> get_loop_niters:");
          print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
        }
    }

  return get_loop_exit_condition (loop);
}


/* Function bb_in_loop_p

   Used as a predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const struct loop *const loop = (const struct loop *) data;
  return flow_bb_inside_loop_p (loop, bb);
}


/* Function new_loop_vec_info.

   Create and initialize a new loop_vec_info struct for LOOP, as well as
   stmt_vec_info structs for all the stmts in LOOP.  */

static loop_vec_info
new_loop_vec_info (struct loop *loop)
{
  loop_vec_info res;
  basic_block *bbs;
  gimple_stmt_iterator si;
  unsigned int i, nbbs;

  res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
  LOOP_VINFO_LOOP (res) = loop;

  bbs = get_loop_body (loop);

  /* Create/Update stmt_info for all stmts in the loop.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      /* BBs in a nested inner-loop will already have been processed (because
         we will have called vect_analyze_loop_form for any nested inner-loop).
         Therefore, for stmts in an inner-loop we just want to update the
         STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
         loop_info of the outer-loop we are currently considering to vectorize
         (instead of the loop_info of the inner-loop).
         For stmts in other BBs we need to create a stmt_info from scratch.  */
      if (bb->loop_father != loop)
        {
          /* Inner-loop bb.  */
          gcc_assert (loop->inner && bb->loop_father == loop->inner);
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (phi);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
        }
      else
        {
          /* bb in current nest.  */
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              gimple_set_uid (phi, 0);
              set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
            }

          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              gimple_set_uid (stmt, 0);
              set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
            }
        }
    }

  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as a reversed postorder traversal, so we are safe.  */

  free (bbs);
  bbs = XCNEWVEC (basic_block, loop->num_nodes);
  nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
                             bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  LOOP_VINFO_BBS (res) = bbs;
  LOOP_VINFO_NITERS (res) = NULL;
  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
  LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
  LOOP_VINFO_VECTORIZABLE_P (res) = 0;
  LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
  LOOP_VINFO_VECT_FACTOR (res) = 0;
  LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
  LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
  LOOP_VINFO_UNALIGNED_DR (res) = NULL;
  LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
    VEC_alloc (gimple, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
  LOOP_VINFO_MAY_ALIAS_DDRS (res) =
    VEC_alloc (ddr_p, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
  LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;

  return res;
}


/* Function destroy_loop_vec_info.

   Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all
   the stmts in the loop.  */

void
destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
{
  struct loop *loop;
  basic_block *bbs;
  int nbbs;
  gimple_stmt_iterator si;
  int j;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;

  if (!loop_vinfo)
    return;

  loop = LOOP_VINFO_LOOP (loop_vinfo);

  bbs = LOOP_VINFO_BBS (loop_vinfo);
  nbbs = loop->num_nodes;

  if (!clean_stmts)
    {
      free (LOOP_VINFO_BBS (loop_vinfo));
      free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
      free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
      VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));

      free (loop_vinfo);
      loop->aux = NULL;
      return;
    }

  for (j = 0; j < nbbs; j++)
    {
      basic_block bb = bbs[j];
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        free_stmt_vec_info (gsi_stmt (si));

      for (si = gsi_start_bb (bb); !gsi_end_p (si); )
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          if (stmt_info)
            {
              /* Check if this is a "pattern stmt" (introduced by the
                 vectorizer during the pattern recognition pass).  */
              bool remove_stmt_p = false;
              gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
              if (orig_stmt)
                {
                  stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
                  if (orig_stmt_info
                      && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
                    remove_stmt_p = true;
                }

              /* Free stmt_vec_info.  */
              free_stmt_vec_info (stmt);

              /* Remove dead "pattern stmts".  gsi_remove already advances
                 SI to the next stmt, so don't advance it a second time
                 below.  */
              if (remove_stmt_p)
                {
                  gsi_remove (&si, true);
                  continue;
                }
            }
          gsi_next (&si);
        }
    }

  free (LOOP_VINFO_BBS (loop_vinfo));
  free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
  free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
  VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
    vect_free_slp_instance (instance);

  VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));

  free (loop_vinfo);
  loop->aux = NULL;
}


/* Function vect_analyze_loop_1.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  This is a subset of the analyses applied in
   vect_analyze_loop, to be applied on an inner-loop nested in the loop
   that is now considered for (outer-loop) vectorization.  */

static loop_vec_info
vect_analyze_loop_1 (struct loop *loop)
{
  loop_vec_info loop_vinfo;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest_1 =====");

  /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */

  loop_vinfo = vect_analyze_loop_form (loop);
  if (!loop_vinfo)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad inner-loop form.");
      return NULL;
    }

  return loop_vinfo;
}


/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough, and the number of iterations
     can be analyzed (a countable loop).  */

loop_vec_info
vect_analyze_loop_form (struct loop *loop)
{
  loop_vec_info loop_vinfo;
  gimple loop_cond;
  tree number_of_iterations = NULL;
  loop_vec_info inner_loop_vinfo = NULL;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_form ===");

  /* Different restrictions apply when we are considering an inner-most loop
     vs. an outer (nested) loop.
     (FORNOW.  May want to relax some of these restrictions in the future).  */

  if (!loop->inner)
    {
      /* Inner-most loop.  We currently require that the number of BBs is
         exactly 2 (the header and latch).  Vectorizable inner-most loops
         look like this:

                        (pre-header)
                           |
                          header <--------+
                           | |            |
                           | +--> latch --+
                           |
                        (exit-bb)  */

      if (loop->num_nodes != 2)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          return NULL;
        }

      if (empty_block_p (loop->header))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: empty loop.");
          return NULL;
        }
    }
  else
    {
      struct loop *innerloop = loop->inner;
      edge backedge, entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
         contains a single inner loop, and the number of BBs is exactly 5.
         Vectorizable outer-loops look like this:

                        (pre-header)
                           |
                          header <---+
                           |         |
                          inner-loop |
                           |         |
                          tail ------+
                           |
                        (exit-bb)

         The inner-loop has the properties expected of inner-most loops
         as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: multiple nested loops.");
          return NULL;
        }

      /* Analyze the inner-loop.  */
      inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
      if (!inner_loop_vinfo)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: Bad inner loop.");
          return NULL;
        }

      if (!expr_invariant_in_loop_p (loop,
                                     LOOP_VINFO_NITERS (inner_loop_vinfo)))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump,
                     "not vectorized: inner-loop count not invariant.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (loop->num_nodes != 5)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
      backedge = EDGE_PRED (innerloop->header, 1);
      entryedge = EDGE_PRED (innerloop->header, 0);
      if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
        {
          backedge = EDGE_PRED (innerloop->header, 0);
          entryedge = EDGE_PRED (innerloop->header, 1);
        }

      if (entryedge->src != loop->header
          || !single_exit (innerloop)
          || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump,
                     "not vectorized: unsupported outerloop form.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Considering outer-loop vectorization.");
    }

  if (!single_exit (loop)
      || EDGE_COUNT (loop->header->preds) != 2)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        {
          if (!single_exit (loop))
            fprintf (vect_dump, "not vectorized: multiple exits.");
          else if (EDGE_COUNT (loop->header->preds) != 2)
            fprintf (vect_dump, "not vectorized: too many incoming edges.");
        }
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* We assume that the loop exit condition is at the end of the loop, i.e.,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
  if (!empty_block_p (loop->latch)
      || phi_nodes (loop->latch))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: unexpected loop form.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* Make sure there exists a single-predecessor exit bb:  */
  if (!single_pred_p (single_exit (loop)->dest))
    {
      edge e = single_exit (loop);
      if (!(e->flags & EDGE_ABNORMAL))
        {
          split_loop_exit_edge (e);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "split exit edge.");
        }
      else
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
          if (inner_loop_vinfo)
            destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }
    }

  loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
  if (!loop_cond)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: complicated exit condition.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!number_of_iterations)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump,
                 "not vectorized: number of iterations cannot be computed.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (chrec_contains_undetermined (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "Infinite number of iterations.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!NITERS_KNOWN_P (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Symbolic number of iterations is ");
          print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
        }
    }
  else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: number of iterations = 0.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, false);
      return NULL;
    }

  loop_vinfo = new_loop_vec_info (loop);
  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;

  STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;

  /* CHECKME: May want to keep it around in the future.  */
  if (inner_loop_vinfo)
    destroy_loop_vec_info (inner_loop_vinfo, false);

  gcc_assert (!loop->aux);
  loop->aux = loop_vinfo;
  return loop_vinfo;
}


/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */

static bool
vect_analyze_loop_operations (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  int i;
  gimple phi;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  int min_profitable_iters;
  int min_scalar_loop_bound;
  unsigned int th;
  bool only_slp_in_loop = true, ok;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_operations ===");

  gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          ok = true;

          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          if (!is_loop_header_bb_p (bb))
            {
              /* Inner-loop loop-closed exit phi in outer-loop vectorization
                 (i.e. a phi in the tail of the outer-loop).
                 FORNOW: we currently don't support the case that these phis
                 are not used in the outer-loop, because this case would
                 require us to actually do something here.  */
              if (!STMT_VINFO_RELEVANT_P (stmt_info)
                  || STMT_VINFO_LIVE_P (stmt_info))
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump,
                             "Unsupported loop-closed phi in outer-loop.");
                  return false;
                }
              continue;
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_LIVE_P (stmt_info))
            {
              /* FORNOW: not yet supported.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump, "not vectorized: value used after loop.");
              return false;
            }

          if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
            {
              /* A scalar-dependence cycle that we don't support.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump,
                         "not vectorized: scalar dependence cycle.");
              return false;
            }

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              need_to_vectorize = true;
              if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
                ok = vectorizable_induction (phi, NULL, NULL);
            }

          if (!ok)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: relevant phi not supported: ");
                  print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
                }
              return false;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          gcc_assert (stmt_info);

          if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
            return false;

          if (STMT_VINFO_RELEVANT_P (stmt_info) && !PURE_SLP_STMT (stmt_info))
            /* STMT needs both SLP and loop-based vectorization.  */
            only_slp_in_loop = false;
        }
    } /* bbs */

  /* All operations in the loop are either irrelevant (deal with loop
     control, or dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump,
                 "All the computation can be taken out of the loop.");
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump,
                 "not vectorized: redundant loop. no profit to vectorize.");
      return false;
    }

  /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
     vectorization factor of the loop is the unrolling factor required by the
     SLP instances.  If that unrolling factor is 1, we say that we perform
     pure SLP on the loop - cross-iteration parallelism is not exploited.  */
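  /* For example (purely as an illustration of the computation below): with a
     loop vectorization factor of 4 and an SLP unrolling factor of 2 the
     combined factor is lcm (4, 2) = 4, whereas an SLP unrolling factor of 3
     would force a combined factor of lcm (4, 3) = 12.  */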
  if (only_slp_in_loop)
    vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
  else
    vectorization_factor = least_common_multiple (vectorization_factor,
                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));

  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump,
             "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
             vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: iteration count too small.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "vectorization factor.");
      return false;
    }

  /* Analyze cost.  Decide if it is worthwhile to vectorize.  */

  /* Once VF is set, SLP costs should be updated since the number of created
     vector stmts depends on VF.  */
  vect_update_slp_costs_according_to_vf (loop_vinfo);

  min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
  LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;

  if (min_profitable_iters < 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: vectorization not profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: vector version will never be "
                 "profitable.");
      return false;
    }

  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
                            * vectorization_factor) - 1);

  /* Use the cost model only if it is more conservative than the
     user-specified threshold.  */

  th = (unsigned) min_scalar_loop_bound;
  if (min_profitable_iters
      && (!min_scalar_loop_bound
          || min_profitable_iters > min_scalar_loop_bound))
    th = (unsigned) min_profitable_iters;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: vectorization not "
                 "profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "user specified loop bound parameter or minimum "
                 "profitable iterations (whichever is more conservative).");
      return false;
    }

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
      || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "epilog loop required.");
      if (!vect_can_advance_ivs_p (loop_vinfo))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 1.");
          return false;
        }
      if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 2.");
          return false;
        }
    }

  return true;
}


/* Function vect_analyze_loop.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
loop_vec_info
vect_analyze_loop (struct loop *loop)
{
  bool ok;
  loop_vec_info loop_vinfo;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest =====");

  if (loop_outer (loop)
      && loop_vec_info_for_loop (loop_outer (loop))
      && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop
                                                        (loop_outer (loop))))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "outer-loop already vectorized.");
      return NULL;
    }
  /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */

  loop_vinfo = vect_analyze_loop_form (loop);
  if (!loop_vinfo)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad loop form.");
      return NULL;
    }

  /* Find all data references in the loop (which correspond to vdefs/vuses)
     and analyze their evolution in the loop.

     FORNOW: Handle only simple array references whose
     alignment can be forced, and aligned pointer-references.  */

  ok = vect_analyze_data_refs (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data references.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Classify all cross-iteration scalar data-flow cycles.
     Cross-iteration cycles caused by virtual phis are analyzed
     separately.  */

  vect_analyze_scalar_cycles (loop_vinfo);

  vect_pattern_recog (loop_vinfo);

  /* Data-flow analysis to detect stmts that do not need to be
     vectorized.  */

  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "unexpected pattern.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Analyze the alignment of the data-refs in the loop.
     Fail if a data reference is found that cannot be vectorized.  */

  ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  ok = vect_determine_vectorization_factor (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "can't determine vectorization factor.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Analyze data dependences between the data-refs in the loop.
     FORNOW: fail at the first data dependence that we encounter.  */

  ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data dependence.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Analyze the access patterns of the data-refs in the loop (consecutive,
     complex, etc.).  FORNOW: Only handle consecutive access pattern.  */

  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data access.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Prune the list of ddrs to be tested at run-time by versioning for alias.
     It is important to call pruning after vect_analyze_data_ref_accesses,
     since we use grouping information gathered by interleaving analysis.  */
  ok = vect_prune_runtime_alias_test_list (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "too long list of versioning for alias "
                 "run-time tests.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
  ok = vect_analyze_slp (loop_vinfo, NULL);
  if (ok)
    {
      /* Decide which possible SLP instances to SLP.  */
      vect_make_slp_decision (loop_vinfo);

      /* Find stmts that need to be both vectorized and SLPed.  */
      vect_detect_hybrid_slp (loop_vinfo);
    }

  /* This pass will decide on using loop versioning and/or loop peeling in
     order to enhance the alignment of data references in the loop.  */

  ok = vect_enhance_data_refs_alignment (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Scan all the operations in the loop and make sure they are
     vectorizable.  */

  ok = vect_analyze_loop_operations (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad operation or unsupported loop bound.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;

  return loop_vinfo;
}


/* Function reduction_code_for_scalar_code

   Input:
   CODE - tree_code of a reduction operation.

   Output:
   REDUC_CODE - the corresponding tree-code to be used to reduce the
      vector of partial results into a single scalar result (which
      will also reside in a vector).

   Return TRUE if a corresponding REDUC_CODE was found, FALSE otherwise.  */

static bool
reduction_code_for_scalar_code (enum tree_code code,
                                enum tree_code *reduc_code)
{
  switch (code)
    {
    case MAX_EXPR:
      *reduc_code = REDUC_MAX_EXPR;
      return true;

    case MIN_EXPR:
      *reduc_code = REDUC_MIN_EXPR;
      return true;

    case PLUS_EXPR:
      *reduc_code = REDUC_PLUS_EXPR;
      return true;

    default:
      return false;
    }
}


/* Error reporting helper for vect_is_simple_reduction below.  GIMPLE
   statement STMT is printed with a message MSG.  */

static void
report_vect_op (gimple stmt, const char *msg)
{
  fprintf (vect_dump, "%s", msg);
  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}


/* Function vect_is_simple_reduction

   Detect a cross-iteration def-use cycle that represents a simple
   reduction computation.  We look for the following pattern:

   loop_header:
     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   such that:
   1. operation is commutative and associative and it is safe to
      change the order of the computation (if CHECK_REDUCTION is true)
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation.

   Condition 1 is tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   Also detect a cross-iteration def-use cycle in nested loops, i.e., nested
   cycles, if CHECK_REDUCTION is false.  */
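
/* For illustration: the C summation "for (i=0; i<N; i++) sum += a[i];"
   gives rise to exactly this pattern, with PLUS_EXPR as the operation:

   loop_header:
     sum_1 = phi < sum_0, sum_2 >
     t_3 = a[i];
     sum_2 = t_3 + sum_1

   (The SSA names here are made up for the example.)  */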
1529
1530 gimple
1531 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
1532 bool check_reduction)
1533 {
1534 struct loop *loop = (gimple_bb (phi))->loop_father;
1535 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1536 edge latch_e = loop_latch_edge (loop);
1537 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1538 gimple def_stmt, def1, def2;
1539 enum tree_code code;
1540 tree op1, op2;
1541 tree type;
1542 int nloop_uses;
1543 tree name;
1544 imm_use_iterator imm_iter;
1545 use_operand_p use_p;
1546
1547 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1548 otherwise, we assume outer loop vectorization. */
1549 gcc_assert ((check_reduction && loop == vect_loop)
1550 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1551
1552 name = PHI_RESULT (phi);
1553 nloop_uses = 0;
1554 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1555 {
1556 gimple use_stmt = USE_STMT (use_p);
1557 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1558 && vinfo_for_stmt (use_stmt)
1559 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1560 nloop_uses++;
1561 if (nloop_uses > 1)
1562 {
1563 if (vect_print_dump_info (REPORT_DETAILS))
1564 fprintf (vect_dump, "reduction used in loop.");
1565 return NULL;
1566 }
1567 }
1568
1569 if (TREE_CODE (loop_arg) != SSA_NAME)
1570 {
1571 if (vect_print_dump_info (REPORT_DETAILS))
1572 {
1573 fprintf (vect_dump, "reduction: not ssa_name: ");
1574 print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1575 }
1576 return NULL;
1577 }
1578
1579 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1580 if (!def_stmt)
1581 {
1582 if (vect_print_dump_info (REPORT_DETAILS))
1583 fprintf (vect_dump, "reduction: no def_stmt.");
1584 return NULL;
1585 }
1586
1587 if (!is_gimple_assign (def_stmt))
1588 {
1589 if (vect_print_dump_info (REPORT_DETAILS))
1590 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1591 return NULL;
1592 }
1593
1594 name = gimple_assign_lhs (def_stmt);
1595 nloop_uses = 0;
1596 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1597 {
1598 gimple use_stmt = USE_STMT (use_p);
1599 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1600 && vinfo_for_stmt (use_stmt)
1601 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1602 nloop_uses++;
1603 if (nloop_uses > 1)
1604 {
1605 if (vect_print_dump_info (REPORT_DETAILS))
1606 fprintf (vect_dump, "reduction used in loop.");
1607 return NULL;
1608 }
1609 }
1610
1611 code = gimple_assign_rhs_code (def_stmt);
1612
1613 if (check_reduction
1614 && (!commutative_tree_code (code) || !associative_tree_code (code)))
1615 {
1616 if (vect_print_dump_info (REPORT_DETAILS))
1617 report_vect_op (def_stmt, "reduction: not commutative/associative: ");
1618 return NULL;
1619 }
1620
1621 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
1622 {
1623 if (vect_print_dump_info (REPORT_DETAILS))
1624 report_vect_op (def_stmt, "reduction: not binary operation: ");
1625 return NULL;
1626 }
1627
1628 op1 = gimple_assign_rhs1 (def_stmt);
1629 op2 = gimple_assign_rhs2 (def_stmt);
1630 if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
1631 {
1632 if (vect_print_dump_info (REPORT_DETAILS))
1633 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1634 return NULL;
1635 }
1636
1637 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
1638 if (TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op1))
1639 || TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op2)))
1640 {
1641 if (vect_print_dump_info (REPORT_DETAILS))
1642 {
1643 fprintf (vect_dump, "reduction: multiple types: operation type: ");
1644 print_generic_expr (vect_dump, type, TDF_SLIM);
1645 fprintf (vect_dump, ", operands types: ");
1646 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
1647 fprintf (vect_dump, ",");
1648 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
1649 }
1650 return NULL;
1651 }
1652
1653 /* Check that it's ok to change the order of the computation.
1654 Generally, when vectorizing a reduction we change the order of the
1655 computation. This may change the behavior of the program in some
1656 cases, so we need to check that this is ok. One exception is when
1657 vectorizing an outer-loop: the inner-loop is executed sequentially,
1658 and therefore vectorizing reductions in the inner-loop during
1659 outer-loop vectorization is safe. */
1660
1661 /* CHECKME: check for !flag_finite_math_only too? */
1662 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
1663 && check_reduction)
1664 {
1665 /* Changing the order of operations changes the semantics. */
1666 if (vect_print_dump_info (REPORT_DETAILS))
1667 report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
1668 return NULL;
1669 }
1670 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
1671 && check_reduction)
1672 {
1673 /* Changing the order of operations changes the semantics. */
1674 if (vect_print_dump_info (REPORT_DETAILS))
1675 report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
1676 return NULL;
1677 }
1678 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
1679 {
1680 /* Changing the order of operations changes the semantics. */
1681 if (vect_print_dump_info (REPORT_DETAILS))
1682 report_vect_op (def_stmt,
1683 "reduction: unsafe fixed-point math optimization: ");
1684 return NULL;
1685 }
1686
1687 /* Reduction is safe. We're dealing with one of the following:
1688 1) integer arithmetic and no trapv
1689 2) floating point arithmetic, and special flags permit this optimization
1690 3) nested cycle (i.e., outer loop vectorization). */
1691 def1 = SSA_NAME_DEF_STMT (op1);
1692 def2 = SSA_NAME_DEF_STMT (op2);
1693 if (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2))
1694 {
1695 if (vect_print_dump_info (REPORT_DETAILS))
1696 report_vect_op (def_stmt, "reduction: no defs for operands: ");
1697 return NULL;
1698 }
1699
1700
1701 /* Check that one def is the reduction def, defined by PHI,
1702 the other def is either defined in the loop ("vect_internal_def"),
1703 or it's an induction (defined by a loop-header phi-node). */
1704
1705 if (def2 == phi
1706 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
1707 && (is_gimple_assign (def1)
1708 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_induction_def
1709 || (gimple_code (def1) == GIMPLE_PHI
1710 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1711 == vect_internal_def
1712 && !is_loop_header_bb_p (gimple_bb (def1)))))
1713 {
1714 if (vect_print_dump_info (REPORT_DETAILS))
1715 report_vect_op (def_stmt, "detected reduction: ");
1716 return def_stmt;
1717 }
1718 else if (def1 == phi
1719 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
1720 && (is_gimple_assign (def2)
1721 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1722 == vect_induction_def
1723 || (gimple_code (def2) == GIMPLE_PHI
1724 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1725 == vect_internal_def
1726 && !is_loop_header_bb_p (gimple_bb (def2)))))
1727 {
1728 if (check_reduction)
1729 {
1730 /* Swap operands (just for simplicity - so that the rest of the code
1731 can assume that the reduction variable is always the last (second)
1732 argument). */
1733 if (vect_print_dump_info (REPORT_DETAILS))
1734 report_vect_op (def_stmt,
1735 "detected reduction: need to swap operands: ");
1736
1737 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
1738 gimple_assign_rhs2_ptr (def_stmt));
1739 }
1740 else
1741 {
1742 if (vect_print_dump_info (REPORT_DETAILS))
1743 report_vect_op (def_stmt, "detected reduction: ");
1744 }
1745
1746 return def_stmt;
1747 }
1748 else
1749 {
1750 if (vect_print_dump_info (REPORT_DETAILS))
1751 report_vect_op (def_stmt, "reduction: unknown pattern: ");
1752
1753 return NULL;
1754 }
1755 }
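/* Illustrative sketch (not part of the original source) of the kind of
   scalar cycle the detection above accepts.  For

     int sum = 0;
     for (i = 0; i < n; i++)
       sum = sum + a[i];

   the relevant gimple is roughly:

     sum_1 = PHI <0 (preheader), sum_2 (latch)>    <-- PHI
     ...
     sum_2 = sum_1 + a_i;                          <-- DEF_STMT

   One operand of DEF_STMT is the loop-header PHI and the other is a
   vect_internal_def (the load of a[i]), so DEF_STMT is returned as a
   reduction.  When the PHI result appears as the first operand (def1),
   the code above swaps the operands so the reduction variable is
   always last.  */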
1756
1757
1758 /* Function vect_estimate_min_profitable_iters
1759
1760 Return the number of iterations required for the vector version of the
1761 loop to be profitable relative to the cost of the scalar version of the
1762 loop.
1763
1764 TODO: Take profile info into account before making vectorization
1765 decisions, if available. */
1766
1767 int
1768 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
1769 {
1770 int i;
1771 int min_profitable_iters;
1772 int peel_iters_prologue;
1773 int peel_iters_epilogue;
1774 int vec_inside_cost = 0;
1775 int vec_outside_cost = 0;
1776 int scalar_single_iter_cost = 0;
1777 int scalar_outside_cost = 0;
1778 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1779 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1780 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1781 int nbbs = loop->num_nodes;
1782 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
1783 int peel_guard_costs = 0;
1784 int innerloop_iters = 0, factor;
1785 VEC (slp_instance, heap) *slp_instances;
1786 slp_instance instance;
1787
1788 /* Cost model disabled. */
1789 if (!flag_vect_cost_model)
1790 {
1791 if (vect_print_dump_info (REPORT_COST))
1792 fprintf (vect_dump, "cost model disabled.");
1793 return 0;
1794 }
1795
1796 /* Requires loop versioning tests to handle misalignment. */
1797 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1798 {
1799 /* FIXME: Make cost depend on complexity of individual check. */
1800 vec_outside_cost +=
1801 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1802 if (vect_print_dump_info (REPORT_COST))
1803 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
1804 "versioning to treat misalignment.\n");
1805 }
1806
1807 /* Requires loop versioning with alias checks. */
1808 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
1809 {
1810 /* FIXME: Make cost depend on complexity of individual check. */
1811 vec_outside_cost +=
1812 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
1813 if (vect_print_dump_info (REPORT_COST))
1814 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
1815 "versioning aliasing.\n");
1816 }
1817
1818 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
1819 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
1820 vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
1821
1822 /* Count statements in the scalar loop, using this count as the scalar
1823 cost of a single iteration for now.
1824
1825 TODO: Add outer loop support.
1826
1827 TODO: Consider assigning different costs to different scalar
1828 statements. */
1829
1830 /* FORNOW. */
1831 if (loop->inner)
1832 innerloop_iters = 50; /* FIXME */
1833
1834 for (i = 0; i < nbbs; i++)
1835 {
1836 gimple_stmt_iterator si;
1837 basic_block bb = bbs[i];
1838
1839 if (bb->loop_father == loop->inner)
1840 factor = innerloop_iters;
1841 else
1842 factor = 1;
1843
1844 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1845 {
1846 gimple stmt = gsi_stmt (si);
1847 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1848 /* Skip stmts that are not vectorized inside the loop. */
1849 if (!STMT_VINFO_RELEVANT_P (stmt_info)
1850 && (!STMT_VINFO_LIVE_P (stmt_info)
1851 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
1852 continue;
1853 scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
1854 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
1855 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
1856 some of the "outside" costs are generated inside the outer-loop. */
1857 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
1858 }
1859 }
1860
1861 /* Add additional cost for the peeled instructions in prologue and epilogue
1862 loop.
1863
1864 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
1865 at compile-time - we assume it's vf/2 (the worst would be vf-1).
1866
1867 TODO: Build an expression that represents peel_iters for prologue and
1868 epilogue to be used in a run-time test. */
1869
1870 if (byte_misalign < 0)
1871 {
1872 peel_iters_prologue = vf/2;
1873 if (vect_print_dump_info (REPORT_COST))
1874 fprintf (vect_dump, "cost model: "
1875 "prologue peel iters set to vf/2.");
1876
1877 /* If peeling for alignment is unknown, the loop bound of the main
1878 loop becomes unknown. */
1879 peel_iters_epilogue = vf/2;
1880 if (vect_print_dump_info (REPORT_COST))
1881 fprintf (vect_dump, "cost model: "
1882 "epilogue peel iters set to vf/2 because "
1883 "peeling for alignment is unknown .");
1884
1885 /* If peeled iterations are unknown, count a taken branch and a not taken
1886 branch per peeled loop. Even if scalar loop iterations are known,
1887 vector iterations are not known since peeled prologue iterations are
1888 not known. Hence guards remain the same. */
1889 peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
1890 + TARG_COND_NOT_TAKEN_BRANCH_COST);
1891 }
1892 else
1893 {
1894 if (byte_misalign)
1895 {
1896 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1897 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
1898 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
1899 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1900
1901 peel_iters_prologue = nelements - (byte_misalign / element_size);
1902 }
1903 else
1904 peel_iters_prologue = 0;
1905
1906 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1907 {
1908 peel_iters_epilogue = vf/2;
1909 if (vect_print_dump_info (REPORT_COST))
1910 fprintf (vect_dump, "cost model: "
1911 "epilogue peel iters set to vf/2 because "
1912 "loop iterations are unknown .");
1913
1914 /* If peeled iterations are known but the number of scalar loop
1915 iterations is unknown, count a taken branch per peeled loop. */
1916 peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
1917
1918 }
1919 else
1920 {
1921 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
1922 peel_iters_prologue = niters < peel_iters_prologue ?
1923 niters : peel_iters_prologue;
1924 peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
1925 }
1926 }
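  /* Worked example (illustrative, with made-up numbers): for V8HI
     accesses (nelements = 8, element_size = 2) and byte_misalign = 4,
     the prologue peels nelements - byte_misalign / element_size
     = 8 - 2 = 6 iterations.  If niters is known to be 100 and vf = 8,
     the epilogue then peels (100 - 6) % 8 = 6 iterations as well.  */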
1927
1928 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
1929 + (peel_iters_epilogue * scalar_single_iter_cost)
1930 + peel_guard_costs;
1931
1932 /* FORNOW: The scalar outside cost is incremented in one of the
1933 following ways:
1934
1935 1. The vectorizer checks for alignment and aliasing and generates
1936 a condition that allows dynamic vectorization. A cost model
1937 check is ANDed with the versioning condition. Hence the scalar
1938 code path now carries the added cost of the versioning check.
1939
1940 if (cost > th & versioning_check)
1941 jmp to vector code
1942
1943 Hence the run-time scalar cost is incremented by a not-taken branch cost.
1944
1945 2. The vectorizer then checks if a prologue is required. If the
1946 cost model check was not done before during versioning, it has to
1947 be done before the prologue check.
1948
1949 if (cost <= th)
1950 prologue = scalar_iters
1951 if (prologue == 0)
1952 jmp to vector code
1953 else
1954 execute prologue
1955 if (prologue == num_iters)
1956 go to exit
1957
1958 Hence the run-time scalar cost is incremented by a taken branch,
1959 plus a not-taken branch, plus a taken branch cost.
1960
1961 3. The vectorizer then checks if an epilogue is required. If the
1962 cost model check was not done before during prologue check, it
1963 has to be done with the epilogue check.
1964
1965 if (prologue == 0)
1966 jmp to vector code
1967 else
1968 execute prologue
1969 if (prologue == num_iters)
1970 go to exit
1971 vector code:
1972 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
1973 jmp to epilogue
1974
1975 Hence the run-time scalar cost should be incremented by 2 taken
1976 branches.
1977
1978 TODO: The back end may reorder the BBS's differently and reverse
1979 conditions/branch directions. Change the estimates below to
1980 something more reasonable. */
1981
1982 /* If the number of iterations is known and we do not do versioning, we can
1983 decide whether to vectorize at compile time. Hence the scalar version
1984 does not carry cost model guard costs. */
1985 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1986 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
1987 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
1988 {
1989 /* Cost model check occurs at versioning. */
1990 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
1991 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
1992 scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
1993 else
1994 {
1995 /* Cost model check occurs at prologue generation. */
1996 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
1997 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
1998 + TARG_COND_NOT_TAKEN_BRANCH_COST;
1999 /* Cost model check occurs at epilogue generation. */
2000 else
2001 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
2002 }
2003 }
2004
2005 /* Add SLP costs. */
2006 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2007 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
2008 {
2009 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2010 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2011 }
2012
2013 /* Calculate the number of iterations required to make the vector version
2014 profitable, relative to the loop bodies only. The following condition
2015 must hold true:
2016 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2017 where
2018 SIC = scalar iteration cost, VIC = vector iteration cost,
2019 VOC = vector outside cost, VF = vectorization factor,
2020 PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations,
2021 SOC = scalar outside cost for run time cost model check. */
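  /* Solving that condition for niters (illustrative derivation, not in
     the original source):

       SIC * niters + SOC > VIC * (niters - PL_ITERS - EP_ITERS) / VF + VOC
       niters * (SIC * VF - VIC) > (VOC - SOC) * VF
                                   - VIC * (PL_ITERS + EP_ITERS)
       niters > ((VOC - SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
                / (SIC * VF - VIC)

     which is what the code below computes, bumping the quotient by one
     when plain integer division under-estimates.  E.g., with SIC = 4,
     VIC = 6, VF = 4, VOC = 14, SOC = 0 and no peeling, niters must
     exceed 56 / 10 = 5.6, so 6 iterations are needed.  */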
2022
2023 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2024 {
2025 if (vec_outside_cost <= 0)
2026 min_profitable_iters = 1;
2027 else
2028 {
2029 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2030 - vec_inside_cost * peel_iters_prologue
2031 - vec_inside_cost * peel_iters_epilogue)
2032 / ((scalar_single_iter_cost * vf)
2033 - vec_inside_cost);
2034
2035 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2036 <= ((vec_inside_cost * min_profitable_iters)
2037 + ((vec_outside_cost - scalar_outside_cost) * vf)))
2038 min_profitable_iters++;
2039 }
2040 }
2041 /* The vector version will never be profitable. */
2042 else
2043 {
2044 if (vect_print_dump_info (REPORT_COST))
2045 fprintf (vect_dump, "cost model: vector iteration cost = %d "
2046 "is at least the scalar iteration cost = %d "
2047 "times the vectorization factor = %d.",
2048 vec_inside_cost, scalar_single_iter_cost, vf);
2049 return -1;
2050 }
2051
2052 if (vect_print_dump_info (REPORT_COST))
2053 {
2054 fprintf (vect_dump, "Cost model analysis: \n");
2055 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
2056 vec_inside_cost);
2057 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
2058 vec_outside_cost);
2059 fprintf (vect_dump, " Scalar iteration cost: %d\n",
2060 scalar_single_iter_cost);
2061 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
2062 fprintf (vect_dump, " prologue iterations: %d\n",
2063 peel_iters_prologue);
2064 fprintf (vect_dump, " epilogue iterations: %d\n",
2065 peel_iters_epilogue);
2066 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
2067 min_profitable_iters);
2068 }
2069
2070 min_profitable_iters =
2071 min_profitable_iters < vf ? vf : min_profitable_iters;
2072
2073 /* Because the condition we create is:
2074 if (niters <= min_profitable_iters)
2075 then skip the vectorized loop. */
2076 min_profitable_iters--;
2077
2078 if (vect_print_dump_info (REPORT_COST))
2079 fprintf (vect_dump, " Profitability threshold = %d\n",
2080 min_profitable_iters);
2081
2082 return min_profitable_iters;
2083 }
2084
2085
2086 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2087 functions. Design better to avoid maintenance issues. */
2088
2089 /* Function vect_model_reduction_cost.
2090
2091 Models cost for a reduction operation, including the vector ops
2092 generated within the strip-mine loop, the initial definition before
2093 the loop, and the epilogue code that must be generated. */
2094
2095 static bool
2096 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2097 int ncopies)
2098 {
2099 int outer_cost = 0;
2100 enum tree_code code;
2101 optab optab;
2102 tree vectype;
2103 gimple stmt, orig_stmt;
2104 tree reduction_op;
2105 enum machine_mode mode;
2106 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2107 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2108
2109
2110 /* Cost of reduction op inside loop. */
2111 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
2112
2113 stmt = STMT_VINFO_STMT (stmt_info);
2114
2115 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2116 {
2117 case GIMPLE_SINGLE_RHS:
2118 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2119 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2120 break;
2121 case GIMPLE_UNARY_RHS:
2122 reduction_op = gimple_assign_rhs1 (stmt);
2123 break;
2124 case GIMPLE_BINARY_RHS:
2125 reduction_op = gimple_assign_rhs2 (stmt);
2126 break;
2127 default:
2128 gcc_unreachable ();
2129 }
2130
2131 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2132 if (!vectype)
2133 {
2134 if (vect_print_dump_info (REPORT_COST))
2135 {
2136 fprintf (vect_dump, "unsupported data-type ");
2137 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2138 }
2139 return false;
2140 }
2141
2142 mode = TYPE_MODE (vectype);
2143 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2144
2145 if (!orig_stmt)
2146 orig_stmt = STMT_VINFO_STMT (stmt_info);
2147
2148 code = gimple_assign_rhs_code (orig_stmt);
2149
2150 /* Add in cost for initial definition. */
2151 outer_cost += TARG_SCALAR_TO_VEC_COST;
2152
2153 /* Determine cost of epilogue code.
2154
2155 If a reduction operator is available, it reduces the whole vector in one
2156 statement; a scalar extract is also required. */
2157
2158 if (!nested_in_vect_loop_p (loop, orig_stmt))
2159 {
2160 if (reduc_code != ERROR_MARK)
2161 outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
2162 else
2163 {
2164 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2165 tree bitsize =
2166 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2167 int element_bitsize = tree_low_cst (bitsize, 1);
2168 int nelements = vec_size_in_bits / element_bitsize;
2169
2170 optab = optab_for_tree_code (code, vectype, optab_default);
2171
2172 /* We have a whole vector shift available. */
2173 if (VECTOR_MODE_P (mode)
2174 && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
2175 && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2176 /* Final reduction via vector shifts and the reduction operator. Also
2177 requires scalar extract. */
2178 outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
2179 + TARG_VEC_TO_SCALAR_COST);
2180 else
2181 /* Use extracts and reduction op for final reduction. For N elements,
2182 we have N extracts and N-1 reduction ops. */
2183 outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
2184 }
2185 }
2186
2187 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2188
2189 if (vect_print_dump_info (REPORT_COST))
2190 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2191 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2192 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2193
2194 return true;
2195 }
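/* Illustrative example (assuming unit target costs; the TARG_* values
   are really target-defined macros): for a V4SI add reduction with
   ncopies = 1, the inside cost is one vector statement.  Outside the
   loop, if a direct reduc_code exists the epilogue costs one vector
   statement plus one vec-to-scalar extract; with only whole-vector
   shifts it costs log2(4) * 2 = 4 vector statements plus the extract;
   the scalar fallback costs 4 + 3 = 7 statements (4 extracts and 3
   adds).  All variants also pay one scalar-to-vec cost for the
   initial definition.  */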
2196
2197
2198 /* Function vect_model_induction_cost.
2199
2200 Models cost for induction operations. */
2201
2202 static void
2203 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2204 {
2205 /* loop cost for vec_loop. */
2206 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
2207 /* prologue cost for vec_init and vec_step. */
2208 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
2209
2210 if (vect_print_dump_info (REPORT_COST))
2211 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2212 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2213 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2214 }
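/* For example (illustrative, assuming unit costs): with vf = 8 and a
   V4SI induction, ncopies = 2, so the inside cost is two vector
   statements per iteration, and the prologue pays two scalar-to-vec
   costs to build vec_init and vec_step.  */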
2215
2216
2217 /* Function get_initial_def_for_induction
2218
2219 Input:
2220 STMT - a stmt that performs an induction operation in the loop.
2221 IV_PHI - the initial value of the induction variable
2222
2223 Output:
2224 Return a vector variable, initialized with the first VF values of
2225 the induction variable. E.g., for an iv with IV_PHI='X' and
2226 evolution S, for a vector of 4 units, we want to return:
2227 [X, X + S, X + 2*S, X + 3*S]. */
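/* Illustrative example (not in the original source): for X = 0, S = 1
   and VF = nunits = 4 (V4SI), the code below builds
   vec_init = [0,1,2,3] in the preheader and vec_step = [4,4,4,4]
   (i.e. VF*S in every lane), producing the cycle:

     vec_iv = PHI <[0,1,2,3], vec_loop>
     ...
     vec_loop = vec_iv + [4,4,4,4];  */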
2228
2229 static tree
2230 get_initial_def_for_induction (gimple iv_phi)
2231 {
2232 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2233 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2234 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2235 tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
2236 tree vectype;
2237 int nunits;
2238 edge pe = loop_preheader_edge (loop);
2239 struct loop *iv_loop;
2240 basic_block new_bb;
2241 tree vec, vec_init, vec_step, t;
2242 tree access_fn;
2243 tree new_var;
2244 tree new_name;
2245 gimple init_stmt, induction_phi, new_stmt;
2246 tree induc_def, vec_def, vec_dest;
2247 tree init_expr, step_expr;
2248 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2249 int i;
2250 bool ok;
2251 int ncopies;
2252 tree expr;
2253 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2254 bool nested_in_vect_loop = false;
2255 gimple_seq stmts = NULL;
2256 imm_use_iterator imm_iter;
2257 use_operand_p use_p;
2258 gimple exit_phi;
2259 edge latch_e;
2260 tree loop_arg;
2261 gimple_stmt_iterator si;
2262 basic_block bb = gimple_bb (iv_phi);
2263 tree stepvectype;
2264
2265 vectype = get_vectype_for_scalar_type (scalar_type);
2266 gcc_assert (vectype);
2267 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2268 ncopies = vf / nunits;
2269
2270 gcc_assert (phi_info);
2271 gcc_assert (ncopies >= 1);
2272
2273 /* Find the first insertion point in the BB. */
2274 si = gsi_after_labels (bb);
2275
2276 if (INTEGRAL_TYPE_P (scalar_type))
2277 step_expr = build_int_cst (scalar_type, 0);
2278 else if (POINTER_TYPE_P (scalar_type))
2279 step_expr = build_int_cst (sizetype, 0);
2280 else
2281 step_expr = build_real (scalar_type, dconst0);
2282
2283 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
2284 if (nested_in_vect_loop_p (loop, iv_phi))
2285 {
2286 nested_in_vect_loop = true;
2287 iv_loop = loop->inner;
2288 }
2289 else
2290 iv_loop = loop;
2291 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2292
2293 latch_e = loop_latch_edge (iv_loop);
2294 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2295
2296 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2297 gcc_assert (access_fn);
2298 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2299 &init_expr, &step_expr);
2300 gcc_assert (ok);
2301 pe = loop_preheader_edge (iv_loop);
2302
2303 /* Create the vector that holds the initial_value of the induction. */
2304 if (nested_in_vect_loop)
2305 {
2306 /* iv_loop is nested in the loop to be vectorized. init_expr has already
2307 been created during vectorization of previous stmts; we obtain it from
2308 the STMT_VINFO_VEC_STMT of the defining stmt. */
2309 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
2310 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2311 }
2312 else
2313 {
2314 /* iv_loop is the loop to be vectorized. Create:
2315 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
2316 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2317 add_referenced_var (new_var);
2318
2319 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2320 if (stmts)
2321 {
2322 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2323 gcc_assert (!new_bb);
2324 }
2325
2326 t = NULL_TREE;
2327 t = tree_cons (NULL_TREE, init_expr, t);
2328 for (i = 1; i < nunits; i++)
2329 {
2330 /* Create: new_name_i = new_name + step_expr */
2331 enum tree_code code = POINTER_TYPE_P (scalar_type)
2332 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2333 init_stmt = gimple_build_assign_with_ops (code, new_var,
2334 new_name, step_expr);
2335 new_name = make_ssa_name (new_var, init_stmt);
2336 gimple_assign_set_lhs (init_stmt, new_name);
2337
2338 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2339 gcc_assert (!new_bb);
2340
2341 if (vect_print_dump_info (REPORT_DETAILS))
2342 {
2343 fprintf (vect_dump, "created new init_stmt: ");
2344 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2345 }
2346 t = tree_cons (NULL_TREE, new_name, t);
2347 }
2348 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
2349 vec = build_constructor_from_list (vectype, nreverse (t));
2350 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2351 }
2352
2353
2354 /* Create the vector that holds the step of the induction. */
2355 if (nested_in_vect_loop)
2356 /* iv_loop is nested in the loop to be vectorized. Generate:
2357 vec_step = [S, S, S, S] */
2358 new_name = step_expr;
2359 else
2360 {
2361 /* iv_loop is the loop to be vectorized. Generate:
2362 vec_step = [VF*S, VF*S, VF*S, VF*S] */
2363 expr = build_int_cst (TREE_TYPE (step_expr), vf);
2364 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2365 expr, step_expr);
2366 }
2367
2368 t = NULL_TREE;
2369 for (i = 0; i < nunits; i++)
2370 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2371 gcc_assert (CONSTANT_CLASS_P (new_name));
2372 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2373 gcc_assert (stepvectype);
2374 vec = build_vector (stepvectype, t);
2375 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2376
2377
2378 /* Create the following def-use cycle:
2379 loop prolog:
2380 vec_init = ...
2381 vec_step = ...
2382 loop:
2383 vec_iv = PHI <vec_init, vec_loop>
2384 ...
2385 STMT
2386 ...
2387 vec_loop = vec_iv + vec_step; */
2388
2389 /* Create the induction-phi that defines the induction-operand. */
2390 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2391 add_referenced_var (vec_dest);
2392 induction_phi = create_phi_node (vec_dest, iv_loop->header);
2393 set_vinfo_for_stmt (induction_phi,
2394 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2395 induc_def = PHI_RESULT (induction_phi);
2396
2397 /* Create the iv update inside the loop */
2398 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2399 induc_def, vec_step);
2400 vec_def = make_ssa_name (vec_dest, new_stmt);
2401 gimple_assign_set_lhs (new_stmt, vec_def);
2402 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2403 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2404 NULL));
2405
2406 /* Set the arguments of the phi node: */
2407 add_phi_arg (induction_phi, vec_init, pe);
2408 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
2409
2410
2411 /* In case the vectorization factor (VF) is bigger than the number
2412 of elements that we can fit in a vectype (nunits), we have to generate
2413 more than one vector stmt, i.e., we need to "unroll" the
2414 vector stmt by a factor VF/nunits. For more details see documentation
2415 in vectorizable_operation. */
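  /* Illustrative example (not in the original source): with VF = 8 but
     V4SI vectors (nunits = 4, ncopies = 2) and an iv with X = 0, S = 1,
     the first copy holds [0,1,2,3] and the second is obtained by adding
     a step vector of nunits * S: [0,1,2,3] + [4,4,4,4] = [4,5,6,7].
     Both copies then advance by [8,8,8,8] (VF * S in every lane) each
     vector iteration.  */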
2416
2417 if (ncopies > 1)
2418 {
2419 stmt_vec_info prev_stmt_vinfo;
2420 /* FORNOW. This restriction should be relaxed. */
2421 gcc_assert (!nested_in_vect_loop);
2422
2423 /* Create the vector that holds the step of the induction. */
2424 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2425 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2426 expr, step_expr);
2427 t = NULL_TREE;
2428 for (i = 0; i < nunits; i++)
2429 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2430 gcc_assert (CONSTANT_CLASS_P (new_name));
2431 vec = build_vector (stepvectype, t);
2432 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2433
2434 vec_def = induc_def;
2435 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2436 for (i = 1; i < ncopies; i++)
2437 {
2438 /* vec_i = vec_prev + vec_step */
2439 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2440 vec_def, vec_step);
2441 vec_def = make_ssa_name (vec_dest, new_stmt);
2442 gimple_assign_set_lhs (new_stmt, vec_def);
2443
2444 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2445 set_vinfo_for_stmt (new_stmt,
2446 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2447 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2448 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2449 }
2450 }
2451
2452 if (nested_in_vect_loop)
2453 {
2454 /* Find the loop-closed exit-phi of the induction, and record
2455 the final vector of induction results: */
2456 exit_phi = NULL;
2457 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2458 {
2459 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
2460 {
2461 exit_phi = USE_STMT (use_p);
2462 break;
2463 }
2464 }
2465 if (exit_phi)
2466 {
2467 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2468 /* FORNOW. Currently not supporting the case that an inner-loop induction
2469 is not used in the outer-loop (i.e., is used only outside the outer-loop). */
2470 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2471 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2472
2473 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2474 if (vect_print_dump_info (REPORT_DETAILS))
2475 {
2476 fprintf (vect_dump, "vector of inductions after inner-loop:");
2477 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2478 }
2479 }
2480 }
2481
2482
2483 if (vect_print_dump_info (REPORT_DETAILS))
2484 {
2485 fprintf (vect_dump, "transform induction: created def-use cycle: ");
2486 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2487 fprintf (vect_dump, "\n");
2488 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2489 }
2490
2491 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2492 return induc_def;
2493 }
2494
2495
2496 /* Function get_initial_def_for_reduction
2497
2498 Input:
2499 STMT - a stmt that performs a reduction operation in the loop.
2500 INIT_VAL - the initial value of the reduction variable
2501
2502 Output:
2503 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2504 of the reduction (used for adjusting the epilog - see below).
2505 Return a vector variable, initialized according to the operation that STMT
2506 performs. This vector will be used as the initial value of the
2507 vector of partial results.
2508
2509 Option1 (adjust in epilog): Initialize the vector as follows:
2510 add: [0,0,...,0,0]
2511 mult: [1,1,...,1,1]
2512 min/max: [init_val,init_val,..,init_val,init_val]
2513 bit and/or: [init_val,init_val,..,init_val,init_val]
2514 and when necessary (e.g. add/mult case) let the caller know
2515 that it needs to adjust the result by init_val.
2516
2517 Option2: Initialize the vector as follows:
2518 add: [0,0,...,0,init_val]
2519 mult: [1,1,...,1,init_val]
2520 min/max: [init_val,init_val,...,init_val]
2521 bit and/or: [init_val,init_val,...,init_val]
2522 and no adjustments are needed.
2523
2524 For example, for the following code:
2525
2526 s = init_val;
2527 for (i=0;i<n;i++)
2528 s = s + a[i];
2529
2530 STMT is 's = s + a[i]', and the reduction variable is 's'.
2531 For a vector of 4 units, we want to return either [0,0,0,init_val],
2532 or [0,0,0,0] and let the caller know that it needs to adjust
2533 the result at the end by 'init_val'.
2534
2535 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2536 initialization vector is simpler (same element in all entries).
2537 A cost model should help decide between these two schemes. */
2538
2539 tree
2540 get_initial_def_for_reduction (gimple stmt, tree init_val, tree *adjustment_def)
2541 {
2542 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2543 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2544 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2545 tree scalar_type = TREE_TYPE (init_val);
2546 tree vectype = get_vectype_for_scalar_type (scalar_type);
2547 int nunits;
2548 enum tree_code code = gimple_assign_rhs_code (stmt);
2549 tree def_for_init;
2550 tree init_def;
2551 tree t = NULL_TREE;
2552 int i;
2553 bool nested_in_vect_loop = false;
2554
2555 gcc_assert (vectype);
2556 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2557
2558 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2559 || SCALAR_FLOAT_TYPE_P (scalar_type));
2560 if (nested_in_vect_loop_p (loop, stmt))
2561 nested_in_vect_loop = true;
2562 else
2563 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2564
2565 switch (code)
2566 {
2567 case WIDEN_SUM_EXPR:
2568 case DOT_PROD_EXPR:
2569 case PLUS_EXPR:
2570 case MINUS_EXPR:
2571 if (nested_in_vect_loop)
2572 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2573 else
2574 *adjustment_def = init_val;
2575 /* Create a vector of zeros for init_def. */
2576 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2577 def_for_init = build_real (scalar_type, dconst0);
2578 else
2579 def_for_init = build_int_cst (scalar_type, 0);
2580
2581 for (i = nunits - 1; i >= 0; --i)
2582 t = tree_cons (NULL_TREE, def_for_init, t);
2583 init_def = build_vector (vectype, t);
2584 break;
2585
2586 case MIN_EXPR:
2587 case MAX_EXPR:
2588 *adjustment_def = NULL_TREE;
2589 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2590 break;
2591
2592 default:
2593 gcc_unreachable ();
2594 }
2595
2596 return init_def;
2597 }
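/* Illustrative example (not in the original source): for
   "s = init_val; for (...) s = s + a[i];" with V4SI, the code above
   returns init_def = [0,0,0,0] and sets *adjustment_def = init_val, so
   the epilogue computes (v[0]+v[1]+v[2]+v[3]) + init_val.  For a MAX
   reduction it instead returns a splat [init_val,init_val,init_val,
   init_val] with no adjustment.  */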
2598
2599
2600 /* Function vect_create_epilog_for_reduction
2601
2602 Create code at the loop-epilog to finalize the result of a reduction
2603 computation.
2604
2605 VECT_DEF is a vector of partial results.
2606 REDUC_CODE is the tree-code for the epilog reduction.
2607 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2608 number of elements that we can fit in a vectype (nunits). In this case
2609 we have to generate more than one vector stmt, i.e., we need to "unroll"
2610 the vector stmt by a factor VF/nunits. For more details see documentation
2611 in vectorizable_operation.
2612 STMT is the scalar reduction stmt that is being vectorized.
2613 REDUCTION_PHI is the phi-node that carries the reduction computation.
2614 REDUC_INDEX is the index of the operand in the right hand side of the
2615 statement that is defined by REDUCTION_PHI.
2616
2617 This function:
2618 1. Creates the reduction def-use cycle: sets the arguments for
2619 REDUCTION_PHI:
2620 The loop-entry argument is the vectorized initial-value of the reduction.
2621 The loop-latch argument is VECT_DEF - the vector of partial sums.
2622 2. "Reduces" the vector of partial results VECT_DEF into a single result,
2623 by applying the operation specified by REDUC_CODE if available, or by
2624 other means (whole-vector shifts or a scalar loop).
2625 The function also creates a new phi node at the loop exit to preserve
2626 loop-closed form, as illustrated below.
2627
2628 The flow at the entry to this function:
2629
2630 loop:
2631 vec_def = phi <null, null> # REDUCTION_PHI
2632 VECT_DEF = vector_stmt # vectorized form of STMT
2633 s_loop = scalar_stmt # (scalar) STMT
2634 loop_exit:
2635 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2636 use <s_out0>
2637 use <s_out0>
2638
2639 The above is transformed by this function into:
2640
2641 loop:
2642 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2643 VECT_DEF = vector_stmt # vectorized form of STMT
2644 s_loop = scalar_stmt # (scalar) STMT
2645 loop_exit:
2646 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2647 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2648 v_out2 = reduce <v_out1>
2649 s_out3 = extract_field <v_out2, 0>
2650 s_out4 = adjust_result <s_out3>
2651 use <s_out4>
2652 use <s_out4>
2653 */
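/* Concrete illustration (not in the original source): for a V4SI sum
   reduction whose final partial-result vector is VECT_DEF =
   [10,20,30,40], v_out2 = reduce <v_out1> yields a vector one lane of
   which holds 100; s_out3 extracts that lane (its position depends on
   endianness), and for a PLUS reduction s_out4 adds the scalar initial
   value that get_initial_def_for_reduction deferred.  */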
2654
2655 static void
2656 vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
2657 int ncopies,
2658 enum tree_code reduc_code,
2659 gimple reduction_phi,
2660 int reduc_index)
2661 {
2662 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2663 stmt_vec_info prev_phi_info;
2664 tree vectype;
2665 enum machine_mode mode;
2666 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2667 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2668 basic_block exit_bb;
2669 tree scalar_dest;
2670 tree scalar_type;
2671 gimple new_phi = NULL, phi;
2672 gimple_stmt_iterator exit_gsi;
2673 tree vec_dest;
2674 tree new_temp = NULL_TREE;
2675 tree new_name;
2676 gimple epilog_stmt = NULL;
2677 tree new_scalar_dest, new_dest;
2678 gimple exit_phi;
2679 tree bitsize, bitpos, bytesize;
2680 enum tree_code code = gimple_assign_rhs_code (stmt);
2681 tree adjustment_def;
2682 tree vec_initial_def, def;
2683 tree orig_name;
2684 imm_use_iterator imm_iter;
2685 use_operand_p use_p;
2686 bool extract_scalar_result = false;
2687 tree reduction_op, expr;
2688 gimple orig_stmt;
2689 gimple use_stmt;
2690 bool nested_in_vect_loop = false;
2691 VEC(gimple,heap) *phis = NULL;
2692 enum vect_def_type dt = vect_unknown_def_type;
2693 int j, i;
2694
2695 if (nested_in_vect_loop_p (loop, stmt))
2696 {
2697 loop = loop->inner;
2698 nested_in_vect_loop = true;
2699 }
2700
2701 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2702 {
2703 case GIMPLE_SINGLE_RHS:
2704 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
2705 == ternary_op);
2706 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
2707 break;
2708 case GIMPLE_UNARY_RHS:
2709 reduction_op = gimple_assign_rhs1 (stmt);
2710 break;
2711 case GIMPLE_BINARY_RHS:
2712 reduction_op = reduc_index ?
2713 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
2714 break;
2715 default:
2716 gcc_unreachable ();
2717 }
2718
2719 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2720 gcc_assert (vectype);
2721 mode = TYPE_MODE (vectype);
2722
2723 /*** 1. Create the reduction def-use cycle ***/
2724
2725 /* For the case of reduction, vect_get_vec_def_for_operand returns
2726 the scalar def before the loop, which defines the initial value
2727 of the reduction variable. */
2728 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2729 &adjustment_def);
2730
2731 phi = reduction_phi;
2732 def = vect_def;
2733 for (j = 0; j < ncopies; j++)
2734 {
2735 /* 1.1 set the loop-entry arg of the reduction-phi: */
2736 add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop));
2737
2738 /* 1.2 set the loop-latch arg for the reduction-phi: */
2739 if (j > 0)
2740 def = vect_get_vec_def_for_stmt_copy (dt, def);
2741 add_phi_arg (phi, def, loop_latch_edge (loop));
2742
2743 if (vect_print_dump_info (REPORT_DETAILS))
2744 {
2745 fprintf (vect_dump, "transform reduction: created def-use cycle: ");
2746 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
2747 fprintf (vect_dump, "\n");
2748 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
2749 }
2750
2751 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
2752 }
2753
2754 /*** 2. Create epilog code
2755 The reduction epilog code operates across the elements of the vector
2756 of partial results computed by the vectorized loop.
2757 The reduction epilog code consists of:
2758 step 1: compute the scalar result in a vector (v_out2)
2759 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2760 step 3: adjust the scalar result (s_out3) if needed.
2761
2762 Step 1 can be accomplished using one of the following three schemes:
2763 (scheme 1) using reduc_code, if available.
2764 (scheme 2) using whole-vector shifts, if available.
2765 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2766 combined.
2767
2768 The overall epilog code looks like this:
2769
2770 s_out0 = phi <s_loop> # original EXIT_PHI
2771 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2772 v_out2 = reduce <v_out1> # step 1
2773 s_out3 = extract_field <v_out2, 0> # step 2
2774 s_out4 = adjust_result <s_out3> # step 3
2775
2776 (step 3 is optional, and steps 1 and 2 may be combined).
2777 Lastly, the uses of s_out0 are replaced by s_out4.
2778
2779 ***/
2780
2781 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2782 v_out1 = phi <v_loop> */
2783
2784 exit_bb = single_exit (loop)->dest;
2785 def = vect_def;
2786 prev_phi_info = NULL;
2787 for (j = 0; j < ncopies; j++)
2788 {
2789 phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2790 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
2791 if (j == 0)
2792 new_phi = phi;
2793 else
2794 {
2795 def = vect_get_vec_def_for_stmt_copy (dt, def);
2796 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
2797 }
2798 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
2799 prev_phi_info = vinfo_for_stmt (phi);
2800 }
2801
2802 exit_gsi = gsi_after_labels (exit_bb);
2803
2804 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2805 (i.e. when reduc_code is not available) and in the final adjustment
2806 code (if needed). Also get the original scalar reduction variable as
2807 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
2808 represents a reduction pattern), the tree-code and scalar-def are
2809 taken from the original stmt that the pattern-stmt (STMT) replaces.
2810 Otherwise (it is a regular reduction) - the tree-code and scalar-def
2811 are taken from STMT. */
2812
2813 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2814 if (!orig_stmt)
2815 {
2816 /* Regular reduction */
2817 orig_stmt = stmt;
2818 }
2819 else
2820 {
2821 /* Reduction pattern */
2822 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2823 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2824 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2825 }
2826
2827 code = gimple_assign_rhs_code (orig_stmt);
2828 scalar_dest = gimple_assign_lhs (orig_stmt);
2829 scalar_type = TREE_TYPE (scalar_dest);
2830 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2831 bitsize = TYPE_SIZE (scalar_type);
2832 bytesize = TYPE_SIZE_UNIT (scalar_type);
2833
2834
2835 /* In case this is a reduction in an inner-loop while vectorizing an outer
2836 loop - we don't need to extract a single scalar result at the end of the
2837 inner-loop. The final vector of partial results will be used in the
2838 vectorized outer-loop, or reduced to a scalar result at the end of the
2839 outer-loop. */
2840 if (nested_in_vect_loop)
2841 goto vect_finalize_reduction;
2842
2843 /* FORNOW */
2844 gcc_assert (ncopies == 1);
2845
2846 /* 2.3 Create the reduction code, using one of the three schemes described
2847 above. */
2848
2849 if (reduc_code != ERROR_MARK)
2850 {
2851 tree tmp;
2852
2853 /*** Case 1: Create:
2854 v_out2 = reduc_expr <v_out1> */
2855
2856 if (vect_print_dump_info (REPORT_DETAILS))
2857 fprintf (vect_dump, "Reduce using direct vector reduction.");
2858
2859 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2860 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
2861 epilog_stmt = gimple_build_assign (vec_dest, tmp);
2862 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2863 gimple_assign_set_lhs (epilog_stmt, new_temp);
2864 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2865
2866 extract_scalar_result = true;
2867 }
2868 else
2869 {
2870 enum tree_code shift_code = ERROR_MARK;
2871 bool have_whole_vector_shift = true;
2872 int bit_offset;
2873 int element_bitsize = tree_low_cst (bitsize, 1);
2874 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2875 tree vec_temp;
2876
2877 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2878 shift_code = VEC_RSHIFT_EXPR;
2879 else
2880 have_whole_vector_shift = false;
2881
2882 /* Regardless of whether we have a whole vector shift, if we're
2883 emulating the operation via tree-vect-generic, we don't want
2884 to use it. Only the first round of the reduction is likely
2885 to still be profitable via emulation. */
2886 /* ??? It might be better to emit a reduction tree code here, so that
2887 tree-vect-generic can expand the first round via bit tricks. */
2888 if (!VECTOR_MODE_P (mode))
2889 have_whole_vector_shift = false;
2890 else
2891 {
2892 optab optab = optab_for_tree_code (code, vectype, optab_default);
2893 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
2894 have_whole_vector_shift = false;
2895 }
2896
2897 if (have_whole_vector_shift)
2898 {
2899 /*** Case 2: Create:
2900 for (offset = VS/2; offset >= element_size; offset/=2)
2901 {
2902 Create: va' = vec_shift <va, offset>
2903 Create: va = vop <va, va'>
2904 } */
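      /* Illustrative trace (not in the original source) for V4SI
	 [a0,a1,a2,a3]:
	 round 1: shift by 2 elements, then add -> [a0+a2, a1+a3, _, _]
	 round 2: shift by 1 element,  then add -> [a0+a1+a2+a3, _, _, _]
	 i.e. log2(4) = 2 shift/add rounds in total; the final sum is then
	 extracted from a single lane (which lane depends on the target's
	 VEC_RSHIFT semantics and endianness).  */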
2905
2906 if (vect_print_dump_info (REPORT_DETAILS))
2907 fprintf (vect_dump, "Reduce using vector shifts");
2908
2909 vec_dest = vect_create_destination_var (scalar_dest, vectype);
2910 new_temp = PHI_RESULT (new_phi);
2911
2912 for (bit_offset = vec_size_in_bits/2;
2913 bit_offset >= element_bitsize;
2914 bit_offset /= 2)
2915 {
2916 tree bitpos = size_int (bit_offset);
2917 epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
2918 new_temp, bitpos);
2919 new_name = make_ssa_name (vec_dest, epilog_stmt);
2920 gimple_assign_set_lhs (epilog_stmt, new_name);
2921 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2922
2923 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
2924 new_name, new_temp);
2925 new_temp = make_ssa_name (vec_dest, epilog_stmt);
2926 gimple_assign_set_lhs (epilog_stmt, new_temp);
2927 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2928 }
2929
2930 extract_scalar_result = true;
2931 }
2932 else
2933 {
2934 tree rhs;
2935
2936 /*** Case 3: Create:
2937 s = extract_field <v_out2, 0>
2938 for (offset = element_size;
2939 offset < vector_size;
2940 offset += element_size;)
2941 {
2942 Create: s' = extract_field <v_out2, offset>
2943 Create: s = op <s, s'>
2944 } */
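	  /* Illustrative trace (not in the original source) for V4SI
	     [a0,a1,a2,a3]: s = a0, then three extract/op rounds give
	     s = ((a0 op a1) op a2) op a3 -- nelements extracts and
	     nelements - 1 scalar ops, matching the cost model above.  */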
2945
2946 if (vect_print_dump_info (REPORT_DETAILS))
2947 fprintf (vect_dump, "Reduce using scalar code. ");
2948
2949 vec_temp = PHI_RESULT (new_phi);
2950 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2951 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2952 bitsize_zero_node);
2953 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2954 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2955 gimple_assign_set_lhs (epilog_stmt, new_temp);
2956 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2957
2958 for (bit_offset = element_bitsize;
2959 bit_offset < vec_size_in_bits;
2960 bit_offset += element_bitsize)
2961 {
2962 tree bitpos = bitsize_int (bit_offset);
2963 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
2964 bitpos);
2965
2966 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
2967 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
2968 gimple_assign_set_lhs (epilog_stmt, new_name);
2969 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2970
2971 epilog_stmt = gimple_build_assign_with_ops (code,
2972 new_scalar_dest,
2973 new_name, new_temp);
2974 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
2975 gimple_assign_set_lhs (epilog_stmt, new_temp);
2976 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
2977 }
2978
2979 extract_scalar_result = false;
2980 }
2981 }
2982
2983 /* 2.4 Extract the final scalar result. Create:
2984 s_out3 = extract_field <v_out2, bitpos> */
2985
2986 if (extract_scalar_result)
2987 {
2988 tree rhs;
2989
2990 gcc_assert (!nested_in_vect_loop);
2991 if (vect_print_dump_info (REPORT_DETAILS))
2992 fprintf (vect_dump, "extract scalar result");
2993
2994 if (BYTES_BIG_ENDIAN)
2995 bitpos = size_binop (MULT_EXPR,
2996 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
2997 TYPE_SIZE (scalar_type));
2998 else
2999 bitpos = bitsize_zero_node;
3000
3001 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3002 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3003 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3004 gimple_assign_set_lhs (epilog_stmt, new_temp);
3005 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3006 }
3007
3008 vect_finalize_reduction:
3009
3010 /* 2.5 Adjust the final result by the initial value of the reduction
3011 variable. (When such adjustment is not needed, then
3012 'adjustment_def' is zero). For example, if code is PLUS we create:
3013 new_temp = loop_exit_def + adjustment_def */
3014
3015 if (adjustment_def)
3016 {
3017 if (nested_in_vect_loop)
3018 {
3019 /* For MINUS_EXPR we create new_temp = loop_exit_def + adjustment_def
3020 since the initial value is [0,0,...,0]. */
3021 if (code == MINUS_EXPR)
3022 code = PLUS_EXPR;
3023
3024 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3025 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3026 new_dest = vect_create_destination_var (scalar_dest, vectype);
3027 }
3028 else
3029 {
3030 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3031 expr = build2 (code, scalar_type, new_temp, adjustment_def);
3032 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3033 }
3034
3035 epilog_stmt = gimple_build_assign (new_dest, expr);
3036 new_temp = make_ssa_name (new_dest, epilog_stmt);
3037 gimple_assign_set_lhs (epilog_stmt, new_temp);
3038 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3039 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3040 }
3041
3042
3043 /* 2.6 Handle the loop-exit phi */
3044
3045 /* Replace uses of s_out0 with uses of s_out3:
3046 Find the loop-closed-use at the loop exit of the original scalar result.
3047 (The reduction result is expected to have two immediate uses - one at the
3048 latch block, and one at the loop exit). */
3049 phis = VEC_alloc (gimple, heap, 10);
3050 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3051 {
3052 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3053 {
3054 exit_phi = USE_STMT (use_p);
3055 VEC_quick_push (gimple, phis, exit_phi);
3056 }
3057 }
3058 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
3059 gcc_assert (!VEC_empty (gimple, phis));
3060
3061 for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
3062 {
3063 if (nested_in_vect_loop)
3064 {
3065 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3066
3067 /* FORNOW. Currently not supporting the case that an inner-loop
3068 reduction is not used in the outer-loop (but only outside the
3069 outer-loop). */
3070 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3071 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3072
3073 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
3074 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
3075 set_vinfo_for_stmt (epilog_stmt,
3076 new_stmt_vec_info (epilog_stmt, loop_vinfo,
3077 NULL));
3078 if (adjustment_def)
3079 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3080 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3081 continue;
3082 }
3083
3084 /* Replace the uses: */
3085 orig_name = PHI_RESULT (exit_phi);
3086 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3087 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3088 SET_USE (use_p, new_temp);
3089 }
3090 VEC_free (gimple, heap, phis);
3091 }
3092
3093
3094 /* Function vectorizable_reduction.
3095
3096 Check if STMT performs a reduction operation that can be vectorized.
3097 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3098 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3099 Return FALSE if not a vectorizable STMT, TRUE otherwise.
3100
3101 This function also handles reduction idioms (patterns) that have been
3102 recognized in advance during vect_pattern_recog. In this case, STMT may be
3103 of this form:
3104 X = pattern_expr (arg0, arg1, ..., X)
3105 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
3106 sequence that had been detected and replaced by the pattern-stmt (STMT).
3107
3108 In some cases of reduction patterns, the type of the reduction variable X is
3109 different from the type of the other arguments of STMT.
3110 In such cases, the vectype that is used when transforming STMT into a vector
3111 stmt is different from the vectype that is used to determine the
3112 vectorization factor, because it consists of a different number of elements
3113 than the actual number of elements that are being operated upon in parallel.
3114
3115 For example, consider an accumulation of shorts into an int accumulator.
3116 On some targets it's possible to vectorize this pattern operating on 8
3117 shorts at a time (hence, the vectype for purposes of determining the
3118 vectorization factor should be V8HI); on the other hand, the vectype that
3119 is used to create the vector form is actually V4SI (the type of the result).
3120
3121 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3122 indicates what is the actual level of parallelism (V8HI in the example), so
3123 that the right vectorization factor would be derived. This vectype
3124 corresponds to the type of arguments to the reduction stmt, and should *NOT*
3125 be used to create the vectorized stmt. The right vectype for the vectorized
3126 stmt is obtained from the type of the result X:
3127 get_vectype_for_scalar_type (TREE_TYPE (X))
3128
3129 This means that, contrary to "regular" reductions (or "regular" stmts in
3130 general), the following equation:
3131 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3132 does *NOT* necessarily hold for reduction patterns. */
3133
3134 bool
3135 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
3136 gimple *vec_stmt)
3137 {
3138 tree vec_dest;
3139 tree scalar_dest;
3140 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
3141 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3142 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3143 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3144 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3145 enum tree_code code, orig_code, epilog_reduc_code;
3146 enum machine_mode vec_mode;
3147 int op_type;
3148 optab optab, reduc_optab;
3149 tree new_temp = NULL_TREE;
3150 tree def;
3151 gimple def_stmt;
3152 enum vect_def_type dt;
3153 gimple new_phi = NULL;
3154 tree scalar_type;
3155 bool is_simple_use;
3156 gimple orig_stmt;
3157 stmt_vec_info orig_stmt_info;
3158 tree expr = NULL_TREE;
3159 int i;
3160 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3161 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3162 int epilog_copies;
3163 stmt_vec_info prev_stmt_info, prev_phi_info;
3164 gimple first_phi = NULL;
3165 bool single_defuse_cycle = false;
3166 tree reduc_def;
3167 gimple new_stmt = NULL;
3168 int j;
3169 tree ops[3];
3170 bool nested_cycle = false, found_nested_cycle_def = false;
3171 gimple reduc_def_stmt = NULL;
3172 /* By default, the reduction variable is the last operand in the statement. */
3173 int reduc_index = 2;
3174
3175 if (nested_in_vect_loop_p (loop, stmt))
3176 {
3177 loop = loop->inner;
3178 nested_cycle = true;
3179 }
3180
3181 gcc_assert (ncopies >= 1);
3182
3183 /* FORNOW: SLP not supported. */
3184 if (STMT_SLP_TYPE (stmt_info))
3185 return false;
3186
3187 /* 1. Is vectorizable reduction? */
3188
3189 /* Not supportable if the reduction variable is used in the loop. */
3190 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
3191 return false;
3192
3193 /* Reductions that are not used even in an enclosing outer-loop
3194 are expected to be "live" (used outside the loop). */
3195 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
3196 && !STMT_VINFO_LIVE_P (stmt_info))
3197 return false;
3198
3199 /* Make sure it was already recognized as a reduction computation. */
3200 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
3201 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
3202 return false;
3203
3204 /* 2. Has this been recognized as a reduction pattern?
3205
3206 Check if STMT represents a pattern that has been recognized
3207 in earlier analysis stages. For stmts that represent a pattern,
3208 the STMT_VINFO_RELATED_STMT field records the last stmt in
3209 the original sequence that constitutes the pattern. */
3210
3211 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3212 if (orig_stmt)
3213 {
3214 orig_stmt_info = vinfo_for_stmt (orig_stmt);
3215 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
3216 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
3217 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
3218 }
3219
3220 /* 3. Check the operands of the operation. The first operands are defined
3221 inside the loop body. The last operand is the reduction variable,
3222 which is defined by the loop-header-phi. */
3223
3224 gcc_assert (is_gimple_assign (stmt));
3225
3226 /* Flatten RHS */
3227 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3228 {
3229 case GIMPLE_SINGLE_RHS:
3230 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
3231 if (op_type == ternary_op)
3232 {
3233 tree rhs = gimple_assign_rhs1 (stmt);
3234 ops[0] = TREE_OPERAND (rhs, 0);
3235 ops[1] = TREE_OPERAND (rhs, 1);
3236 ops[2] = TREE_OPERAND (rhs, 2);
3237 code = TREE_CODE (rhs);
3238 }
3239 else
3240 return false;
3241 break;
3242
3243 case GIMPLE_BINARY_RHS:
3244 code = gimple_assign_rhs_code (stmt);
3245 op_type = TREE_CODE_LENGTH (code);
3246 gcc_assert (op_type == binary_op);
3247 ops[0] = gimple_assign_rhs1 (stmt);
3248 ops[1] = gimple_assign_rhs2 (stmt);
3249 break;
3250
3251 case GIMPLE_UNARY_RHS:
3252 return false;
3253
3254 default:
3255 gcc_unreachable ();
3256 }
3257
3258 scalar_dest = gimple_assign_lhs (stmt);
3259 scalar_type = TREE_TYPE (scalar_dest);
3260 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
3261 && !SCALAR_FLOAT_TYPE_P (scalar_type))
3262 return false;
3263
3264 /* All uses but the last are expected to be defined in the loop.
3265 The last use is the reduction variable. In case of a nested cycle this
3266 assumption is not true: we use reduc_index to record the index of the
3267 reduction variable. */
3268 for (i = 0; i < op_type-1; i++)
3269 {
3270 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
3271 &def, &dt);
3272 gcc_assert (is_simple_use);
3273 if (dt != vect_internal_def
3274 && dt != vect_external_def
3275 && dt != vect_constant_def
3276 && dt != vect_induction_def
3277 && dt != vect_nested_cycle)
3278 return false;
3279
3280 if (dt == vect_nested_cycle)
3281 {
3282 found_nested_cycle_def = true;
3283 reduc_def_stmt = def_stmt;
3284 reduc_index = i;
3285 }
3286 }
3287
3288 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
3289 &def, &dt);
3290 gcc_assert (is_simple_use);
3291 gcc_assert (dt == vect_reduction_def
3292 || dt == vect_nested_cycle
3293 || ((dt == vect_internal_def || dt == vect_external_def
3294 || dt == vect_constant_def || dt == vect_induction_def)
3295 && nested_cycle && found_nested_cycle_def));
3296 if (!found_nested_cycle_def)
3297 reduc_def_stmt = def_stmt;
3298
3299 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
3300 if (orig_stmt)
3301 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
3302 reduc_def_stmt,
3303 !nested_cycle));
3304 else
3305 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
3306 !nested_cycle));
3307
3308 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
3309 return false;
3310
3311 /* 4. Supportable by target? */
3312
3313 /* 4.1. check support for the operation in the loop */
3314 optab = optab_for_tree_code (code, vectype, optab_default);
3315 if (!optab)
3316 {
3317 if (vect_print_dump_info (REPORT_DETAILS))
3318 fprintf (vect_dump, "no optab.");
3319 return false;
3320 }
3321 vec_mode = TYPE_MODE (vectype);
3322 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
3323 {
3324 if (vect_print_dump_info (REPORT_DETAILS))
3325 fprintf (vect_dump, "op not supported by target.");
3326 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3327 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3328 < vect_min_worthwhile_factor (code))
3329 return false;
3330 if (vect_print_dump_info (REPORT_DETAILS))
3331 fprintf (vect_dump, "proceeding using word mode.");
3332 }
3333
3334 /* Worthwhile without SIMD support? */
3335 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
3336 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3337 < vect_min_worthwhile_factor (code))
3338 {
3339 if (vect_print_dump_info (REPORT_DETAILS))
3340 fprintf (vect_dump, "not worthwhile without SIMD support.");
3341 return false;
3342 }
3343
3344 /* 4.2. Check support for the epilog operation.
3345
3346 If STMT represents a reduction pattern, then the type of the
3347 reduction variable may be different from the type of the rest
3348 of the arguments. For example, consider the case of accumulation
3349 of shorts into an int accumulator; The original code:
3350 S1: int_a = (int) short_a;
3351 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
3352
3353 was replaced with:
3354 STMT: int_acc = widen_sum <short_a, int_acc>
3355
3356 This means that:
3357 1. The tree-code that is used to create the vector operation in the
3358 epilog code (that reduces the partial results) is not the
3359 tree-code of STMT, but is rather the tree-code of the original
3360 stmt from the pattern that STMT is replacing. I.e, in the example
3361 above we want to use 'widen_sum' in the loop, but 'plus' in the
3362 epilog.
3363 2. The type (mode) we use to check available target support
3364 for the vector operation to be created in the *epilog*, is
3365 determined by the type of the reduction variable (in the example
3366 above we'd check this: plus_optab[vect_int_mode]).
3367 However the type (mode) we use to check available target support
3368 for the vector operation to be created *inside the loop*, is
3369 determined by the type of the other arguments to STMT (in the
3370 example we'd check this: widen_sum_optab[vect_short_mode]).
3371
3372 This is contrary to "regular" reductions, in which the types of all
3373 the arguments are the same as the type of the reduction variable.
3374 For "regular" reductions we can therefore use the same vector type
3375 (and also the same tree-code) when generating the epilog code and
3376 when generating the code inside the loop. */
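
  /* Schematically, the two checks for the widen_sum example above (the
     vector modes are illustrative):
        - inside the loop:  vect_acc = WIDEN_SUM_EXPR <vect_short_a, vect_acc>
          is checked via widen_sum_optab on the short vector mode;
        - in the epilog: the partial sums in vect_acc are reduced using the
          reduction code derived from PLUS_EXPR, checked on the int vector
          mode.  */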

  if (orig_stmt)
    {
      /* This is a reduction pattern: get the vectype from the type of the
         reduction variable, and get the tree-code from orig_stmt.  */
      orig_code = gimple_assign_rhs_code (orig_stmt);
      vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
      if (!vectype)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "unsupported data-type ");
              print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
            }
          return false;
        }

      vec_mode = TYPE_MODE (vectype);
    }
  else
    {
      /* Regular reduction: the same vectype and tree-code that are used
         for the vector code inside the loop can also be used for the
         epilog code.  */
      orig_code = code;
    }

  if (nested_cycle)
    epilog_reduc_code = orig_code;
  else if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
    return false;

  reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype,
                                     optab_default);
  if (!reduc_optab)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "no optab for reduction.");
      epilog_reduc_code = ERROR_MARK;
    }

  /* REDUC_OPTAB may be null at this point; querying its handler without
     the guard below would dereference a null pointer.  */
  if (reduc_optab
      && optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "reduc op not supported by target.");
      epilog_reduc_code = ERROR_MARK;
    }

  if (!vec_stmt) /* transformation not required.  */
    {
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
      if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
        return false;
      return true;
    }

  /** Transform.  **/

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "transform reduction.");

  /* Create the destination vector.  */
  vec_dest = vect_create_destination_var (scalar_dest, vectype);

  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt, i.e., we need to "unroll" the vector stmt by
     a factor VF/nunits.  For more details see the documentation in
     vectorizable_operation.  */

  /* If the reduction is used in an outer loop we need to generate
     VF intermediate results, like so (e.g. for ncopies=2):
        r0 = phi (init, r0)
        r1 = phi (init, r1)
        r0 = x0 + r0;
        r1 = x1 + r1;
     (i.e. we generate VF results in 2 registers).
     In this case we have a separate def-use cycle for each copy, and
     therefore for each copy we get the vector def for the reduction
     variable from the respective phi node created for this copy.

     Otherwise (the reduction is unused in the loop nest), we can combine
     together intermediate results, like so (e.g. for ncopies=2):
        r = phi (init, r)
        r = x0 + r;
        r = x1 + r;
     (i.e. we generate VF/2 results in a single register).
     In this case for each copy we get the vector def for the reduction
     variable from the vectorized reduction operation generated in the
     previous iteration.  */

  if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
    {
      single_defuse_cycle = true;
      epilog_copies = 1;
    }
  else
    epilog_copies = ncopies;

  prev_stmt_info = NULL;
  prev_phi_info = NULL;
  for (j = 0; j < ncopies; j++)
    {
      if (j == 0 || !single_defuse_cycle)
        {
          /* Create the reduction-phi that defines the reduction-operand.  */
          new_phi = create_phi_node (vec_dest, loop->header);
          set_vinfo_for_stmt (new_phi,
                              new_stmt_vec_info (new_phi, loop_vinfo, NULL));
        }

      /* Handle uses.  */
      if (j == 0)
        {
          loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
                                                        stmt, NULL);
          if (op_type == ternary_op)
            {
              if (reduc_index == 0)
                loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
                                                              NULL);
              else
                loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
                                                              NULL);
            }

          /* Get the vector def for the reduction variable from the phi
             node.  */
          reduc_def = PHI_RESULT (new_phi);
          first_phi = new_phi;
        }
      else
        {
          enum vect_def_type dt = vect_unknown_def_type;  /* Dummy.  */
          loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
          if (op_type == ternary_op)
            loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
                                                            loop_vec_def1);

          if (single_defuse_cycle)
            reduc_def = gimple_assign_lhs (new_stmt);
          else
            reduc_def = PHI_RESULT (new_phi);

          STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
        }

      /* Arguments are ready.  Create the new vector stmt.  */
      if (op_type == binary_op)
        {
          if (reduc_index == 0)
            expr = build2 (code, vectype, reduc_def, loop_vec_def0);
          else
            expr = build2 (code, vectype, loop_vec_def0, reduc_def);
        }
      else
        {
          if (reduc_index == 0)
            expr = build3 (code, vectype, reduc_def, loop_vec_def0,
                           loop_vec_def1);
          else if (reduc_index == 1)
            expr = build3 (code, vectype, loop_vec_def0, reduc_def,
                           loop_vec_def1);
          else
            expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
                           reduc_def);
        }

      new_stmt = gimple_build_assign (vec_dest, expr);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_assign_set_lhs (new_stmt, new_temp);
      vect_finish_stmt_generation (stmt, new_stmt, gsi);

      if (j == 0)
        STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
      else
        STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
      prev_stmt_info = vinfo_for_stmt (new_stmt);
      prev_phi_info = vinfo_for_stmt (new_phi);
    }

  /* Finalize the reduction-phi (set its arguments) and create the
     epilog reduction code.  */
  if (!single_defuse_cycle)
    new_temp = gimple_assign_lhs (*vec_stmt);
  vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
                                    epilog_reduc_code, first_phi,
                                    reduc_index);
  return true;
}

/* Function vect_min_worthwhile_factor.

   For a loop where we could vectorize the operation indicated by CODE,
   return the minimum vectorization factor that makes it worthwhile
   to use generic vectors.  */
int
vect_min_worthwhile_factor (enum tree_code code)
{
  switch (code)
    {
    case PLUS_EXPR:
    case MINUS_EXPR:
    case NEGATE_EXPR:
      return 4;

    case BIT_AND_EXPR:
    case BIT_IOR_EXPR:
    case BIT_XOR_EXPR:
    case BIT_NOT_EXPR:
      return 2;

    default:
      return INT_MAX;
    }
}
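
/* For example, the callers above treat a loop containing only PLUS_EXPR
   additions as worthwhile for generic (word-mode) vectors only when at
   least four elements are combined per operation; with a vectorization
   factor of 2 such a loop is left scalar.  */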


/* Function vectorizable_induction.

   Check if PHI performs an induction computation that can be vectorized.
   If VEC_STMT is also passed, vectorize the induction PHI: create a
   vectorized phi to replace it, put it in VEC_STMT, and add it to the
   same basic block.  Return FALSE if not a vectorizable STMT, TRUE
   otherwise.  */
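
/* For instance (a sketch of the transformation, assuming nunits == 4 and
   a step of 1; the SSA names are invented):

     scalar:  i_1 = PHI <0 (preheader), i_2 (latch)>
              i_2 = i_1 + 1;

     vector:  vi_1 = PHI <{0,1,2,3} (preheader), vi_2 (latch)>
              vi_2 = vi_1 + {4,4,4,4};  */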

bool
vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
                        gimple *vec_stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (phi);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
  tree vec_def;

  gcc_assert (ncopies >= 1);
  /* FORNOW.  This restriction should be relaxed.  */
  if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "multiple types in nested loop.");
      return false;
    }

  if (!STMT_VINFO_RELEVANT_P (stmt_info))
    return false;

  /* FORNOW: SLP not supported.  */
  if (STMT_SLP_TYPE (stmt_info))
    return false;

  gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);

  if (gimple_code (phi) != GIMPLE_PHI)
    return false;

  if (!vec_stmt) /* transformation not required.  */
    {
      STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "=== vectorizable_induction ===");
      vect_model_induction_cost (stmt_info, ncopies);
      return true;
    }

  /** Transform.  **/

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "transform induction phi.");

  vec_def = get_initial_def_for_induction (phi);
  *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
  return true;
}

/* Function vectorizable_live_operation.

   STMT computes a value that is used outside the loop.  Check if
   it can be supported.  */
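
/* For example (illustrative): a stmt like k_1 = n_5 + 1 inside the loop,
   where n_5 is loop-invariant and k_1 is only used after the loop, can
   simply remain in place unvectorized; the last scalar value it computes
   is used directly.  */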

bool
vectorizable_live_operation (gimple stmt,
                             gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
                             gimple *vec_stmt ATTRIBUTE_UNUSED)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int i;
  int op_type;
  tree op;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt;
  enum tree_code code;
  enum gimple_rhs_class rhs_class;

  gcc_assert (STMT_VINFO_LIVE_P (stmt_info));

  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
    return false;

  if (!is_gimple_assign (stmt))
    return false;

  if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
    return false;

  /* FORNOW.  CHECKME.  */
  if (nested_in_vect_loop_p (loop, stmt))
    return false;

  code = gimple_assign_rhs_code (stmt);
  op_type = TREE_CODE_LENGTH (code);
  rhs_class = get_gimple_rhs_class (code);
  gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
  gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);

  /* FORNOW: support only if all uses are invariant.  This means
     that the scalar operations can remain in place, unvectorized.
     The original last scalar value that they compute will be used.  */

  for (i = 0; i < op_type; i++)
    {
      if (rhs_class == GIMPLE_SINGLE_RHS)
        op = TREE_OPERAND (gimple_op (stmt, 1), i);
      else
        op = gimple_op (stmt, i + 1);
      if (op
          && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "use not simple.");
          return false;
        }

      if (dt != vect_external_def && dt != vect_constant_def)
        return false;
    }

  /* No transformation is required for the cases we currently support.  */
  return true;
}

/* Function vect_transform_loop.

   The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop: create vectorized stmts to replace the scalar
   stmts in the loop, and update the loop exit condition.  */

void
vect_transform_loop (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  int i;
  tree ratio = NULL;
  int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  bool strided_store;
  bool slp_scheduled = false;
  unsigned int nunits;
  tree cond_expr = NULL_TREE;
  gimple_seq cond_expr_stmt_list = NULL;
  bool do_peeling_for_loop_bound;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vec_transform_loop ===");

  /* Peel the loop if there are data refs with unknown alignment.
     Only one store data-ref with unknown alignment is allowed.  */

  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    vect_do_peeling_for_alignment (loop_vinfo);

  do_peeling_for_loop_bound
    = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
       || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
           && LOOP_VINFO_INT_NITERS (loop_vinfo)
              % vectorization_factor != 0));

  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
      || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
    vect_loop_versioning (loop_vinfo,
                          !do_peeling_for_loop_bound,
                          &cond_expr, &cond_expr_stmt_list);

  /* If the loop has a symbolic number of iterations 'n' (i.e. it is not a
     compile-time constant), or it is a constant that is not a multiple of
     the vectorization factor, then an epilog loop needs to be created.
     We therefore duplicate the loop: the original loop will be vectorized,
     and will compute the first (n/VF) iterations.  The second copy of the
     loop will remain scalar and will compute the remaining (n%VF)
     iterations.  (VF is the vectorization factor.)  */
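
  /* E.g. (numbers purely illustrative): for n == 103 and VF == 4, the
     vectorized loop executes 103/4 == 25 iterations and the scalar epilog
     loop executes the remaining 103%4 == 3 iterations.  */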

  if (do_peeling_for_loop_bound)
    vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
                                    cond_expr, cond_expr_stmt_list);
  else
    ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
                           LOOP_VINFO_INT_NITERS (loop_vinfo)
                           / vectorization_factor);

  /* 1) Make sure the loop header has exactly two entries.
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  /* FORNOW: the vectorizer supports only loops whose body consists of one
     basic block (header + empty latch).  When the vectorizer supports more
     involved loop forms, the order in which the BBs are traversed needs
     to be reconsidered.  */

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      stmt_vec_info stmt_info;
      gimple phi;

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "------>vectorizing phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }
          stmt_info = vinfo_for_stmt (phi);
          if (!stmt_info)
            continue;

          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            continue;

          if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
               != (unsigned HOST_WIDE_INT) vectorization_factor)
              && vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "multiple-types.");

          if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "transform phi.");
              vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si);)
        {
          gimple stmt = gsi_stmt (si);
          bool is_store;

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "------>vectorizing statement: ");
              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }

          stmt_info = vinfo_for_stmt (stmt);

          /* vector stmts created in the outer-loop during vectorization of
             stmts in an inner-loop may not have a stmt_info, and do not
             need to be vectorized.  */
          if (!stmt_info)
            {
              gsi_next (&si);
              continue;
            }

          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              gsi_next (&si);
              continue;
            }

          gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
          nunits = (unsigned int)
            TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
          if (!STMT_SLP_TYPE (stmt_info)
              && nunits != (unsigned int) vectorization_factor
              && vect_print_dump_info (REPORT_DETAILS))
            /* For SLP, VF is set according to the unrolling factor, and
               not to the vector size, hence for SLP this print is not
               valid.  */
            fprintf (vect_dump, "multiple-types.");

          /* SLP.  Schedule all the SLP instances when the first SLP stmt
             is reached.  */
          if (STMT_SLP_TYPE (stmt_info))
            {
              if (!slp_scheduled)
                {
                  slp_scheduled = true;

                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "=== scheduling SLP instances ===");

                  vect_schedule_slp (loop_vinfo, NULL);
                }

              /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
              if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
                {
                  gsi_next (&si);
                  continue;
                }
            }

          /* -------- vectorize statement ------------ */
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "transform statement.");

          strided_store = false;
          is_store = vect_transform_stmt (stmt, &si, &strided_store,
                                          NULL, NULL);
          if (is_store)
            {
              if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
                {
                  /* Interleaving.  If IS_STORE is TRUE, the vectorization
                     of the interleaving chain was completed - free all the
                     stores in the chain.  */
                  vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
                  gsi_remove (&si, true);
                  continue;
                }
              else
                {
                  /* Free the attached stmt_vec_info and remove the stmt.  */
                  free_stmt_vec_info (stmt);
                  gsi_remove (&si, true);
                  continue;
                }
            }
          gsi_next (&si);
        } /* stmts in BB */
    } /* BBs in loop */

  slpeel_make_loop_iterate_ntimes (loop, ratio);

  /* The memory tags and pointers in vectorized statements need to
     have their SSA forms updated.  FIXME: why can't this be delayed
     until all the loops have been transformed?  */
  update_ssa (TODO_update_ssa);

  if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
    fprintf (vect_dump, "LOOP VECTORIZED.");
  if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
    fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
}