1 /* Loop Vectorization
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "basic-block.h"
30 #include "gimple-pretty-print.h"
31 #include "tree-flow.h"
32 #include "tree-pass.h"
33 #include "cfgloop.h"
34 #include "expr.h"
35 #include "recog.h"
36 #include "optabs.h"
37 #include "params.h"
38 #include "diagnostic-core.h"
39 #include "tree-chrec.h"
40 #include "tree-scalar-evolution.h"
41 #include "tree-vectorizer.h"
42 #include "target.h"
43
44 /* Loop Vectorization Pass.
45
46 This pass tries to vectorize loops.
47
48 For example, the vectorizer transforms the following simple loop:
49
50 short a[N]; short b[N]; short c[N]; int i;
51
52 for (i=0; i<N; i++){
53 a[i] = b[i] + c[i];
54 }
55
56 as if it were manually vectorized by rewriting the source code into:
57
58 typedef int __attribute__((mode(V8HI))) v8hi;
59 short a[N]; short b[N]; short c[N]; int i;
60 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
61 v8hi va, vb, vc;
62
63 for (i=0; i<N/8; i++){
64 vb = pb[i];
65 vc = pc[i];
66 va = vb + vc;
67 pa[i] = va;
68 }
69
70 The main entry to this pass is vectorize_loops(), in which
71 the vectorizer applies a set of analyses on a given set of loops,
72 followed by the actual vectorization transformation for the loops that
73 had successfully passed the analysis phase.
74 Throughout this pass we make a distinction between two types of
75 data: scalars (which are represented by SSA_NAMES), and memory references
76 ("data-refs"). These two types of data require different handling both
77 during analysis and transformation. The types of data-refs that the
78 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
79 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
80 accesses are required to have a simple (consecutive) access pattern.
81
82 Analysis phase:
83 ===============
84 The driver for the analysis phase is vect_analyze_loop().
85 It applies a set of analyses, some of which rely on the scalar evolution
86 analyzer (scev) developed by Sebastian Pop.
87
88 During the analysis phase the vectorizer records some information
89 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
90 loop, as well as general information about the loop as a whole, which is
91 recorded in a "loop_vec_info" struct attached to each loop.
92
93 Transformation phase:
94 =====================
95 The loop transformation phase scans all the stmts in the loop, and
96 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
97 the loop that needs to be vectorized. It inserts the vector code sequence
98 just before the scalar stmt S, and records a pointer to the vector code
99 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
100 attached to S). This pointer will be used for the vectorization of following
101 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
102 otherwise, we rely on dead code elimination for removing it.
103
104 For example, say stmt S1 was vectorized into stmt VS1:
105
106 VS1: vb = px[i];
107 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
108 S2: a = b;
109
110 To vectorize stmt S2, the vectorizer first finds the stmt that defines
111 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
112 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
113 resulting sequence would be:
114
115 VS1: vb = px[i];
116 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
117 VS2: va = vb;
118 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
119
120 Operands that are not SSA_NAMEs are data-refs that appear in
121 load/store operations (like 'x[i]' in S1), and are handled differently.
122
123 Target modeling:
124 =================
125 Currently the only target specific information that is used is the
126 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
127 Targets that can support different sizes of vectors will, for now, need
128 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
129 flexibility will be added in the future.
130
131 Since we only vectorize operations whose vector form can be
132 expressed using existing tree codes, to verify that an operation is
133 supported, the vectorizer checks the relevant optab at the relevant
134 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
135 the value found is CODE_FOR_nothing, then there's no target support, and
136 we can't vectorize the stmt.
137
138 For additional information on this project see:
139 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
140 */
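
/* Illustrative sketch only, not used by the pass: the target-support check
   described above amounts to querying the optab for a given vector mode and
   comparing against CODE_FOR_nothing.  The helper name is hypothetical.  */

static inline bool
example_target_supports_v8hi_add (void)
{
  return optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;
}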
141
142 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
143
144 /* Function vect_determine_vectorization_factor
145
146 Determine the vectorization factor (VF). VF is the number of data elements
147 that are operated upon in parallel in a single iteration of the vectorized
148 loop. For example, when vectorizing a loop that operates on 4-byte elements,
149 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
150 elements can fit in a single vector register.
151
152 We currently support vectorization of loops in which all types operated upon
153 are of the same size. Therefore this function currently sets VF according to
154 the size of the types operated upon, and fails if there are multiple sizes
155 in the loop.
156
157 VF is also the factor by which the loop iterations are strip-mined, e.g.:
158 original loop:
159 for (i=0; i<N; i++){
160 a[i] = b[i] + c[i];
161 }
162
163 vectorized loop:
164 for (i=0; i<N; i+=VF){
165 a[i:VF] = b[i:VF] + c[i:VF];
166 }
167 */
168
169 static bool
170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
171 {
172 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
173 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
174 int nbbs = loop->num_nodes;
175 gimple_stmt_iterator si;
176 unsigned int vectorization_factor = 0;
177 tree scalar_type;
178 gimple phi;
179 tree vectype;
180 unsigned int nunits;
181 stmt_vec_info stmt_info;
182 int i;
183 HOST_WIDE_INT dummy;
184 gimple stmt, pattern_stmt = NULL;
185 gimple_seq pattern_def_seq = NULL;
186 gimple_stmt_iterator pattern_def_si = gsi_none ();
187 bool analyze_pattern_stmt = false;
188
189 if (dump_enabled_p ())
190 dump_printf_loc (MSG_NOTE, vect_location,
191 "=== vect_determine_vectorization_factor ===");
192
193 for (i = 0; i < nbbs; i++)
194 {
195 basic_block bb = bbs[i];
196
197 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
198 {
199 phi = gsi_stmt (si);
200 stmt_info = vinfo_for_stmt (phi);
201 if (dump_enabled_p ())
202 {
203 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
204 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
205 }
206
207 gcc_assert (stmt_info);
208
209 if (STMT_VINFO_RELEVANT_P (stmt_info))
210 {
211 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
212 scalar_type = TREE_TYPE (PHI_RESULT (phi));
213
214 if (dump_enabled_p ())
215 {
216 dump_printf_loc (MSG_NOTE, vect_location,
217 "get vectype for scalar type: ");
218 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
219 }
220
221 vectype = get_vectype_for_scalar_type (scalar_type);
222 if (!vectype)
223 {
224 if (dump_enabled_p ())
225 {
226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
227 "not vectorized: unsupported "
228 "data-type ");
229 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
230 scalar_type);
231 }
232 return false;
233 }
234 STMT_VINFO_VECTYPE (stmt_info) = vectype;
235
236 if (dump_enabled_p ())
237 {
238 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
239 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
240 }
241
242 nunits = TYPE_VECTOR_SUBPARTS (vectype);
243 if (dump_enabled_p ())
244 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
245
246 if (!vectorization_factor
247 || (nunits > vectorization_factor))
248 vectorization_factor = nunits;
249 }
250 }
251
252 for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
253 {
254 tree vf_vectype;
255
256 if (analyze_pattern_stmt)
257 stmt = pattern_stmt;
258 else
259 stmt = gsi_stmt (si);
260
261 stmt_info = vinfo_for_stmt (stmt);
262
263 if (dump_enabled_p ())
264 {
265 dump_printf_loc (MSG_NOTE, vect_location,
266 "==> examining statement: ");
267 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
268 }
269
270 gcc_assert (stmt_info);
271
272 /* Skip stmts which do not need to be vectorized. */
273 if (!STMT_VINFO_RELEVANT_P (stmt_info)
274 && !STMT_VINFO_LIVE_P (stmt_info))
275 {
276 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
277 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
278 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
279 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
280 {
281 stmt = pattern_stmt;
282 stmt_info = vinfo_for_stmt (pattern_stmt);
283 if (dump_enabled_p ())
284 {
285 dump_printf_loc (MSG_NOTE, vect_location,
286 "==> examining pattern statement: ");
287 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
288 }
289 }
290 else
291 {
292 if (dump_enabled_p ())
293 dump_printf_loc (MSG_NOTE, vect_location, "skip.");
294 gsi_next (&si);
295 continue;
296 }
297 }
298 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
299 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
300 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
301 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
302 analyze_pattern_stmt = true;
303
304 /* If a pattern statement has def stmts, analyze them too. */
305 if (is_pattern_stmt_p (stmt_info))
306 {
307 if (pattern_def_seq == NULL)
308 {
309 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
310 pattern_def_si = gsi_start (pattern_def_seq);
311 }
312 else if (!gsi_end_p (pattern_def_si))
313 gsi_next (&pattern_def_si);
314 if (pattern_def_seq != NULL)
315 {
316 gimple pattern_def_stmt = NULL;
317 stmt_vec_info pattern_def_stmt_info = NULL;
318
319 while (!gsi_end_p (pattern_def_si))
320 {
321 pattern_def_stmt = gsi_stmt (pattern_def_si);
322 pattern_def_stmt_info
323 = vinfo_for_stmt (pattern_def_stmt);
324 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
325 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
326 break;
327 gsi_next (&pattern_def_si);
328 }
329
330 if (!gsi_end_p (pattern_def_si))
331 {
332 if (dump_enabled_p ())
333 {
334 dump_printf_loc (MSG_NOTE, vect_location,
335 "==> examining pattern def stmt: ");
336 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
337 pattern_def_stmt, 0);
338 }
339
340 stmt = pattern_def_stmt;
341 stmt_info = pattern_def_stmt_info;
342 }
343 else
344 {
345 pattern_def_si = gsi_none ();
346 analyze_pattern_stmt = false;
347 }
348 }
349 else
350 analyze_pattern_stmt = false;
351 }
352
353 if (gimple_get_lhs (stmt) == NULL_TREE)
354 {
355 if (dump_enabled_p ())
356 {
357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
358 "not vectorized: irregular stmt.");
359 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
360 0);
361 }
362 return false;
363 }
364
365 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
366 {
367 if (dump_enabled_p ())
368 {
369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
370 "not vectorized: vector stmt in loop:");
371 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
372 }
373 return false;
374 }
375
376 if (STMT_VINFO_VECTYPE (stmt_info))
377 {
378 /* The only case in which a vectype had already been set is for stmts
379 that contain a dataref, or for "pattern-stmts" (stmts
380 generated by the vectorizer to represent/replace a certain
381 idiom). */
382 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
383 || is_pattern_stmt_p (stmt_info)
384 || !gsi_end_p (pattern_def_si));
385 vectype = STMT_VINFO_VECTYPE (stmt_info);
386 }
387 else
388 {
389 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
390 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
391 if (dump_enabled_p ())
392 {
393 dump_printf_loc (MSG_NOTE, vect_location,
394 "get vectype for scalar type: ");
395 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
396 }
397 vectype = get_vectype_for_scalar_type (scalar_type);
398 if (!vectype)
399 {
400 if (dump_enabled_p ())
401 {
402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
403 "not vectorized: unsupported "
404 "data-type ");
405 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
406 scalar_type);
407 }
408 return false;
409 }
410
411 STMT_VINFO_VECTYPE (stmt_info) = vectype;
412
413 if (dump_enabled_p ())
414 {
415 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
416 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
417 }
418 }
419
420 /* The vectorization factor is determined by the smallest
421 scalar type (or the largest vector size, but we only
422 support one vector size per loop). */
423 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
424 &dummy);
425 if (dump_enabled_p ())
426 {
427 dump_printf_loc (MSG_NOTE, vect_location,
428 "get vectype for scalar type: ");
429 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
430 }
431 vf_vectype = get_vectype_for_scalar_type (scalar_type);
432 if (!vf_vectype)
433 {
434 if (dump_enabled_p ())
435 {
436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
437 "not vectorized: unsupported data-type ");
438 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
439 scalar_type);
440 }
441 return false;
442 }
443
444 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
445 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
446 {
447 if (dump_enabled_p ())
448 {
449 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
450 "not vectorized: different sized vector "
451 "types in statement, ");
452 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
453 vectype);
454 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
455 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
456 vf_vectype);
457 }
458 return false;
459 }
460
461 if (dump_enabled_p ())
462 {
463 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
464 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
465 }
466
467 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
470 if (!vectorization_factor
471 || (nunits > vectorization_factor))
472 vectorization_factor = nunits;
473
474 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
475 {
476 pattern_def_seq = NULL;
477 gsi_next (&si);
478 }
479 }
480 }
481
482 /* TODO: Analyze cost. Decide if worthwhile to vectorize. */
483 if (dump_enabled_p ())
484 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d",
485 vectorization_factor);
486 if (vectorization_factor <= 1)
487 {
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
490 "not vectorized: unsupported data-type");
491 return false;
492 }
493 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
494
495 return true;
496 }
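
/* Illustrative sketch only (hypothetical helper, never called): the
   arithmetic behind the vectorization factor computed above.  With a
   16-byte vector and 4-byte elements this yields 4.  Assumes the element
   size divides the vector size.  */

static inline unsigned int
example_vectorization_factor (unsigned int vector_size_in_bytes,
			      unsigned int element_size_in_bytes)
{
  return vector_size_in_bytes / element_size_in_bytes;
}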
497
498
499 /* Function vect_is_simple_iv_evolution.
500
501 FORNOW: A simple evolution of an induction variable in the loop is
502 considered a polynomial evolution with constant step. */
503
504 static bool
505 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
506 tree * step)
507 {
508 tree init_expr;
509 tree step_expr;
510 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
511
512 /* When there is no evolution in this loop, the evolution function
513 is not "simple". */
514 if (evolution_part == NULL_TREE)
515 return false;
516
517 /* When the evolution is a polynomial of degree >= 2
518 the evolution function is not "simple". */
519 if (tree_is_chrec (evolution_part))
520 return false;
521
522 step_expr = evolution_part;
523 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
524
525 if (dump_enabled_p ())
526 {
527 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
528 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
529 dump_printf (MSG_NOTE, ", init: ");
530 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
531 }
532
533 *init = init_expr;
534 *step = step_expr;
535
536 if (TREE_CODE (step_expr) != INTEGER_CST)
537 {
538 if (dump_enabled_p ())
539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
540 "step unknown.");
541 return false;
542 }
543
544 return true;
545 }
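
/* Illustrative sketch only (hypothetical function, never called): a loop
   whose induction variable has the kind of "simple" evolution accepted
   above.  The access function of `i' is the chrec {0, +, 1}, so INIT is 0
   and STEP is the integer constant 1.  */

static void
example_simple_iv_loop (int *a, int n)
{
  int i;
  for (i = 0; i < n; i++)
    a[i] = i;
}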
546
547 /* Function vect_analyze_scalar_cycles_1.
548
549 Examine the cross iteration def-use cycles of scalar variables
550 in LOOP. LOOP_VINFO represents the loop that is now being
551 considered for vectorization (can be LOOP, or an outer-loop
552 enclosing LOOP). */
553
554 static void
555 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
556 {
557 basic_block bb = loop->header;
558 tree dumy;
559 vec<gimple> worklist;
560 worklist.create (64);
561 gimple_stmt_iterator gsi;
562 bool double_reduc;
563
564 if (dump_enabled_p ())
565 dump_printf_loc (MSG_NOTE, vect_location,
566 "=== vect_analyze_scalar_cycles ===");
567
568 /* First - identify all inductions. Reduction detection assumes that all the
569 inductions have been identified, therefore, this order must not be
570 changed. */
571 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
572 {
573 gimple phi = gsi_stmt (gsi);
574 tree access_fn = NULL;
575 tree def = PHI_RESULT (phi);
576 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
577
578 if (dump_enabled_p ())
579 {
580 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
581 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
582 }
583
584 /* Skip virtual phis. The data dependences that are associated with
585 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
586 if (virtual_operand_p (def))
587 continue;
588
589 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
590
591 /* Analyze the evolution function. */
592 access_fn = analyze_scalar_evolution (loop, def);
593 if (access_fn)
594 {
595 STRIP_NOPS (access_fn);
596 if (dump_enabled_p ())
597 {
598 dump_printf_loc (MSG_NOTE, vect_location,
599 "Access function of PHI: ");
600 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
601 }
602 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
603 = evolution_part_in_loop_num (access_fn, loop->num);
604 }
605
606 if (!access_fn
607 || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
608 {
609 worklist.safe_push (phi);
610 continue;
611 }
612
613 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
614
615 if (dump_enabled_p ())
616 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.");
617 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
618 }
619
620
621 /* Second - identify all reductions and nested cycles. */
622 while (worklist.length () > 0)
623 {
624 gimple phi = worklist.pop ();
625 tree def = PHI_RESULT (phi);
626 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
627 gimple reduc_stmt;
628 bool nested_cycle;
629
630 if (dump_enabled_p ())
631 {
632 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
633 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
634 }
635
636 gcc_assert (!virtual_operand_p (def)
637 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
638
639 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
640 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
641 &double_reduc);
642 if (reduc_stmt)
643 {
644 if (double_reduc)
645 {
646 if (dump_enabled_p ())
647 dump_printf_loc (MSG_NOTE, vect_location,
648 "Detected double reduction.");
649
650 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
651 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
652 vect_double_reduction_def;
653 }
654 else
655 {
656 if (nested_cycle)
657 {
658 if (dump_enabled_p ())
659 dump_printf_loc (MSG_NOTE, vect_location,
660 "Detected vectorizable nested cycle.");
661
662 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
663 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
664 vect_nested_cycle;
665 }
666 else
667 {
668 if (dump_enabled_p ())
669 dump_printf_loc (MSG_NOTE, vect_location,
670 "Detected reduction.");
671
672 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
673 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
674 vect_reduction_def;
675 /* Store the reduction cycles for possible vectorization in
676 loop-aware SLP. */
677 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
678 }
679 }
680 }
681 else
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
684 "Unknown def-use cycle pattern.");
685 }
686
687 worklist.release ();
688 }
689
690
691 /* Function vect_analyze_scalar_cycles.
692
693 Examine the cross iteration def-use cycles of scalar variables, by
694 analyzing the loop-header PHIs of scalar variables. Classify each
695 cycle as one of the following: invariant, induction, reduction, unknown.
696 We do that for the loop represented by LOOP_VINFO, and also for its
697 inner-loop, if it exists.
698 Examples for scalar cycles:
699
700 Example1: reduction:
701
702 loop1:
703 for (i=0; i<N; i++)
704 sum += a[i];
705
706 Example2: induction:
707
708 loop2:
709 for (i=0; i<N; i++)
710 a[i] = i; */
711
712 static void
713 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
714 {
715 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
716
717 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
718
719 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
720 Reductions in such an inner-loop therefore have different properties than
721 the reductions in the nest that gets vectorized:
722 1. When vectorized, they are executed in the same order as in the original
723 scalar loop, so we can't change the order of computation when
724 vectorizing them.
725 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
726 current checks are too strict. */
727
728 if (loop->inner)
729 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
730 }
731
732 /* Function vect_get_loop_niters.
733
734 Determine how many iterations the loop executes.
735 If an expression that represents the number of iterations
736 can be constructed, place it in NUMBER_OF_ITERATIONS.
737 Return the loop exit condition. */
738
739 static gimple
740 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
741 {
742 tree niters;
743
744 if (dump_enabled_p ())
745 dump_printf_loc (MSG_NOTE, vect_location,
746 "=== get_loop_niters ===");
747 niters = number_of_exit_cond_executions (loop);
748
749 if (niters != NULL_TREE
750 && niters != chrec_dont_know)
751 {
752 *number_of_iterations = niters;
753
754 if (dump_enabled_p ())
755 {
756 dump_printf_loc (MSG_NOTE, vect_location, "==> get_loop_niters:");
757 dump_generic_expr (MSG_NOTE, TDF_SLIM, *number_of_iterations);
758 }
759 }
760
761 return get_loop_exit_condition (loop);
762 }
763
764
765 /* Function bb_in_loop_p
766
767 Used as predicate for dfs order traversal of the loop bbs. */
768
769 static bool
770 bb_in_loop_p (const_basic_block bb, const void *data)
771 {
772 const struct loop *const loop = (const struct loop *)data;
773 if (flow_bb_inside_loop_p (loop, bb))
774 return true;
775 return false;
776 }
777
778
779 /* Function new_loop_vec_info.
780
781 Create and initialize a new loop_vec_info struct for LOOP, as well as
782 stmt_vec_info structs for all the stmts in LOOP. */
783
784 static loop_vec_info
785 new_loop_vec_info (struct loop *loop)
786 {
787 loop_vec_info res;
788 basic_block *bbs;
789 gimple_stmt_iterator si;
790 unsigned int i, nbbs;
791
792 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
793 LOOP_VINFO_LOOP (res) = loop;
794
795 bbs = get_loop_body (loop);
796
797 /* Create/Update stmt_info for all stmts in the loop. */
798 for (i = 0; i < loop->num_nodes; i++)
799 {
800 basic_block bb = bbs[i];
801
802 /* BBs in a nested inner-loop will have been already processed (because
803 we will have called vect_analyze_loop_form for any nested inner-loop).
804 Therefore, for stmts in an inner-loop we just want to update the
805 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
806 loop_info of the outer-loop we are currently considering to vectorize
807 (instead of the loop_info of the inner-loop).
808 For stmts in other BBs we need to create a stmt_info from scratch. */
809 if (bb->loop_father != loop)
810 {
811 /* Inner-loop bb. */
812 gcc_assert (loop->inner && bb->loop_father == loop->inner);
813 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
814 {
815 gimple phi = gsi_stmt (si);
816 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
817 loop_vec_info inner_loop_vinfo =
818 STMT_VINFO_LOOP_VINFO (stmt_info);
819 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
820 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
821 }
822 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
823 {
824 gimple stmt = gsi_stmt (si);
825 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
826 loop_vec_info inner_loop_vinfo =
827 STMT_VINFO_LOOP_VINFO (stmt_info);
828 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
829 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
830 }
831 }
832 else
833 {
834 /* bb in current nest. */
835 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
836 {
837 gimple phi = gsi_stmt (si);
838 gimple_set_uid (phi, 0);
839 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
840 }
841
842 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
843 {
844 gimple stmt = gsi_stmt (si);
845 gimple_set_uid (stmt, 0);
846 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
847 }
848 }
849 }
850
851 /* CHECKME: We want to visit all BBs before their successors (except for
852 latch blocks, for which this assertion wouldn't hold). In the simple
853 case of the loop forms we allow, a dfs order of the BBs would be the same
854 as reversed postorder traversal, so we are safe. */
855
856 free (bbs);
857 bbs = XCNEWVEC (basic_block, loop->num_nodes);
858 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
859 bbs, loop->num_nodes, loop);
860 gcc_assert (nbbs == loop->num_nodes);
861
862 LOOP_VINFO_BBS (res) = bbs;
863 LOOP_VINFO_NITERS (res) = NULL;
864 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
865 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
866 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
867 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
868 LOOP_VINFO_VECT_FACTOR (res) = 0;
869 LOOP_VINFO_LOOP_NEST (res).create (3);
870 LOOP_VINFO_DATAREFS (res).create (10);
871 LOOP_VINFO_DDRS (res).create (10 * 10);
872 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
873 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
874 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
875 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
876 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
877 LOOP_VINFO_GROUPED_STORES (res).create (10);
878 LOOP_VINFO_REDUCTIONS (res).create (10);
879 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
880 LOOP_VINFO_SLP_INSTANCES (res).create (10);
881 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
882 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
883 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
884 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
885
886 return res;
887 }
888
889
890 /* Function destroy_loop_vec_info.
891
892 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
893 stmts in the loop. */
894
895 void
896 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
897 {
898 struct loop *loop;
899 basic_block *bbs;
900 int nbbs;
901 gimple_stmt_iterator si;
902 int j;
903 vec<slp_instance> slp_instances;
904 slp_instance instance;
905 bool swapped;
906
907 if (!loop_vinfo)
908 return;
909
910 loop = LOOP_VINFO_LOOP (loop_vinfo);
911
912 bbs = LOOP_VINFO_BBS (loop_vinfo);
913 nbbs = clean_stmts ? loop->num_nodes : 0;
914 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
915
916 for (j = 0; j < nbbs; j++)
917 {
918 basic_block bb = bbs[j];
919 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
920 free_stmt_vec_info (gsi_stmt (si));
921
922 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
923 {
924 gimple stmt = gsi_stmt (si);
925
926 /* We may have broken canonical form by moving a constant
927 into RHS1 of a commutative op. Fix such occurrences. */
928 if (swapped && is_gimple_assign (stmt))
929 {
930 enum tree_code code = gimple_assign_rhs_code (stmt);
931
932 if ((code == PLUS_EXPR
933 || code == POINTER_PLUS_EXPR
934 || code == MULT_EXPR)
935 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
936 swap_tree_operands (stmt,
937 gimple_assign_rhs1_ptr (stmt),
938 gimple_assign_rhs2_ptr (stmt));
939 }
940
941 /* Free stmt_vec_info. */
942 free_stmt_vec_info (stmt);
943 gsi_next (&si);
944 }
945 }
946
947 free (LOOP_VINFO_BBS (loop_vinfo));
948 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
949 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
950 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
951 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
952 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
953 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
954 FOR_EACH_VEC_ELT (slp_instances, j, instance)
955 vect_free_slp_instance (instance);
956
957 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
958 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
959 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
960 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
961
962 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
963 LOOP_VINFO_PEELING_HTAB (loop_vinfo).dispose ();
964
965 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
966
967 free (loop_vinfo);
968 loop->aux = NULL;
969 }
970
971
972 /* Function vect_analyze_loop_1.
973
974 Apply a set of analyses on LOOP, and create a loop_vec_info struct
975 for it. The different analyses will record information in the
976 loop_vec_info struct. This is a subset of the analyses applied in
977 vect_analyze_loop, to be applied on an inner-loop nested in the loop
978 that is now considered for (outer-loop) vectorization. */
979
980 static loop_vec_info
981 vect_analyze_loop_1 (struct loop *loop)
982 {
983 loop_vec_info loop_vinfo;
984
985 if (dump_enabled_p ())
986 dump_printf_loc (MSG_NOTE, vect_location,
987 "===== analyze_loop_nest_1 =====");
988
989 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
990
991 loop_vinfo = vect_analyze_loop_form (loop);
992 if (!loop_vinfo)
993 {
994 if (dump_enabled_p ())
995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
996 "bad inner-loop form.");
997 return NULL;
998 }
999
1000 return loop_vinfo;
1001 }
1002
1003
1004 /* Function vect_analyze_loop_form.
1005
1006 Verify that certain CFG restrictions hold, including:
1007 - the loop has a pre-header
1008 - the loop has a single entry and exit
1009 - the loop exit condition is simple enough, and the number of iterations
1010 can be analyzed (a countable loop). */
1011
1012 loop_vec_info
1013 vect_analyze_loop_form (struct loop *loop)
1014 {
1015 loop_vec_info loop_vinfo;
1016 gimple loop_cond;
1017 tree number_of_iterations = NULL;
1018 loop_vec_info inner_loop_vinfo = NULL;
1019
1020 if (dump_enabled_p ())
1021 dump_printf_loc (MSG_NOTE, vect_location,
1022 "=== vect_analyze_loop_form ===");
1023
1024 /* Different restrictions apply when we are considering an inner-most loop,
1025 vs. an outer (nested) loop.
1026 (FORNOW. May want to relax some of these restrictions in the future). */
1027
1028 if (!loop->inner)
1029 {
1030 /* Inner-most loop. We currently require that the number of BBs is
1031 exactly 2 (the header and latch). Vectorizable inner-most loops
1032 look like this:
1033
1034 (pre-header)
1035 |
1036 header <--------+
1037 | | |
1038 | +--> latch --+
1039 |
1040 (exit-bb) */
1041
1042 if (loop->num_nodes != 2)
1043 {
1044 if (dump_enabled_p ())
1045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1046 "not vectorized: control flow in loop.");
1047 return NULL;
1048 }
1049
1050 if (empty_block_p (loop->header))
1051 {
1052 if (dump_enabled_p ())
1053 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1054 "not vectorized: empty loop.");
1055 return NULL;
1056 }
1057 }
1058 else
1059 {
1060 struct loop *innerloop = loop->inner;
1061 edge entryedge;
1062
1063 /* Nested loop. We currently require that the loop is doubly-nested,
1064 contains a single inner loop, and the number of BBs is exactly 5.
1065 Vectorizable outer-loops look like this:
1066
1067 (pre-header)
1068 |
1069 header <---+
1070 | |
1071 inner-loop |
1072 | |
1073 tail ------+
1074 |
1075 (exit-bb)
1076
1077 The inner-loop has the properties expected of inner-most loops
1078 as described above. */
1079
1080 if ((loop->inner)->inner || (loop->inner)->next)
1081 {
1082 if (dump_enabled_p ())
1083 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1084 "not vectorized: multiple nested loops.");
1085 return NULL;
1086 }
1087
1088 /* Analyze the inner-loop. */
1089 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1090 if (!inner_loop_vinfo)
1091 {
1092 if (dump_enabled_p ())
1093 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1094 "not vectorized: Bad inner loop.");
1095 return NULL;
1096 }
1097
1098 if (!expr_invariant_in_loop_p (loop,
1099 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1100 {
1101 if (dump_enabled_p ())
1102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1103 "not vectorized: inner-loop count not invariant.");
1104 destroy_loop_vec_info (inner_loop_vinfo, true);
1105 return NULL;
1106 }
1107
1108 if (loop->num_nodes != 5)
1109 {
1110 if (dump_enabled_p ())
1111 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1112 "not vectorized: control flow in loop.");
1113 destroy_loop_vec_info (inner_loop_vinfo, true);
1114 return NULL;
1115 }
1116
1117 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1118 entryedge = EDGE_PRED (innerloop->header, 0);
1119 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1120 entryedge = EDGE_PRED (innerloop->header, 1);
1121
1122 if (entryedge->src != loop->header
1123 || !single_exit (innerloop)
1124 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1125 {
1126 if (dump_enabled_p ())
1127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1128 "not vectorized: unsupported outerloop form.");
1129 destroy_loop_vec_info (inner_loop_vinfo, true);
1130 return NULL;
1131 }
1132
1133 if (dump_enabled_p ())
1134 dump_printf_loc (MSG_NOTE, vect_location,
1135 "Considering outer-loop vectorization.");
1136 }
1137
1138 if (!single_exit (loop)
1139 || EDGE_COUNT (loop->header->preds) != 2)
1140 {
1141 if (dump_enabled_p ())
1142 {
1143 if (!single_exit (loop))
1144 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1145 "not vectorized: multiple exits.");
1146 else if (EDGE_COUNT (loop->header->preds) != 2)
1147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1148 "not vectorized: too many incoming edges.");
1149 }
1150 if (inner_loop_vinfo)
1151 destroy_loop_vec_info (inner_loop_vinfo, true);
1152 return NULL;
1153 }
1154
1155 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1156 that the loop is represented as a do-while (with a proper if-guard
1157 before the loop if needed), where the loop header contains all the
1158 executable statements, and the latch is empty. */
1159 if (!empty_block_p (loop->latch)
1160 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1161 {
1162 if (dump_enabled_p ())
1163 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1164 "not vectorized: latch block not empty.");
1165 if (inner_loop_vinfo)
1166 destroy_loop_vec_info (inner_loop_vinfo, true);
1167 return NULL;
1168 }
1169
1170 /* Make sure there exists a single-predecessor exit bb: */
1171 if (!single_pred_p (single_exit (loop)->dest))
1172 {
1173 edge e = single_exit (loop);
1174 if (!(e->flags & EDGE_ABNORMAL))
1175 {
1176 split_loop_exit_edge (e);
1177 if (dump_enabled_p ())
1178 dump_printf (MSG_NOTE, "split exit edge.");
1179 }
1180 else
1181 {
1182 if (dump_enabled_p ())
1183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1184 "not vectorized: abnormal loop exit edge.");
1185 if (inner_loop_vinfo)
1186 destroy_loop_vec_info (inner_loop_vinfo, true);
1187 return NULL;
1188 }
1189 }
1190
1191 loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1192 if (!loop_cond)
1193 {
1194 if (dump_enabled_p ())
1195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1196 "not vectorized: complicated exit condition.");
1197 if (inner_loop_vinfo)
1198 destroy_loop_vec_info (inner_loop_vinfo, true);
1199 return NULL;
1200 }
1201
1202 if (!number_of_iterations)
1203 {
1204 if (dump_enabled_p ())
1205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1206 "not vectorized: number of iterations cannot be "
1207 "computed.");
1208 if (inner_loop_vinfo)
1209 destroy_loop_vec_info (inner_loop_vinfo, true);
1210 return NULL;
1211 }
1212
1213 if (chrec_contains_undetermined (number_of_iterations))
1214 {
1215 if (dump_enabled_p ())
1216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1217 "Infinite number of iterations.");
1218 if (inner_loop_vinfo)
1219 destroy_loop_vec_info (inner_loop_vinfo, true);
1220 return NULL;
1221 }
1222
1223 if (!NITERS_KNOWN_P (number_of_iterations))
1224 {
1225 if (dump_enabled_p ())
1226 {
1227 dump_printf_loc (MSG_NOTE, vect_location,
1228 "Symbolic number of iterations is ");
1229 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1230 }
1231 }
1232 else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1233 {
1234 if (dump_enabled_p ())
1235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1236 "not vectorized: number of iterations = 0.");
1237 if (inner_loop_vinfo)
1238 destroy_loop_vec_info (inner_loop_vinfo, true);
1239 return NULL;
1240 }
1241
1242 loop_vinfo = new_loop_vec_info (loop);
1243 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1244 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1245
1246 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1247
1248 /* CHECKME: May want to keep it around in the future. */
1249 if (inner_loop_vinfo)
1250 destroy_loop_vec_info (inner_loop_vinfo, false);
1251
1252 gcc_assert (!loop->aux);
1253 loop->aux = loop_vinfo;
1254 return loop_vinfo;
1255 }
1256
1257
1258 /* Function vect_analyze_loop_operations.
1259
1260 Scan the loop stmts and make sure they are all vectorizable. */
1261
1262 static bool
1263 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1264 {
1265 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1266 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1267 int nbbs = loop->num_nodes;
1268 gimple_stmt_iterator si;
1269 unsigned int vectorization_factor = 0;
1270 int i;
1271 gimple phi;
1272 stmt_vec_info stmt_info;
1273 bool need_to_vectorize = false;
1274 int min_profitable_iters;
1275 int min_scalar_loop_bound;
1276 unsigned int th;
1277 bool only_slp_in_loop = true, ok;
1278 HOST_WIDE_INT max_niter;
1279 HOST_WIDE_INT estimated_niter;
1280 int min_profitable_estimate;
1281
1282 if (dump_enabled_p ())
1283 dump_printf_loc (MSG_NOTE, vect_location,
1284 "=== vect_analyze_loop_operations ===");
1285
1286 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1287 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1288 if (slp)
1289 {
1290 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1291 vectorization factor of the loop is the unrolling factor required by
1292 the SLP instances. If that unrolling factor is 1, we say that we
1293 perform pure SLP on the loop; cross-iteration parallelism is not
1294 exploited. */
1295 for (i = 0; i < nbbs; i++)
1296 {
1297 basic_block bb = bbs[i];
1298 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1299 {
1300 gimple stmt = gsi_stmt (si);
1301 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1302 gcc_assert (stmt_info);
1303 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1304 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1305 && !PURE_SLP_STMT (stmt_info))
1306 /* STMT needs both SLP and loop-based vectorization. */
1307 only_slp_in_loop = false;
1308 }
1309 }
1310
1311 if (only_slp_in_loop)
1312 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1313 else
1314 vectorization_factor = least_common_multiple (vectorization_factor,
1315 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1316
1317 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1318 if (dump_enabled_p ())
1319 dump_printf_loc (MSG_NOTE, vect_location,
1320 "Updating vectorization factor to %d ",
1321 vectorization_factor);
1322 }
1323
1324 for (i = 0; i < nbbs; i++)
1325 {
1326 basic_block bb = bbs[i];
1327
1328 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1329 {
1330 phi = gsi_stmt (si);
1331 ok = true;
1332
1333 stmt_info = vinfo_for_stmt (phi);
1334 if (dump_enabled_p ())
1335 {
1336 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1337 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1338 }
1339
1340 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1341 (i.e., a phi in the tail of the outer-loop). */
1342 if (! is_loop_header_bb_p (bb))
1343 {
1344 /* FORNOW: we currently don't support the case that these phis
1345 are not used in the outer-loop (unless it is a double reduction,
1346 i.e., this phi is vect_reduction_def), because this case
1347 requires us to actually do something here. */
1348 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1349 || STMT_VINFO_LIVE_P (stmt_info))
1350 && STMT_VINFO_DEF_TYPE (stmt_info)
1351 != vect_double_reduction_def)
1352 {
1353 if (dump_enabled_p ())
1354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1355 "Unsupported loop-closed phi in "
1356 "outer-loop.");
1357 return false;
1358 }
1359
1360 /* If PHI is used in the outer loop, we check that its operand
1361 is defined in the inner loop. */
1362 if (STMT_VINFO_RELEVANT_P (stmt_info))
1363 {
1364 tree phi_op;
1365 gimple op_def_stmt;
1366
1367 if (gimple_phi_num_args (phi) != 1)
1368 return false;
1369
1370 phi_op = PHI_ARG_DEF (phi, 0);
1371 if (TREE_CODE (phi_op) != SSA_NAME)
1372 return false;
1373
1374 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1375 if (!op_def_stmt
1376 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1377 || !vinfo_for_stmt (op_def_stmt))
1378 return false;
1379
1380 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1381 != vect_used_in_outer
1382 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1383 != vect_used_in_outer_by_reduction)
1384 return false;
1385 }
1386
1387 continue;
1388 }
1389
1390 gcc_assert (stmt_info);
1391
1392 if (STMT_VINFO_LIVE_P (stmt_info))
1393 {
1394 /* FORNOW: not yet supported. */
1395 if (dump_enabled_p ())
1396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1397 "not vectorized: value used after loop.");
1398 return false;
1399 }
1400
1401 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1402 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1403 {
1404 /* A scalar-dependence cycle that we don't support. */
1405 if (dump_enabled_p ())
1406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1407 "not vectorized: scalar dependence cycle.");
1408 return false;
1409 }
1410
1411 if (STMT_VINFO_RELEVANT_P (stmt_info))
1412 {
1413 need_to_vectorize = true;
1414 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1415 ok = vectorizable_induction (phi, NULL, NULL);
1416 }
1417
1418 if (!ok)
1419 {
1420 if (dump_enabled_p ())
1421 {
1422 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1423 "not vectorized: relevant phi not "
1424 "supported: ");
1425 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1426 }
1427 return false;
1428 }
1429 }
1430
1431 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1432 {
1433 gimple stmt = gsi_stmt (si);
1434 if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1435 return false;
1436 }
1437 } /* bbs */
1438
1439 /* All operations in the loop are either irrelevant (they deal with loop
1440 control, or are dead), or only used outside the loop and can be moved
1441 out of the loop (e.g. invariants, inductions). The loop can be
1442 optimized away by scalar optimizations. We're better off not
1443 touching this loop. */
1444 if (!need_to_vectorize)
1445 {
1446 if (dump_enabled_p ())
1447 dump_printf_loc (MSG_NOTE, vect_location,
1448 "All the computation can be taken out of the loop.");
1449 if (dump_enabled_p ())
1450 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1451 "not vectorized: redundant loop. no profit to "
1452 "vectorize.");
1453 return false;
1454 }
1455
1456 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1457 dump_printf_loc (MSG_NOTE, vect_location,
1458 "vectorization_factor = %d, niters = "
1459 HOST_WIDE_INT_PRINT_DEC, vectorization_factor,
1460 LOOP_VINFO_INT_NITERS (loop_vinfo));
1461
1462 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1463 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1464 || ((max_niter = max_stmt_executions_int (loop)) != -1
1465 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1466 {
1467 if (dump_enabled_p ())
1468 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1469 "not vectorized: iteration count too small.");
1470 if (dump_enabled_p ())
1471 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1472 "not vectorized: iteration count smaller than "
1473 "vectorization factor.");
1474 return false;
1475 }
1476
1477 /* Analyze cost. Decide if worthwhile to vectorize. */
1478
1479 /* Once VF is set, SLP costs should be updated since the number of created
1480 vector stmts depends on VF. */
1481 vect_update_slp_costs_according_to_vf (loop_vinfo);
1482
1483 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1484 &min_profitable_estimate);
1485 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1486
1487 if (min_profitable_iters < 0)
1488 {
1489 if (dump_enabled_p ())
1490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1491 "not vectorized: vectorization not profitable.");
1492 if (dump_enabled_p ())
1493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1494 "not vectorized: vector version will never be "
1495 "profitable.");
1496 return false;
1497 }
1498
1499 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1500 * vectorization_factor) - 1);
1501
1502
1503 /* Use the cost model only if it is more conservative than the user-specified
1504 threshold. */
1505
1506 th = (unsigned) min_scalar_loop_bound;
1507 if (min_profitable_iters
1508 && (!min_scalar_loop_bound
1509 || min_profitable_iters > min_scalar_loop_bound))
1510 th = (unsigned) min_profitable_iters;
1511
1512 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1513 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1514 {
1515 if (dump_enabled_p ())
1516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1517 "not vectorized: vectorization not profitable.");
1518 if (dump_enabled_p ())
1519 dump_printf_loc (MSG_NOTE, vect_location,
1520 "not vectorized: iteration count smaller than user "
1521 "specified loop bound parameter or minimum profitable "
1522 "iterations (whichever is more conservative).");
1523 return false;
1524 }
1525
1526 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1527 && ((unsigned HOST_WIDE_INT) estimated_niter
1528 <= MAX (th, (unsigned)min_profitable_estimate)))
1529 {
1530 if (dump_enabled_p ())
1531 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1532 "not vectorized: estimated iteration count too "
1533 "small.");
1534 if (dump_enabled_p ())
1535 dump_printf_loc (MSG_NOTE, vect_location,
1536 "not vectorized: estimated iteration count smaller "
1537 "than specified loop bound parameter or minimum "
1538 "profitable iterations (whichever is more "
1539 "conservative).");
1540 return false;
1541 }
1542
1543 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1544 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1545 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1546 {
1547 if (dump_enabled_p ())
1548 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.");
1549 if (!vect_can_advance_ivs_p (loop_vinfo))
1550 {
1551 if (dump_enabled_p ())
1552 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1553 "not vectorized: can't create epilog loop 1.");
1554 return false;
1555 }
1556 if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1557 {
1558 if (dump_enabled_p ())
1559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1560 "not vectorized: can't create epilog loop 2.");
1561 return false;
1562 }
1563 }
1564
1565 return true;
1566 }
1567
1568
1569 /* Function vect_analyze_loop_2.
1570
1571 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1572 for it. The different analyses will record information in the
1573 loop_vec_info struct. */
1574 static bool
1575 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1576 {
1577 bool ok, slp = false;
1578 int max_vf = MAX_VECTORIZATION_FACTOR;
1579 int min_vf = 2;
1580
1581 /* Find all data references in the loop (which correspond to vdefs/vuses)
1582 and analyze their evolution in the loop. Also adjust the minimal
1583 vectorization factor according to the loads and stores.
1584
1585 FORNOW: Handle only simple array references whose
1586 alignment can be forced, and aligned pointer-references. */
1587
1588 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1589 if (!ok)
1590 {
1591 if (dump_enabled_p ())
1592 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1593 "bad data references.");
1594 return false;
1595 }
1596
1597 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1598 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1599
1600 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1601 if (!ok)
1602 {
1603 if (dump_enabled_p ())
1604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1605 "bad data access.");
1606 return false;
1607 }
1608
1609 /* Classify all cross-iteration scalar data-flow cycles.
1610 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1611
1612 vect_analyze_scalar_cycles (loop_vinfo);
1613
1614 vect_pattern_recog (loop_vinfo, NULL);
1615
1616 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1617
1618 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1619 if (!ok)
1620 {
1621 if (dump_enabled_p ())
1622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1623 "unexpected pattern.");
1624 return false;
1625 }
1626
1627 /* Analyze data dependences between the data-refs in the loop
1628 and adjust the maximum vectorization factor according to
1629 the dependences.
1630 FORNOW: fail at the first data dependence that we encounter. */
1631
1632 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1633 if (!ok
1634 || max_vf < min_vf)
1635 {
1636 if (dump_enabled_p ())
1637 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1638 "bad data dependence.");
1639 return false;
1640 }
1641
1642 ok = vect_determine_vectorization_factor (loop_vinfo);
1643 if (!ok)
1644 {
1645 if (dump_enabled_p ())
1646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1647 "can't determine vectorization factor.");
1648 return false;
1649 }
1650 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1651 {
1652 if (dump_enabled_p ())
1653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1654 "bad data dependence.");
1655 return false;
1656 }
1657
1658 /* Analyze the alignment of the data-refs in the loop.
1659 Fail if a data reference is found that cannot be vectorized. */
1660
1661 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1662 if (!ok)
1663 {
1664 if (dump_enabled_p ())
1665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1666 "bad data alignment.");
1667 return false;
1668 }
1669
1670 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1671 It is important to call pruning after vect_analyze_data_ref_accesses,
1672 since we use grouping information gathered by interleaving analysis. */
1673 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1674 if (!ok)
1675 {
1676 if (dump_enabled_p ())
1677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1678 "too long list of versioning for alias "
1679 "run-time tests.");
1680 return false;
1681 }
1682
1683 /* This pass will decide on using loop versioning and/or loop peeling in
1684 order to enhance the alignment of data references in the loop. */
1685
1686 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1687 if (!ok)
1688 {
1689 if (dump_enabled_p ())
1690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1691 "bad data alignment.");
1692 return false;
1693 }
1694
1695 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1696 ok = vect_analyze_slp (loop_vinfo, NULL);
1697 if (ok)
1698 {
1699 /* Decide which possible SLP instances to SLP. */
1700 slp = vect_make_slp_decision (loop_vinfo);
1701
1702 /* Find stmts that need to be both vectorized and SLPed. */
1703 vect_detect_hybrid_slp (loop_vinfo);
1704 }
1705 else
1706 return false;
1707
1708 /* Scan all the operations in the loop and make sure they are
1709 vectorizable. */
1710
1711 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1712 if (!ok)
1713 {
1714 if (dump_enabled_p ())
1715 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1716 "bad operation or unsupported loop bound.");
1717 return false;
1718 }
1719
1720 return true;
1721 }
1722
1723 /* Function vect_analyze_loop.
1724
1725 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1726 for it. The different analyses will record information in the
1727 loop_vec_info struct. */
1728 loop_vec_info
1729 vect_analyze_loop (struct loop *loop)
1730 {
1731 loop_vec_info loop_vinfo;
1732 unsigned int vector_sizes;
1733
1734 /* Autodetect first vector size we try. */
1735 current_vector_size = 0;
1736 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1737
1738 if (dump_enabled_p ())
1739 dump_printf_loc (MSG_NOTE, vect_location,
1740 "===== analyze_loop_nest =====");
1741
1742 if (loop_outer (loop)
1743 && loop_vec_info_for_loop (loop_outer (loop))
1744 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1745 {
1746 if (dump_enabled_p ())
1747 dump_printf_loc (MSG_NOTE, vect_location,
1748 "outer-loop already vectorized.");
1749 return NULL;
1750 }
1751
1752 while (1)
1753 {
1754 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1755 loop_vinfo = vect_analyze_loop_form (loop);
1756 if (!loop_vinfo)
1757 {
1758 if (dump_enabled_p ())
1759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1760 "bad loop form.");
1761 return NULL;
1762 }
1763
1764 if (vect_analyze_loop_2 (loop_vinfo))
1765 {
1766 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1767
1768 return loop_vinfo;
1769 }
1770
1771 destroy_loop_vec_info (loop_vinfo, true);
1772
1773 vector_sizes &= ~current_vector_size;
1774 if (vector_sizes == 0
1775 || current_vector_size == 0)
1776 return NULL;
1777
1778 /* Try the next biggest vector size. */
1779 current_vector_size = 1 << floor_log2 (vector_sizes);
1780 if (dump_enabled_p ())
1781 dump_printf_loc (MSG_NOTE, vect_location,
1782 "***** Re-trying analysis with "
1783 "vector size %d\n", current_vector_size);
1784 }
1785 }
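
/* Illustrative sketch only (hypothetical helper): how the retry loop above
   walks the bitmask of supported vector sizes.  For a mask of (32 | 16 | 8)
   with 32 already tried, clearing that bit and taking 1 << floor_log2 () of
   the remainder yields 16, then 8 on the next retry.  */

static inline unsigned int
example_next_vector_size (unsigned int sizes_mask, unsigned int tried)
{
  sizes_mask &= ~tried;
  return sizes_mask ? 1U << floor_log2 (sizes_mask) : 0;
}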
1786
1787
1788 /* Function reduction_code_for_scalar_code
1789
1790 Input:
1791 CODE - tree_code of a reduction operation.
1792
1793 Output:
1794 REDUC_CODE - the corresponding tree-code to be used to reduce the
1795 vector of partial results into a single scalar result (which
1796 will also reside in a vector) or ERROR_MARK if the operation is
1797 a supported reduction operation, but does not have such tree-code.
1798
1799 Return FALSE if CODE currently cannot be vectorized as a reduction. */
1800
1801 static bool
1802 reduction_code_for_scalar_code (enum tree_code code,
1803 enum tree_code *reduc_code)
1804 {
1805 switch (code)
1806 {
1807 case MAX_EXPR:
1808 *reduc_code = REDUC_MAX_EXPR;
1809 return true;
1810
1811 case MIN_EXPR:
1812 *reduc_code = REDUC_MIN_EXPR;
1813 return true;
1814
1815 case PLUS_EXPR:
1816 *reduc_code = REDUC_PLUS_EXPR;
1817 return true;
1818
1819 case MULT_EXPR:
1820 case MINUS_EXPR:
1821 case BIT_IOR_EXPR:
1822 case BIT_XOR_EXPR:
1823 case BIT_AND_EXPR:
1824 *reduc_code = ERROR_MARK;
1825 return true;
1826
1827 default:
1828 return false;
1829 }
1830 }
1831
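/* For example (illustrative only): for a max-reduction such as

     for (i = 0; i < N; i++)
       m = m < a[i] ? a[i] : m;

   the scalar operation is MAX_EXPR and reduction_code_for_scalar_code
   returns REDUC_MAX_EXPR, so the epilogue can reduce the vector of partial
   maxima with a single statement.  For MULT_EXPR there is no such
   tree-code, so ERROR_MARK is returned and the epilogue falls back to
   extracting the elements and combining them (see
   vect_model_reduction_cost below).  */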
1832
1833 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1834 STMT is printed with a message MSG. */
1835
1836 static void
1837 report_vect_op (int msg_type, gimple stmt, const char *msg)
1838 {
1839 dump_printf_loc (msg_type, vect_location, "%s", msg);
1840 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1841 }
1842
1843
1844 /* Detect SLP reduction of the form:
1845
1846 #a1 = phi <a5, a0>
1847 a2 = operation (a1)
1848 a3 = operation (a2)
1849 a4 = operation (a3)
1850 a5 = operation (a4)
1851
1852 #a = phi <a5>
1853
1854 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1855 FIRST_STMT is the first reduction stmt in the chain
1856 (a2 = operation (a1)).
1857
1858 Return TRUE if a reduction chain was detected. */
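/* A typical source-level example (illustrative only) that produces such a
   chain is a manually unrolled summation:

     for (i = 0; i < N; i += 4)
       sum = sum + a[i] + a[i+1] + a[i+2] + a[i+3];

   which gimplifies into a chain of four PLUS_EXPR statements (a2..a5 in
   the pattern above) feeding the reduction phi.  */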
1859
1860 static bool
1861 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1862 {
1863 struct loop *loop = (gimple_bb (phi))->loop_father;
1864 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1865 enum tree_code code;
1866 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1867 stmt_vec_info use_stmt_info, current_stmt_info;
1868 tree lhs;
1869 imm_use_iterator imm_iter;
1870 use_operand_p use_p;
1871 int nloop_uses, size = 0, n_out_of_loop_uses;
1872 bool found = false;
1873
1874 if (loop != vect_loop)
1875 return false;
1876
1877 lhs = PHI_RESULT (phi);
1878 code = gimple_assign_rhs_code (first_stmt);
1879 while (1)
1880 {
1881 nloop_uses = 0;
1882 n_out_of_loop_uses = 0;
1883 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1884 {
1885 gimple use_stmt = USE_STMT (use_p);
1886 if (is_gimple_debug (use_stmt))
1887 continue;
1890
1891 /* Check if we got back to the reduction phi. */
1892 if (use_stmt == phi)
1893 {
1894 loop_use_stmt = use_stmt;
1895 found = true;
1896 break;
1897 }
1898
1899 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1900 {
1901 if (vinfo_for_stmt (use_stmt)
1902 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1903 {
1904 loop_use_stmt = use_stmt;
1905 nloop_uses++;
1906 }
1907 }
1908 else
1909 n_out_of_loop_uses++;
1910
1911 /* There can be either a single use in the loop or two uses in
1912 phi nodes. */
1913 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1914 return false;
1915 }
1916
1917 if (found)
1918 break;
1919
1920 /* We reached a statement with no loop uses. */
1921 if (nloop_uses == 0)
1922 return false;
1923
1924 /* This is a loop exit phi, and we haven't reached the reduction phi. */
1925 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1926 return false;
1927
1928 if (!is_gimple_assign (loop_use_stmt)
1929 || code != gimple_assign_rhs_code (loop_use_stmt)
1930 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1931 return false;
1932
1933 /* Insert USE_STMT into reduction chain. */
1934 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1935 if (current_stmt)
1936 {
1937 current_stmt_info = vinfo_for_stmt (current_stmt);
1938 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1939 GROUP_FIRST_ELEMENT (use_stmt_info)
1940 = GROUP_FIRST_ELEMENT (current_stmt_info);
1941 }
1942 else
1943 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1944
1945 lhs = gimple_assign_lhs (loop_use_stmt);
1946 current_stmt = loop_use_stmt;
1947 size++;
1948 }
1949
1950 if (!found || loop_use_stmt != phi || size < 2)
1951 return false;
1952
1953 /* Swap the operands, if needed, to make the reduction operand be the second
1954 operand. */
1955 lhs = PHI_RESULT (phi);
1956 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1957 while (next_stmt)
1958 {
1959 if (gimple_assign_rhs2 (next_stmt) == lhs)
1960 {
1961 tree op = gimple_assign_rhs1 (next_stmt);
1962 gimple def_stmt = NULL;
1963
1964 if (TREE_CODE (op) == SSA_NAME)
1965 def_stmt = SSA_NAME_DEF_STMT (op);
1966
1967 /* Check that the other def is either defined in the loop
1968 ("vect_internal_def"), or it's an induction (defined by a
1969 loop-header phi-node). */
1970 if (def_stmt
1971 && gimple_bb (def_stmt)
1972 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1973 && (is_gimple_assign (def_stmt)
1974 || is_gimple_call (def_stmt)
1975 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1976 == vect_induction_def
1977 || (gimple_code (def_stmt) == GIMPLE_PHI
1978 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1979 == vect_internal_def
1980 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1981 {
1982 lhs = gimple_assign_lhs (next_stmt);
1983 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1984 continue;
1985 }
1986
1987 return false;
1988 }
1989 else
1990 {
1991 tree op = gimple_assign_rhs2 (next_stmt);
1992 gimple def_stmt = NULL;
1993
1994 if (TREE_CODE (op) == SSA_NAME)
1995 def_stmt = SSA_NAME_DEF_STMT (op);
1996
1997 /* Check that the other def is either defined in the loop
1998 ("vect_internal_def"), or it's an induction (defined by a
1999 loop-header phi-node). */
2000 if (def_stmt
2001 && gimple_bb (def_stmt)
2002 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2003 && (is_gimple_assign (def_stmt)
2004 || is_gimple_call (def_stmt)
2005 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2006 == vect_induction_def
2007 || (gimple_code (def_stmt) == GIMPLE_PHI
2008 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2009 == vect_internal_def
2010 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2011 {
2012 if (dump_enabled_p ())
2013 {
2014 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2015 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2016 }
2017
2018 swap_tree_operands (next_stmt,
2019 gimple_assign_rhs1_ptr (next_stmt),
2020 gimple_assign_rhs2_ptr (next_stmt));
2021 update_stmt (next_stmt);
2022
2023 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2024 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2025 }
2026 else
2027 return false;
2028 }
2029
2030 lhs = gimple_assign_lhs (next_stmt);
2031 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2032 }
2033
2034 /* Save the chain for further analysis in SLP detection. */
2035 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2036 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2037 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2038
2039 return true;
2040 }
2041
2042
2043 /* Function vect_is_simple_reduction_1
2044
2045 (1) Detect a cross-iteration def-use cycle that represents a simple
2046 reduction computation. We look for the following pattern:
2047
2048 loop_header:
2049 a1 = phi < a0, a2 >
2050 a3 = ...
2051 a2 = operation (a3, a1)
2052
2053 such that:
2054 1. operation is commutative and associative and it is safe to
2055 change the order of the computation (if CHECK_REDUCTION is true)
2056 2. no uses for a2 in the loop (a2 is used out of the loop)
2057 3. no uses of a1 in the loop besides the reduction operation
2058 4. no uses of a1 outside the loop.
2059
2060 Conditions 1,4 are tested here.
2061 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2062
2063 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2064 nested cycles, if CHECK_REDUCTION is false.
2065
2066 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2067 reductions:
2068
2069 a1 = phi < a0, a2 >
2070 inner loop (def of a3)
2071 a2 = phi < a3 >
2072
2073 If MODIFY is true it also tries to rework the code in-place to enable
2074 detection of more reduction patterns.  For the time being we rewrite
2075 "res -= RHS" into "res += -RHS" when it seems worthwhile.
2076 */
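/* A minimal example (illustrative only) of case (1):

     s = 0;
     for (i = 0; i < N; i++)
       s += a[i];

   becomes in gimple

     loop_header:
       s_1 = PHI <s_0, s_2>
       ...
       s_2 = s_1 + a[i];

   where s_1/s_2 form the cross-iteration cycle detected here; the final
   value s_2 is only used after the loop.  */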
2077
2078 static gimple
2079 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2080 bool check_reduction, bool *double_reduc,
2081 bool modify)
2082 {
2083 struct loop *loop = (gimple_bb (phi))->loop_father;
2084 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2085 edge latch_e = loop_latch_edge (loop);
2086 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2087 gimple def_stmt, def1 = NULL, def2 = NULL;
2088 enum tree_code orig_code, code;
2089 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2090 tree type;
2091 int nloop_uses;
2092 tree name;
2093 imm_use_iterator imm_iter;
2094 use_operand_p use_p;
2095 bool phi_def;
2096
2097 *double_reduc = false;
2098
2099 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2100 otherwise, we assume outer loop vectorization. */
2101 gcc_assert ((check_reduction && loop == vect_loop)
2102 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2103
2104 name = PHI_RESULT (phi);
2105 nloop_uses = 0;
2106 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2107 {
2108 gimple use_stmt = USE_STMT (use_p);
2109 if (is_gimple_debug (use_stmt))
2110 continue;
2111
2112 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2113 {
2114 if (dump_enabled_p ())
2115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2116 "intermediate value used outside loop.");
2117
2118 return NULL;
2119 }
2120
2121 if (vinfo_for_stmt (use_stmt)
2122 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2123 nloop_uses++;
2124 if (nloop_uses > 1)
2125 {
2126 if (dump_enabled_p ())
2127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2128 "reduction used in loop.");
2129 return NULL;
2130 }
2131 }
2132
2133 if (TREE_CODE (loop_arg) != SSA_NAME)
2134 {
2135 if (dump_enabled_p ())
2136 {
2137 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2138 "reduction: not ssa_name: ");
2139 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2140 }
2141 return NULL;
2142 }
2143
2144 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2145 if (!def_stmt)
2146 {
2147 if (dump_enabled_p ())
2148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2149 "reduction: no def_stmt.");
2150 return NULL;
2151 }
2152
2153 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2154 {
2155 if (dump_enabled_p ())
2156 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2157 return NULL;
2158 }
2159
2160 if (is_gimple_assign (def_stmt))
2161 {
2162 name = gimple_assign_lhs (def_stmt);
2163 phi_def = false;
2164 }
2165 else
2166 {
2167 name = PHI_RESULT (def_stmt);
2168 phi_def = true;
2169 }
2170
2171 nloop_uses = 0;
2172 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2173 {
2174 gimple use_stmt = USE_STMT (use_p);
2175 if (is_gimple_debug (use_stmt))
2176 continue;
2177 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2178 && vinfo_for_stmt (use_stmt)
2179 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2180 nloop_uses++;
2181 if (nloop_uses > 1)
2182 {
2183 if (dump_enabled_p ())
2184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2185 "reduction used in loop.");
2186 return NULL;
2187 }
2188 }
2189
2190 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2191 defined in the inner loop. */
2192 if (phi_def)
2193 {
2194 op1 = PHI_ARG_DEF (def_stmt, 0);
2195
2196 if (gimple_phi_num_args (def_stmt) != 1
2197 || TREE_CODE (op1) != SSA_NAME)
2198 {
2199 if (dump_enabled_p ())
2200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2201 "unsupported phi node definition.");
2202
2203 return NULL;
2204 }
2205
2206 def1 = SSA_NAME_DEF_STMT (op1);
2207 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2208 && loop->inner
2209 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2210 && is_gimple_assign (def1))
2211 {
2212 if (dump_enabled_p ())
2213 report_vect_op (MSG_NOTE, def_stmt,
2214 "detected double reduction: ");
2215
2216 *double_reduc = true;
2217 return def_stmt;
2218 }
2219
2220 return NULL;
2221 }
2222
2223 code = orig_code = gimple_assign_rhs_code (def_stmt);
2224
2225 /* We can handle "res -= x[i]", which is non-associative, by
2226 simply rewriting it into "res += -x[i]".  Avoid changing the
2227 gimple instruction for the first simple tests and only do this
2228 if we're allowed to change the code at all. */
2229 if (code == MINUS_EXPR
2230 && modify
2231 && (op1 = gimple_assign_rhs1 (def_stmt))
2232 && TREE_CODE (op1) == SSA_NAME
2233 && SSA_NAME_DEF_STMT (op1) == phi)
2234 code = PLUS_EXPR;
2235
2236 if (check_reduction
2237 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2238 {
2239 if (dump_enabled_p ())
2240 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2241 "reduction: not commutative/associative: ");
2242 return NULL;
2243 }
2244
2245 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2246 {
2247 if (code != COND_EXPR)
2248 {
2249 if (dump_enabled_p ())
2250 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2251 "reduction: not binary operation: ");
2252
2253 return NULL;
2254 }
2255
2256 op3 = gimple_assign_rhs1 (def_stmt);
2257 if (COMPARISON_CLASS_P (op3))
2258 {
2259 op4 = TREE_OPERAND (op3, 1);
2260 op3 = TREE_OPERAND (op3, 0);
2261 }
2262
2263 op1 = gimple_assign_rhs2 (def_stmt);
2264 op2 = gimple_assign_rhs3 (def_stmt);
2265
2266 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2267 {
2268 if (dump_enabled_p ())
2269 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2270 "reduction: uses not ssa_names: ");
2271
2272 return NULL;
2273 }
2274 }
2275 else
2276 {
2277 op1 = gimple_assign_rhs1 (def_stmt);
2278 op2 = gimple_assign_rhs2 (def_stmt);
2279
2280 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2281 {
2282 if (dump_enabled_p ())
2283 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2284 "reduction: uses not ssa_names: ");
2285
2286 return NULL;
2287 }
2288 }
2289
2290 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2291 if ((TREE_CODE (op1) == SSA_NAME
2292 && !types_compatible_p (type, TREE_TYPE (op1)))
2293 || (TREE_CODE (op2) == SSA_NAME
2294 && !types_compatible_p (type, TREE_TYPE (op2)))
2295 || (op3 && TREE_CODE (op3) == SSA_NAME
2296 && !types_compatible_p (type, TREE_TYPE (op3)))
2297 || (op4 && TREE_CODE (op4) == SSA_NAME
2298 && !types_compatible_p (type, TREE_TYPE (op4))))
2299 {
2300 if (dump_enabled_p ())
2301 {
2302 dump_printf_loc (MSG_NOTE, vect_location,
2303 "reduction: multiple types: operation type: ");
2304 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2305 dump_printf (MSG_NOTE, ", operands types: ");
2306 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2307 TREE_TYPE (op1));
2308 dump_printf (MSG_NOTE, ",");
2309 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2310 TREE_TYPE (op2));
2311 if (op3)
2312 {
2313 dump_printf (MSG_NOTE, ",");
2314 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2315 TREE_TYPE (op3));
2316 }
2317
2318 if (op4)
2319 {
2320 dump_printf (MSG_NOTE, ",");
2321 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2322 TREE_TYPE (op4));
2323 }
2324 }
2325
2326 return NULL;
2327 }
2328
2329 /* Check that it's ok to change the order of the computation.
2330 Generally, when vectorizing a reduction we change the order of the
2331 computation. This may change the behavior of the program in some
2332 cases, so we need to check that this is ok. One exception is when
2333 vectorizing an outer-loop: the inner-loop is executed sequentially,
2334 and therefore vectorizing reductions in the inner-loop during
2335 outer-loop vectorization is safe. */
2336
2337 /* CHECKME: check for !flag_finite_math_only too? */
2338 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2339 && check_reduction)
2340 {
2341 /* Changing the order of operations changes the semantics. */
2342 if (dump_enabled_p ())
2343 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2344 "reduction: unsafe fp math optimization: ");
2345 return NULL;
2346 }
2347 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2348 && check_reduction)
2349 {
2350 /* Changing the order of operations changes the semantics. */
2351 if (dump_enabled_p ())
2352 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2353 "reduction: unsafe int math optimization: ");
2354 return NULL;
2355 }
2356 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2357 {
2358 /* Changing the order of operations changes the semantics. */
2359 if (dump_enabled_p ())
2360 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2361 "reduction: unsafe fixed-point math optimization: ");
2362 return NULL;
2363 }
2364
2365 /* If we detected "res -= x[i]" earlier, rewrite it into
2366 "res += -x[i]" now. If this turns out to be useless reassoc
2367 will clean it up again. */
2368 if (orig_code == MINUS_EXPR)
2369 {
2370 tree rhs = gimple_assign_rhs2 (def_stmt);
2371 tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2372 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2373 rhs, NULL);
2374 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2375 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2376 loop_info, NULL));
2377 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2378 gimple_assign_set_rhs2 (def_stmt, negrhs);
2379 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2380 update_stmt (def_stmt);
2381 }
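/* For example (illustrative only, SSA names are hypothetical), the
   statement

     res_2 = res_1 - x_3;

   has just been rewritten into

     negrhs_4 = -x_3;
     res_2 = res_1 + negrhs_4;

   which exposes the associative PLUS_EXPR form to the rest of the
   reduction handling.  */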
2382
2383 /* Reduction is safe. We're dealing with one of the following:
2384 1) integer arithmetic and no trapv
2385 2) floating point arithmetic, and special flags permit this optimization
2386 3) nested cycle (i.e., outer loop vectorization). */
2387 if (TREE_CODE (op1) == SSA_NAME)
2388 def1 = SSA_NAME_DEF_STMT (op1);
2389
2390 if (TREE_CODE (op2) == SSA_NAME)
2391 def2 = SSA_NAME_DEF_STMT (op2);
2392
2393 if (code != COND_EXPR
2394 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2395 {
2396 if (dump_enabled_p ())
2397 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2398 return NULL;
2399 }
2400
2401 /* Check that one def is the reduction def, defined by PHI,
2402 the other def is either defined in the loop ("vect_internal_def"),
2403 or it's an induction (defined by a loop-header phi-node). */
2404
2405 if (def2 && def2 == phi
2406 && (code == COND_EXPR
2407 || !def1 || gimple_nop_p (def1)
2408 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2409 && (is_gimple_assign (def1)
2410 || is_gimple_call (def1)
2411 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2412 == vect_induction_def
2413 || (gimple_code (def1) == GIMPLE_PHI
2414 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2415 == vect_internal_def
2416 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2417 {
2418 if (dump_enabled_p ())
2419 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2420 return def_stmt;
2421 }
2422
2423 if (def1 && def1 == phi
2424 && (code == COND_EXPR
2425 || !def2 || gimple_nop_p (def2)
2426 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2427 && (is_gimple_assign (def2)
2428 || is_gimple_call (def2)
2429 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2430 == vect_induction_def
2431 || (gimple_code (def2) == GIMPLE_PHI
2432 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2433 == vect_internal_def
2434 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2435 {
2436 if (check_reduction)
2437 {
2438 /* Swap operands (just for simplicity - so that the rest of the code
2439 can assume that the reduction variable is always the last (second)
2440 argument). */
2441 if (dump_enabled_p ())
2442 report_vect_op (MSG_NOTE, def_stmt,
2443 "detected reduction: need to swap operands: ");
2444
2445 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2446 gimple_assign_rhs2_ptr (def_stmt));
2447
2448 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2449 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2450 }
2451 else
2452 {
2453 if (dump_enabled_p ())
2454 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2455 }
2456
2457 return def_stmt;
2458 }
2459
2460 /* Try to find SLP reduction chain. */
2461 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2462 {
2463 if (dump_enabled_p ())
2464 report_vect_op (MSG_NOTE, def_stmt,
2465 "reduction: detected reduction chain: ");
2466
2467 return def_stmt;
2468 }
2469
2470 if (dump_enabled_p ())
2471 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2472 "reduction: unknown pattern: ");
2473
2474 return NULL;
2475 }
2476
2477 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2478 in-place. Arguments as there. */
2479
2480 static gimple
2481 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2482 bool check_reduction, bool *double_reduc)
2483 {
2484 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2485 double_reduc, false);
2486 }
2487
2488 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2489 in-place if it enables detection of more reductions. Arguments
2490 as there. */
2491
2492 gimple
2493 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2494 bool check_reduction, bool *double_reduc)
2495 {
2496 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2497 double_reduc, true);
2498 }
2499
2500 /* Calculate the cost of one scalar iteration of the loop. */
2501 int
2502 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2503 {
2504 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2505 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2506 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2507 int innerloop_iters, i, stmt_cost;
2508
2509 /* Count statements in scalar loop. Using this as scalar cost for a single
2510 iteration for now.
2511
2512 TODO: Add outer loop support.
2513
2514 TODO: Consider assigning different costs to different scalar
2515 statements. */
2516
2517 /* FORNOW. */
2518 innerloop_iters = 1;
2519 if (loop->inner)
2520 innerloop_iters = 50; /* FIXME */
2521
2522 for (i = 0; i < nbbs; i++)
2523 {
2524 gimple_stmt_iterator si;
2525 basic_block bb = bbs[i];
2526
2527 if (bb->loop_father == loop->inner)
2528 factor = innerloop_iters;
2529 else
2530 factor = 1;
2531
2532 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2533 {
2534 gimple stmt = gsi_stmt (si);
2535 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2536
2537 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2538 continue;
2539
2540 /* Skip stmts that are not vectorized inside the loop. */
2541 if (stmt_info
2542 && !STMT_VINFO_RELEVANT_P (stmt_info)
2543 && (!STMT_VINFO_LIVE_P (stmt_info)
2544 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2545 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2546 continue;
2547
2548 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2549 {
2550 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2551 stmt_cost = vect_get_stmt_cost (scalar_load);
2552 else
2553 stmt_cost = vect_get_stmt_cost (scalar_store);
2554 }
2555 else
2556 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2557
2558 scalar_single_iter_cost += stmt_cost * factor;
2559 }
2560 }
2561 return scalar_single_iter_cost;
2562 }
2563
2564 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
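/* A worked example (hypothetical numbers): with a known trip count
   NITERS == 100, PEEL_ITERS_PROLOGUE == 3 and VF == 4, the code below
   computes *PEEL_ITERS_EPILOGUE = (100 - 3) % 4 == 1, and records
   3 * SCALAR_SINGLE_ITER_COST in *PROLOGUE_COST_VEC and
   1 * SCALAR_SINGLE_ITER_COST in *EPILOGUE_COST_VEC.  When the trip count
   is unknown, the epilogue count is instead assumed to be VF / 2 and two
   taken-branch costs are added to the prologue.  */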
2565 int
2566 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2567 int *peel_iters_epilogue,
2568 int scalar_single_iter_cost,
2569 stmt_vector_for_cost *prologue_cost_vec,
2570 stmt_vector_for_cost *epilogue_cost_vec)
2571 {
2572 int retval = 0;
2573 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2574
2575 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2576 {
2577 *peel_iters_epilogue = vf/2;
2578 if (dump_enabled_p ())
2579 dump_printf_loc (MSG_NOTE, vect_location,
2580 "cost model: epilogue peel iters set to vf/2 "
2581 "because loop iterations are unknown .");
2582
2583 /* If peeled iterations are known but the number of scalar loop
2584 iterations is unknown, count a taken branch per peeled loop. */
2585 retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2586 NULL, 0, vect_prologue);
2587 }
2588 else
2589 {
2590 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2591 peel_iters_prologue = niters < peel_iters_prologue ?
2592 niters : peel_iters_prologue;
2593 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2594 /* If we need to peel for gaps, but no peeling is required, we have to
2595 peel VF iterations. */
2596 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2597 *peel_iters_epilogue = vf;
2598 }
2599
2600 if (peel_iters_prologue)
2601 retval += record_stmt_cost (prologue_cost_vec,
2602 peel_iters_prologue * scalar_single_iter_cost,
2603 scalar_stmt, NULL, 0, vect_prologue);
2604 if (*peel_iters_epilogue)
2605 retval += record_stmt_cost (epilogue_cost_vec,
2606 *peel_iters_epilogue * scalar_single_iter_cost,
2607 scalar_stmt, NULL, 0, vect_epilogue);
2608 return retval;
2609 }
2610
2611 /* Function vect_estimate_min_profitable_iters
2612
2613 Return the number of iterations required for the vector version of the
2614 loop to be profitable relative to the cost of the scalar version of the
2615 loop. */
2616
2617 static void
2618 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2619 int *ret_min_profitable_niters,
2620 int *ret_min_profitable_estimate)
2621 {
2622 int min_profitable_iters;
2623 int min_profitable_estimate;
2624 int peel_iters_prologue;
2625 int peel_iters_epilogue;
2626 unsigned vec_inside_cost = 0;
2627 int vec_outside_cost = 0;
2628 unsigned vec_prologue_cost = 0;
2629 unsigned vec_epilogue_cost = 0;
2630 int scalar_single_iter_cost = 0;
2631 int scalar_outside_cost = 0;
2632 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2633 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2634 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2635
2636 /* Cost model disabled. */
2637 if (!flag_vect_cost_model)
2638 {
2639 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.");
2640 *ret_min_profitable_niters = 0;
2641 *ret_min_profitable_estimate = 0;
2642 return;
2643 }
2644
2645 /* Requires loop versioning tests to handle misalignment. */
2646 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2647 {
2648 /* FIXME: Make cost depend on complexity of individual check. */
2649 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2650 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2651 vect_prologue);
2652 dump_printf (MSG_NOTE,
2653 "cost model: Adding cost of checks for loop "
2654 "versioning to treat misalignment.\n");
2655 }
2656
2657 /* Requires loop versioning with alias checks. */
2658 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2659 {
2660 /* FIXME: Make cost depend on complexity of individual check. */
2661 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2662 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2663 vect_prologue);
2664 dump_printf (MSG_NOTE,
2665 "cost model: Adding cost of checks for loop "
2666 "versioning aliasing.\n");
2667 }
2668
2669 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2670 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2671 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2672 vect_prologue);
2673
2674 /* Count statements in scalar loop. Using this as scalar cost for a single
2675 iteration for now.
2676
2677 TODO: Add outer loop support.
2678
2679 TODO: Consider assigning different costs to different scalar
2680 statements. */
2681
2682 scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2683
2684 /* Add additional cost for the peeled instructions in prologue and epilogue
2685 loop.
2686
2687 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2688 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2689
2690 TODO: Build an expression that represents peel_iters for prologue and
2691 epilogue to be used in a run-time test. */
2692
2693 if (npeel < 0)
2694 {
2695 peel_iters_prologue = vf/2;
2696 dump_printf (MSG_NOTE, "cost model: "
2697 "prologue peel iters set to vf/2.");
2698
2699 /* If peeling for alignment is unknown, loop bound of main loop becomes
2700 unknown. */
2701 peel_iters_epilogue = vf/2;
2702 dump_printf (MSG_NOTE, "cost model: "
2703 "epilogue peel iters set to vf/2 because "
2704 "peeling for alignment is unknown.");
2705
2706 /* If peeled iterations are unknown, count a taken branch and a not taken
2707 branch per peeled loop. Even if scalar loop iterations are known,
2708 vector iterations are not known since peeled prologue iterations are
2709 not known. Hence guards remain the same. */
2710 (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2711 NULL, 0, vect_prologue);
2712 (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2713 NULL, 0, vect_prologue);
2714 /* FORNOW: Don't attempt to pass individual scalar instructions to
2715 the model; just assume linear cost for scalar iterations. */
2716 (void) add_stmt_cost (target_cost_data,
2717 peel_iters_prologue * scalar_single_iter_cost,
2718 scalar_stmt, NULL, 0, vect_prologue);
2719 (void) add_stmt_cost (target_cost_data,
2720 peel_iters_epilogue * scalar_single_iter_cost,
2721 scalar_stmt, NULL, 0, vect_epilogue);
2722 }
2723 else
2724 {
2725 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2726 stmt_info_for_cost *si;
2727 int j;
2728 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2729
2730 prologue_cost_vec.create (2);
2731 epilogue_cost_vec.create (2);
2732 peel_iters_prologue = npeel;
2733
2734 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2735 &peel_iters_epilogue,
2736 scalar_single_iter_cost,
2737 &prologue_cost_vec,
2738 &epilogue_cost_vec);
2739
2740 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2741 {
2742 struct _stmt_vec_info *stmt_info
2743 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2744 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2745 si->misalign, vect_prologue);
2746 }
2747
2748 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2749 {
2750 struct _stmt_vec_info *stmt_info
2751 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2752 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2753 si->misalign, vect_epilogue);
2754 }
2755
2756 prologue_cost_vec.release ();
2757 epilogue_cost_vec.release ();
2758 }
2759
2760 /* FORNOW: The scalar outside cost is incremented in one of the
2761 following ways:
2762
2763 1. The vectorizer checks for alignment and aliasing and generates
2764 a condition that allows dynamic vectorization. A cost model
2765 check is ANDED with the versioning condition. Hence scalar code
2766 path now has the added cost of the versioning check.
2767
2768 if (cost > th & versioning_check)
2769 jmp to vector code
2770
2771 Hence run-time scalar is incremented by not-taken branch cost.
2772
2773 2. The vectorizer then checks if a prologue is required. If the
2774 cost model check was not done before during versioning, it has to
2775 be done before the prologue check.
2776
2777 if (cost <= th)
2778 prologue = scalar_iters
2779 if (prologue == 0)
2780 jmp to vector code
2781 else
2782 execute prologue
2783 if (prologue == num_iters)
2784 go to exit
2785
2786 Hence the run-time scalar cost is incremented by a taken branch,
2787 plus a not-taken branch, plus a taken branch cost.
2788
2789 3. The vectorizer then checks if an epilogue is required. If the
2790 cost model check was not done before during prologue check, it
2791 has to be done with the epilogue check.
2792
2793 if (prologue == 0)
2794 jmp to vector code
2795 else
2796 execute prologue
2797 if (prologue == num_iters)
2798 go to exit
2799 vector code:
2800 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2801 jmp to epilogue
2802
2803 Hence the run-time scalar cost should be incremented by 2 taken
2804 branches.
2805
2806 TODO: The back end may reorder the BBs differently and reverse
2807 conditions/branch directions. Change the estimates below to
2808 something more reasonable. */
2809
2810 /* If the number of iterations is known and we do not do versioning, we can
2811 decide whether to vectorize at compile time. Hence the scalar version
2812 does not carry cost model guard costs. */
2813 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2814 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2815 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2816 {
2817 /* Cost model check occurs at versioning. */
2818 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2819 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2820 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2821 else
2822 {
2823 /* Cost model check occurs at prologue generation. */
2824 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2825 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2826 + vect_get_stmt_cost (cond_branch_not_taken);
2827 /* Cost model check occurs at epilogue generation. */
2828 else
2829 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2830 }
2831 }
2832
2833 /* Complete the target-specific cost calculations. */
2834 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2835 &vec_inside_cost, &vec_epilogue_cost);
2836
2837 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2838
2839 /* Calculate number of iterations required to make the vector version
2840 profitable, relative to the loop bodies only. The following condition
2841 must hold true:
2842 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2843 where
2844 SIC = scalar iteration cost, VIC = vector iteration cost,
2845 VOC = vector outside cost, VF = vectorization factor,
2846 PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations
2847 SOC = scalar outside cost for run time cost model check. */
2848
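/* A worked example (hypothetical costs): with SIC == 4, VIC == 6, VF == 4,
   VOC == 20, SOC == 0 and no peeling (PL_ITERS == EP_ITERS == 0), the code
   below computes

     min_profitable_iters = (20 - 0) * 4 / (4 * 4 - 6) == 8,

   then notices that at 8 iterations the two sides are still equal
   (4 * 4 * 8 == 6 * 8 + 20 * 4 == 128) and bumps the result to 9; the
   result is later rounded up to at least VF and decremented by one to form
   the run-time guard "niters <= threshold".  */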
2849 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2850 {
2851 if (vec_outside_cost <= 0)
2852 min_profitable_iters = 1;
2853 else
2854 {
2855 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2856 - vec_inside_cost * peel_iters_prologue
2857 - vec_inside_cost * peel_iters_epilogue)
2858 / ((scalar_single_iter_cost * vf)
2859 - vec_inside_cost);
2860
2861 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2862 <= (((int) vec_inside_cost * min_profitable_iters)
2863 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2864 min_profitable_iters++;
2865 }
2866 }
2867 /* vector version will never be profitable. */
2868 else
2869 {
2870 if (dump_enabled_p ())
2871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2872 "cost model: the vector iteration cost = %d "
2873 "divided by the scalar iteration cost = %d "
2874 "is greater or equal to the vectorization factor = %d.",
2875 vec_inside_cost, scalar_single_iter_cost, vf);
2876 *ret_min_profitable_niters = -1;
2877 *ret_min_profitable_estimate = -1;
2878 return;
2879 }
2880
2881 if (dump_enabled_p ())
2882 {
2883 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2884 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
2885 vec_inside_cost);
2886 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
2887 vec_prologue_cost);
2888 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
2889 vec_epilogue_cost);
2890 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
2891 scalar_single_iter_cost);
2892 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
2893 scalar_outside_cost);
2894 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
2895 vec_outside_cost);
2896 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
2897 peel_iters_prologue);
2898 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
2899 peel_iters_epilogue);
2900 dump_printf (MSG_NOTE,
2901 " Calculated minimum iters for profitability: %d\n",
2902 min_profitable_iters);
2903 }
2904
2905 min_profitable_iters =
2906 min_profitable_iters < vf ? vf : min_profitable_iters;
2907
2908 /* Because the condition we create is:
2909 if (niters <= min_profitable_iters)
2910 then skip the vectorized loop. */
2911 min_profitable_iters--;
2912
2913 if (dump_enabled_p ())
2914 dump_printf_loc (MSG_NOTE, vect_location,
2915 " Runtime profitability threshold = %d\n", min_profitable_iters);
2916
2917 *ret_min_profitable_niters = min_profitable_iters;
2918
2919 /* Calculate number of iterations required to make the vector version
2920 profitable, relative to the loop bodies only.
2921
2922 The non-vectorized variant costs SIC * niters and it must win over the vector
2923 variant on the expected loop trip count.  The following condition must hold true:
2924 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
2925
2926 if (vec_outside_cost <= 0)
2927 min_profitable_estimate = 1;
2928 else
2929 {
2930 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
2931 - vec_inside_cost * peel_iters_prologue
2932 - vec_inside_cost * peel_iters_epilogue)
2933 / ((scalar_single_iter_cost * vf)
2934 - vec_inside_cost);
2935 }
2936 min_profitable_estimate--;
2937 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
2938 if (dump_enabled_p ())
2939 dump_printf_loc (MSG_NOTE, vect_location,
2940 " Static estimate profitability threshold = %d\n",
2941 min_profitable_estimate);
2942
2943 *ret_min_profitable_estimate = min_profitable_estimate;
2944 }
2945
2946
2947 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2948 functions. Design better to avoid maintenance issues. */
2949
2950 /* Function vect_model_reduction_cost.
2951
2952 Models cost for a reduction operation, including the vector ops
2953 generated within the strip-mine loop, the initial definition before
2954 the loop, and the epilogue code that must be generated. */
2955
2956 static bool
2957 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2958 int ncopies)
2959 {
2960 int prologue_cost = 0, epilogue_cost = 0;
2961 enum tree_code code;
2962 optab optab;
2963 tree vectype;
2964 gimple stmt, orig_stmt;
2965 tree reduction_op;
2966 enum machine_mode mode;
2967 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2968 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2969 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2970
2971 /* Cost of reduction op inside loop. */
2972 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
2973 stmt_info, 0, vect_body);
2974 stmt = STMT_VINFO_STMT (stmt_info);
2975
2976 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2977 {
2978 case GIMPLE_SINGLE_RHS:
2979 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2980 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2981 break;
2982 case GIMPLE_UNARY_RHS:
2983 reduction_op = gimple_assign_rhs1 (stmt);
2984 break;
2985 case GIMPLE_BINARY_RHS:
2986 reduction_op = gimple_assign_rhs2 (stmt);
2987 break;
2988 case GIMPLE_TERNARY_RHS:
2989 reduction_op = gimple_assign_rhs3 (stmt);
2990 break;
2991 default:
2992 gcc_unreachable ();
2993 }
2994
2995 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2996 if (!vectype)
2997 {
2998 if (dump_enabled_p ())
2999 {
3000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3001 "unsupported data-type ");
3002 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3003 TREE_TYPE (reduction_op));
3004 }
3005 return false;
3006 }
3007
3008 mode = TYPE_MODE (vectype);
3009 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3010
3011 if (!orig_stmt)
3012 orig_stmt = STMT_VINFO_STMT (stmt_info);
3013
3014 code = gimple_assign_rhs_code (orig_stmt);
3015
3016 /* Add in cost for initial definition. */
3017 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3018 stmt_info, 0, vect_prologue);
3019
3020 /* Determine cost of epilogue code.
3021
3022 We have a reduction operator that will reduce the vector in one statement.
3023 Also requires scalar extract. */
3024
3025 if (!nested_in_vect_loop_p (loop, orig_stmt))
3026 {
3027 if (reduc_code != ERROR_MARK)
3028 {
3029 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3030 stmt_info, 0, vect_epilogue);
3031 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3032 stmt_info, 0, vect_epilogue);
3033 }
3034 else
3035 {
3036 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3037 tree bitsize =
3038 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3039 int element_bitsize = tree_low_cst (bitsize, 1);
3040 int nelements = vec_size_in_bits / element_bitsize;
3041
3042 optab = optab_for_tree_code (code, vectype, optab_default);
3043
3044 /* We have a whole vector shift available. */
3045 if (VECTOR_MODE_P (mode)
3046 && optab_handler (optab, mode) != CODE_FOR_nothing
3047 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3048 {
3049 /* Final reduction via vector shifts and the reduction operator.
3050 Also requires scalar extract. */
3051 epilogue_cost += add_stmt_cost (target_cost_data,
3052 exact_log2 (nelements) * 2,
3053 vector_stmt, stmt_info, 0,
3054 vect_epilogue);
3055 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3056 vec_to_scalar, stmt_info, 0,
3057 vect_epilogue);
3058 }
3059 else
3060 /* Use extracts and reduction op for final reduction. For N
3061 elements, we have N extracts and N-1 reduction ops. */
3062 epilogue_cost += add_stmt_cost (target_cost_data,
3063 nelements + nelements - 1,
3064 vector_stmt, stmt_info, 0,
3065 vect_epilogue);
3066 }
3067 }
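/* A worked example (hypothetical target costs): for a V4SI reduction,
   vec_size_in_bits == 128 and element_bitsize == 32, so nelements == 4.
   If a whole-vector shift is available, the epilogue above is costed as
   exact_log2 (4) * 2 == 4 vector statements plus one vec_to_scalar
   extract; otherwise it is costed as 4 + 3 == 7 vector statements
   (N extracts and N-1 reduction operations).  */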
3068
3069 if (dump_enabled_p ())
3070 dump_printf (MSG_NOTE,
3071 "vect_model_reduction_cost: inside_cost = %d, "
3072 "prologue_cost = %d, epilogue_cost = %d .", inside_cost,
3073 prologue_cost, epilogue_cost);
3074
3075 return true;
3076 }
3077
3078
3079 /* Function vect_model_induction_cost.
3080
3081 Models cost for induction operations. */
3082
3083 static void
3084 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3085 {
3086 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3087 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3088 unsigned inside_cost, prologue_cost;
3089
3090 /* loop cost for vec_loop. */
3091 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3092 stmt_info, 0, vect_body);
3093
3094 /* prologue cost for vec_init and vec_step. */
3095 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3096 stmt_info, 0, vect_prologue);
3097
3098 if (dump_enabled_p ())
3099 dump_printf_loc (MSG_NOTE, vect_location,
3100 "vect_model_induction_cost: inside_cost = %d, "
3101 "prologue_cost = %d .", inside_cost, prologue_cost);
3102 }
3103
3104
3105 /* Function get_initial_def_for_induction
3106
3107 Input:
3108 STMT - a stmt that performs an induction operation in the loop.
3109 IV_PHI - the initial value of the induction variable
3110
3111 Output:
3112 Return a vector variable, initialized with the first VF values of
3113 the induction variable. E.g., for an iv with IV_PHI='X' and
3114 evolution S, for a vector of 4 units, we want to return:
3115 [X, X + S, X + 2*S, X + 3*S]. */
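/* For example (illustrative only), with VF == 8, a V4SI vectype
   (nunits == 4, hence ncopies == 2), IV_PHI = 'X' and step S, the code
   below builds

     vec_init = [X, X + S, X + 2*S, X + 3*S]
     vec_step = [8*S, 8*S, 8*S, 8*S]        (VF * S, for the phi update)

   and, because ncopies > 1, a second per-iteration copy

     copy1 = vec_iv + [4*S, 4*S, 4*S, 4*S]  (nunits * S)

   so that the two copies together cover the 8 scalar iterations handled
   by one vector iteration.  */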
3116
3117 static tree
3118 get_initial_def_for_induction (gimple iv_phi)
3119 {
3120 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3121 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3122 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3123 tree scalar_type;
3124 tree vectype;
3125 int nunits;
3126 edge pe = loop_preheader_edge (loop);
3127 struct loop *iv_loop;
3128 basic_block new_bb;
3129 tree new_vec, vec_init, vec_step, t;
3130 tree access_fn;
3131 tree new_var;
3132 tree new_name;
3133 gimple init_stmt, induction_phi, new_stmt;
3134 tree induc_def, vec_def, vec_dest;
3135 tree init_expr, step_expr;
3136 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3137 int i;
3138 bool ok;
3139 int ncopies;
3140 tree expr;
3141 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3142 bool nested_in_vect_loop = false;
3143 gimple_seq stmts = NULL;
3144 imm_use_iterator imm_iter;
3145 use_operand_p use_p;
3146 gimple exit_phi;
3147 edge latch_e;
3148 tree loop_arg;
3149 gimple_stmt_iterator si;
3150 basic_block bb = gimple_bb (iv_phi);
3151 tree stepvectype;
3152 tree resvectype;
3153
3154 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3155 if (nested_in_vect_loop_p (loop, iv_phi))
3156 {
3157 nested_in_vect_loop = true;
3158 iv_loop = loop->inner;
3159 }
3160 else
3161 iv_loop = loop;
3162 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3163
3164 latch_e = loop_latch_edge (iv_loop);
3165 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3166
3167 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
3168 gcc_assert (access_fn);
3169 STRIP_NOPS (access_fn);
3170 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
3171 &init_expr, &step_expr);
3172 gcc_assert (ok);
3173 pe = loop_preheader_edge (iv_loop);
3174
3175 scalar_type = TREE_TYPE (init_expr);
3176 vectype = get_vectype_for_scalar_type (scalar_type);
3177 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3178 gcc_assert (vectype);
3179 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3180 ncopies = vf / nunits;
3181
3182 gcc_assert (phi_info);
3183 gcc_assert (ncopies >= 1);
3184
3185 /* Find the first insertion point in the BB. */
3186 si = gsi_after_labels (bb);
3187
3188 /* Create the vector that holds the initial_value of the induction. */
3189 if (nested_in_vect_loop)
3190 {
3191 /* iv_loop is nested in the loop to be vectorized.  init_expr has already
3192 been created during vectorization of previous stmts. We obtain it
3193 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3194 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3195 loop_preheader_edge (iv_loop));
3196 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
3197 /* If the initial value is not of proper type, convert it. */
3198 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3199 {
3200 new_stmt = gimple_build_assign_with_ops
3201 (VIEW_CONVERT_EXPR,
3202 vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3203 build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3204 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3205 gimple_assign_set_lhs (new_stmt, vec_init);
3206 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3207 new_stmt);
3208 gcc_assert (!new_bb);
3209 set_vinfo_for_stmt (new_stmt,
3210 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3211 }
3212 }
3213 else
3214 {
3215 vec<constructor_elt, va_gc> *v;
3216
3217 /* iv_loop is the loop to be vectorized. Create:
3218 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3219 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
3220 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
3221 if (stmts)
3222 {
3223 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3224 gcc_assert (!new_bb);
3225 }
3226
3227 vec_alloc (v, nunits);
3228 bool constant_p = is_gimple_min_invariant (new_name);
3229 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3230 for (i = 1; i < nunits; i++)
3231 {
3232 /* Create: new_name_i = new_name + step_expr */
3233 enum tree_code code = POINTER_TYPE_P (scalar_type)
3234 ? POINTER_PLUS_EXPR : PLUS_EXPR;
3235 new_name = fold_build2 (code, scalar_type, new_name, step_expr);
3236 if (!is_gimple_min_invariant (new_name))
3237 {
3238 init_stmt = gimple_build_assign (new_var, new_name);
3239 new_name = make_ssa_name (new_var, init_stmt);
3240 gimple_assign_set_lhs (init_stmt, new_name);
3241 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3242 gcc_assert (!new_bb);
3243 if (dump_enabled_p ())
3244 {
3245 dump_printf_loc (MSG_NOTE, vect_location,
3246 "created new init_stmt: ");
3247 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3248 }
3249 constant_p = false;
3250 }
3251 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3252 }
3253 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3254 if (constant_p)
3255 new_vec = build_vector_from_ctor (vectype, v);
3256 else
3257 new_vec = build_constructor (vectype, v);
3258 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3259 }
3260
3261
3262 /* Create the vector that holds the step of the induction. */
3263 if (nested_in_vect_loop)
3264 /* iv_loop is nested in the loop to be vectorized. Generate:
3265 vec_step = [S, S, S, S] */
3266 new_name = step_expr;
3267 else
3268 {
3269 /* iv_loop is the loop to be vectorized. Generate:
3270 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3271 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3272 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3273 expr, step_expr);
3274 }
3275
3276 t = unshare_expr (new_name);
3277 gcc_assert (CONSTANT_CLASS_P (new_name));
3278 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3279 gcc_assert (stepvectype);
3280 new_vec = build_vector_from_val (stepvectype, t);
3281 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3282
3283
3284 /* Create the following def-use cycle:
3285 loop prolog:
3286 vec_init = ...
3287 vec_step = ...
3288 loop:
3289 vec_iv = PHI <vec_init, vec_loop>
3290 ...
3291 STMT
3292 ...
3293 vec_loop = vec_iv + vec_step; */
3294
3295 /* Create the induction-phi that defines the induction-operand. */
3296 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3297 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3298 set_vinfo_for_stmt (induction_phi,
3299 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3300 induc_def = PHI_RESULT (induction_phi);
3301
3302 /* Create the iv update inside the loop */
3303 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3304 induc_def, vec_step);
3305 vec_def = make_ssa_name (vec_dest, new_stmt);
3306 gimple_assign_set_lhs (new_stmt, vec_def);
3307 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3308 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3309 NULL));
3310
3311 /* Set the arguments of the phi node: */
3312 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3313 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3314 UNKNOWN_LOCATION);
3315
3316
3317 /* If the vectorization factor (VF) is bigger than the number
3318 of elements that we can fit in a vectype (nunits), we have to generate
3319 more than one vector stmt - i.e., we need to "unroll" the
3320 vector stmt by a factor VF/nunits. For more details see documentation
3321 in vectorizable_operation. */
3322
3323 if (ncopies > 1)
3324 {
3325 stmt_vec_info prev_stmt_vinfo;
3326 /* FORNOW. This restriction should be relaxed. */
3327 gcc_assert (!nested_in_vect_loop);
3328
3329 /* Create the vector that holds the step of the induction. */
3330 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3331 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3332 expr, step_expr);
3333 t = unshare_expr (new_name);
3334 gcc_assert (CONSTANT_CLASS_P (new_name));
3335 new_vec = build_vector_from_val (stepvectype, t);
3336 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3337
3338 vec_def = induc_def;
3339 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3340 for (i = 1; i < ncopies; i++)
3341 {
3342 /* vec_i = vec_prev + vec_step */
3343 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3344 vec_def, vec_step);
3345 vec_def = make_ssa_name (vec_dest, new_stmt);
3346 gimple_assign_set_lhs (new_stmt, vec_def);
3347
3348 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3349 if (!useless_type_conversion_p (resvectype, vectype))
3350 {
3351 new_stmt = gimple_build_assign_with_ops
3352 (VIEW_CONVERT_EXPR,
3353 vect_get_new_vect_var (resvectype, vect_simple_var,
3354 "vec_iv_"),
3355 build1 (VIEW_CONVERT_EXPR, resvectype,
3356 gimple_assign_lhs (new_stmt)), NULL_TREE);
3357 gimple_assign_set_lhs (new_stmt,
3358 make_ssa_name
3359 (gimple_assign_lhs (new_stmt), new_stmt));
3360 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3361 }
3362 set_vinfo_for_stmt (new_stmt,
3363 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3364 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3365 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3366 }
3367 }
3368
3369 if (nested_in_vect_loop)
3370 {
3371 /* Find the loop-closed exit-phi of the induction, and record
3372 the final vector of induction results: */
3373 exit_phi = NULL;
3374 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3375 {
3376 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3377 {
3378 exit_phi = USE_STMT (use_p);
3379 break;
3380 }
3381 }
3382 if (exit_phi)
3383 {
3384 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3385 /* FORNOW.  We do not yet support the case in which an inner-loop induction
3386 is not used in the outer-loop (i.e. is used only outside the outer-loop). */
3387 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3388 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3389
3390 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3391 if (dump_enabled_p ())
3392 {
3393 dump_printf_loc (MSG_NOTE, vect_location,
3394 "vector of inductions after inner-loop:");
3395 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3396 }
3397 }
3398 }
3399
3400
3401 if (dump_enabled_p ())
3402 {
3403 dump_printf_loc (MSG_NOTE, vect_location,
3404 "transform induction: created def-use cycle: ");
3405 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3406 dump_printf (MSG_NOTE, "\n");
3407 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3408 SSA_NAME_DEF_STMT (vec_def), 0);
3409 }
3410
3411 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3412 if (!useless_type_conversion_p (resvectype, vectype))
3413 {
3414 new_stmt = gimple_build_assign_with_ops
3415 (VIEW_CONVERT_EXPR,
3416 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3417 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3418 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3419 gimple_assign_set_lhs (new_stmt, induc_def);
3420 si = gsi_after_labels (bb);
3421 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3422 set_vinfo_for_stmt (new_stmt,
3423 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3424 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3425 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3426 }
3427
3428 return induc_def;
3429 }
3430
3431
3432 /* Function get_initial_def_for_reduction
3433
3434 Input:
3435 STMT - a stmt that performs a reduction operation in the loop.
3436 INIT_VAL - the initial value of the reduction variable
3437
3438 Output:
3439 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3440 of the reduction (used for adjusting the epilog - see below).
3441 Return a vector variable, initialized according to the operation that STMT
3442 performs. This vector will be used as the initial value of the
3443 vector of partial results.
3444
3445 Option1 (adjust in epilog): Initialize the vector as follows:
3446 add/bit or/xor: [0,0,...,0,0]
3447 mult/bit and: [1,1,...,1,1]
3448 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3449 and when necessary (e.g. add/mult case) let the caller know
3450 that it needs to adjust the result by init_val.
3451
3452 Option2: Initialize the vector as follows:
3453 add/bit or/xor: [init_val,0,0,...,0]
3454 mult/bit and: [init_val,1,1,...,1]
3455 min/max/cond_expr: [init_val,init_val,...,init_val]
3456 and no adjustments are needed.
3457
3458 For example, for the following code:
3459
3460 s = init_val;
3461 for (i=0;i<n;i++)
3462 s = s + a[i];
3463
3464 STMT is 's = s + a[i]', and the reduction variable is 's'.
3465 For a vector of 4 units, we want to return either [0,0,0,init_val],
3466 or [0,0,0,0] and let the caller know that it needs to adjust
3467 the result at the end by 'init_val'.
3468
3469 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3470 is not NULL, because this way the initialization vector is simpler (the
3471 same element in all entries), and Option2 otherwise.
3472
3473 A cost model should help decide between these two schemes. */
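/* For example (illustrative only), for

     s = 10;
     for (i = 0; i < n; i++)
       s *= a[i];

   with a vector of 4 units, Option1 returns [1,1,1,1] and sets
   *ADJUSTMENT_DEF to 10 (the epilogue later adjusts the final result by
   it), whereas Option2 would return a vector with one element equal to 10
   and the remaining elements equal to 1, with no adjustment needed.  */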
3474
3475 tree
3476 get_initial_def_for_reduction (gimple stmt, tree init_val,
3477 tree *adjustment_def)
3478 {
3479 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3480 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3481 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3482 tree scalar_type = TREE_TYPE (init_val);
3483 tree vectype = get_vectype_for_scalar_type (scalar_type);
3484 int nunits;
3485 enum tree_code code = gimple_assign_rhs_code (stmt);
3486 tree def_for_init;
3487 tree init_def;
3488 tree *elts;
3489 int i;
3490 bool nested_in_vect_loop = false;
3491 tree init_value;
3492 REAL_VALUE_TYPE real_init_val = dconst0;
3493 int int_init_val = 0;
3494 gimple def_stmt = NULL;
3495
3496 gcc_assert (vectype);
3497 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3498
3499 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3500 || SCALAR_FLOAT_TYPE_P (scalar_type));
3501
3502 if (nested_in_vect_loop_p (loop, stmt))
3503 nested_in_vect_loop = true;
3504 else
3505 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3506
3507 /* In case of double reduction we only create a vector variable to be put
3508 in the reduction phi node. The actual statement creation is done in
3509 vect_create_epilog_for_reduction. */
3510 if (adjustment_def && nested_in_vect_loop
3511 && TREE_CODE (init_val) == SSA_NAME
3512 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3513 && gimple_code (def_stmt) == GIMPLE_PHI
3514 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3515 && vinfo_for_stmt (def_stmt)
3516 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3517 == vect_double_reduction_def)
3518 {
3519 *adjustment_def = NULL;
3520 return vect_create_destination_var (init_val, vectype);
3521 }
3522
3523 if (TREE_CONSTANT (init_val))
3524 {
3525 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3526 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3527 else
3528 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3529 }
3530 else
3531 init_value = init_val;
3532
3533 switch (code)
3534 {
3535 case WIDEN_SUM_EXPR:
3536 case DOT_PROD_EXPR:
3537 case PLUS_EXPR:
3538 case MINUS_EXPR:
3539 case BIT_IOR_EXPR:
3540 case BIT_XOR_EXPR:
3541 case MULT_EXPR:
3542 case BIT_AND_EXPR:
3543 /* ADJUSTMENT_DEF is NULL when called from
3544 vect_create_epilog_for_reduction to vectorize double reduction. */
3545 if (adjustment_def)
3546 {
3547 if (nested_in_vect_loop)
3548 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3549 NULL);
3550 else
3551 *adjustment_def = init_val;
3552 }
3553
3554 if (code == MULT_EXPR)
3555 {
3556 real_init_val = dconst1;
3557 int_init_val = 1;
3558 }
3559
3560 if (code == BIT_AND_EXPR)
3561 int_init_val = -1;
3562
3563 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3564 def_for_init = build_real (scalar_type, real_init_val);
3565 else
3566 def_for_init = build_int_cst (scalar_type, int_init_val);
3567
3568 /* Create a vector of '0' or '1' except the first element. */
3569 elts = XALLOCAVEC (tree, nunits);
3570 for (i = nunits - 2; i >= 0; --i)
3571 elts[i + 1] = def_for_init;
3572
3573 /* Option1: the first element is '0' or '1' as well. */
3574 if (adjustment_def)
3575 {
3576 elts[0] = def_for_init;
3577 init_def = build_vector (vectype, elts);
3578 break;
3579 }
3580
3581 /* Option2: the first element is INIT_VAL. */
3582 elts[0] = init_val;
3583 if (TREE_CONSTANT (init_val))
3584 init_def = build_vector (vectype, elts);
3585 else
3586 {
3587 vec<constructor_elt, va_gc> *v;
3588 vec_alloc (v, nunits);
3589 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3590 for (i = 1; i < nunits; ++i)
3591 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3592 init_def = build_constructor (vectype, v);
3593 }
3594
3595 break;
3596
3597 case MIN_EXPR:
3598 case MAX_EXPR:
3599 case COND_EXPR:
3600 if (adjustment_def)
3601 {
3602 *adjustment_def = NULL_TREE;
3603 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3604 break;
3605 }
3606
3607 init_def = build_vector_from_val (vectype, init_value);
3608 break;
3609
3610 default:
3611 gcc_unreachable ();
3612 }
3613
3614 return init_def;
3615 }
3616
3617
3618 /* Function vect_create_epilog_for_reduction
3619
3620 Create code at the loop-epilog to finalize the result of a reduction
3621 computation.
3622
3623 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3624 reduction statements.
3625 STMT is the scalar reduction stmt that is being vectorized.
3626 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3627 number of elements that we can fit in a vectype (nunits). In this case
3628 we have to generate more than one vector stmt - i.e - we need to "unroll"
3629 the vector stmt by a factor VF/nunits. For more details see documentation
3630 in vectorizable_operation.
3631 REDUC_CODE is the tree-code for the epilog reduction.
3632 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3633 computation.
3634 REDUC_INDEX is the index of the operand in the right hand side of the
3635 statement that is defined by REDUCTION_PHI.
3636 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3637 SLP_NODE is an SLP node containing a group of reduction statements. The
3638 first one in this group is STMT.
3639
3640 This function:
3641 1. Creates the reduction def-use cycles: sets the arguments for
3642 REDUCTION_PHIS:
3643 The loop-entry argument is the vectorized initial-value of the reduction.
3644 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3645 sums.
3646 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3647 by applying the operation specified by REDUC_CODE if available, or by
3648 other means (whole-vector shifts or a scalar loop).
3649 The function also creates a new phi node at the loop exit to preserve
3650 loop-closed form, as illustrated below.
3651
3652 The flow at the entry to this function:
3653
3654 loop:
3655 vec_def = phi <null, null> # REDUCTION_PHI
3656 VECT_DEF = vector_stmt # vectorized form of STMT
3657 s_loop = scalar_stmt # (scalar) STMT
3658 loop_exit:
3659 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3660 use <s_out0>
3661 use <s_out0>
3662
3663 The above is transformed by this function into:
3664
3665 loop:
3666 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3667 VECT_DEF = vector_stmt # vectorized form of STMT
3668 s_loop = scalar_stmt # (scalar) STMT
3669 loop_exit:
3670 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3671 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3672 v_out2 = reduce <v_out1>
3673 s_out3 = extract_field <v_out2, 0>
3674 s_out4 = adjust_result <s_out3>
3675 use <s_out4>
3676 use <s_out4>
3677 */
3678
3679 static void
3680 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3681 int ncopies, enum tree_code reduc_code,
3682 vec<gimple> reduction_phis,
3683 int reduc_index, bool double_reduc,
3684 slp_tree slp_node)
3685 {
3686 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3687 stmt_vec_info prev_phi_info;
3688 tree vectype;
3689 enum machine_mode mode;
3690 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3691 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3692 basic_block exit_bb;
3693 tree scalar_dest;
3694 tree scalar_type;
3695 gimple new_phi = NULL, phi;
3696 gimple_stmt_iterator exit_gsi;
3697 tree vec_dest;
3698 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3699 gimple epilog_stmt = NULL;
3700 enum tree_code code = gimple_assign_rhs_code (stmt);
3701 gimple exit_phi;
3702 tree bitsize, bitpos;
3703 tree adjustment_def = NULL;
3704 tree vec_initial_def = NULL;
3705 tree reduction_op, expr, def;
3706 tree orig_name, scalar_result;
3707 imm_use_iterator imm_iter, phi_imm_iter;
3708 use_operand_p use_p, phi_use_p;
3709 bool extract_scalar_result = false;
3710 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3711 bool nested_in_vect_loop = false;
3712 vec<gimple> new_phis = vNULL;
3713 vec<gimple> inner_phis = vNULL;
3714 enum vect_def_type dt = vect_unknown_def_type;
3715 int j, i;
3716 vec<tree> scalar_results = vNULL;
3717 unsigned int group_size = 1, k, ratio;
3718 vec<tree> vec_initial_defs = vNULL;
3719 vec<gimple> phis;
3720 bool slp_reduc = false;
3721 tree new_phi_result;
3722 gimple inner_phi = NULL;
3723
3724 if (slp_node)
3725 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3726
3727 if (nested_in_vect_loop_p (loop, stmt))
3728 {
3729 outer_loop = loop;
3730 loop = loop->inner;
3731 nested_in_vect_loop = true;
3732 gcc_assert (!slp_node);
3733 }
3734
3735 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3736 {
3737 case GIMPLE_SINGLE_RHS:
3738 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3739 == ternary_op);
3740 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3741 break;
3742 case GIMPLE_UNARY_RHS:
3743 reduction_op = gimple_assign_rhs1 (stmt);
3744 break;
3745 case GIMPLE_BINARY_RHS:
3746 reduction_op = reduc_index ?
3747 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3748 break;
3749 case GIMPLE_TERNARY_RHS:
3750 reduction_op = gimple_op (stmt, reduc_index + 1);
3751 break;
3752 default:
3753 gcc_unreachable ();
3754 }
3755
3756 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3757 gcc_assert (vectype);
3758 mode = TYPE_MODE (vectype);
3759
3760 /* 1. Create the reduction def-use cycle:
3761 Set the arguments of REDUCTION_PHIS, i.e., transform
3762
3763 loop:
3764 vec_def = phi <null, null> # REDUCTION_PHI
3765 VECT_DEF = vector_stmt # vectorized form of STMT
3766 ...
3767
3768 into:
3769
3770 loop:
3771 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3772 VECT_DEF = vector_stmt # vectorized form of STMT
3773 ...
3774
3775 (in case of SLP, do it for all the phis). */
3776
3777 /* Get the loop-entry arguments. */
3778 if (slp_node)
3779 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3780 NULL, slp_node, reduc_index);
3781 else
3782 {
3783 vec_initial_defs.create (1);
3784 /* For the case of reduction, vect_get_vec_def_for_operand returns
3785 the scalar def before the loop, that defines the initial value
3786 of the reduction variable. */
3787 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3788 &adjustment_def);
3789 vec_initial_defs.quick_push (vec_initial_def);
3790 }
3791
3792 /* Set phi nodes arguments. */
3793 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3794 {
3795 tree vec_init_def = vec_initial_defs[i];
3796 tree def = vect_defs[i];
3797 for (j = 0; j < ncopies; j++)
3798 {
3799 /* Set the loop-entry arg of the reduction-phi. */
3800 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3801 UNKNOWN_LOCATION);
3802
3803 /* Set the loop-latch arg for the reduction-phi. */
3804 if (j > 0)
3805 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3806
3807 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3808
3809 if (dump_enabled_p ())
3810 {
3811 dump_printf_loc (MSG_NOTE, vect_location,
3812 "transform reduction: created def-use cycle: ");
3813 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3814 dump_printf (MSG_NOTE, "\n");
3815 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3816 }
3817
3818 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3819 }
3820 }
3821
3822 vec_initial_defs.release ();
3823
3824 /* 2. Create epilog code.
3825 The reduction epilog code operates across the elements of the vector
3826 of partial results computed by the vectorized loop.
3827 The reduction epilog code consists of:
3828
3829 step 1: compute the scalar result in a vector (v_out2)
3830 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3831 step 3: adjust the scalar result (s_out3) if needed.
3832
3833 Step 1 can be accomplished using one of the following three schemes:
3834 (scheme 1) using reduc_code, if available.
3835 (scheme 2) using whole-vector shifts, if available.
3836 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3837 combined.
3838
3839 The overall epilog code looks like this:
3840
3841 s_out0 = phi <s_loop> # original EXIT_PHI
3842 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3843 v_out2 = reduce <v_out1> # step 1
3844 s_out3 = extract_field <v_out2, 0> # step 2
3845 s_out4 = adjust_result <s_out3> # step 3
3846
3847 (step 3 is optional, and steps 1 and 2 may be combined).
3848 Lastly, the uses of s_out0 are replaced by s_out4. */
3849
3850
3851 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3852 v_out1 = phi <VECT_DEF>
3853 Store them in NEW_PHIS. */
3854
3855 exit_bb = single_exit (loop)->dest;
3856 prev_phi_info = NULL;
3857 new_phis.create (vect_defs.length ());
3858 FOR_EACH_VEC_ELT (vect_defs, i, def)
3859 {
3860 for (j = 0; j < ncopies; j++)
3861 {
3862 tree new_def = copy_ssa_name (def, NULL);
3863 phi = create_phi_node (new_def, exit_bb);
3864 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3865 if (j == 0)
3866 new_phis.quick_push (phi);
3867 else
3868 {
3869 def = vect_get_vec_def_for_stmt_copy (dt, def);
3870 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3871 }
3872
3873 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3874 prev_phi_info = vinfo_for_stmt (phi);
3875 }
3876 }
3877
3878 /* The epilogue is created for the outer-loop, i.e., for the loop being
3879 vectorized. Create exit phis for the outer loop. */
3880 if (double_reduc)
3881 {
3882 loop = outer_loop;
3883 exit_bb = single_exit (loop)->dest;
3884 inner_phis.create (vect_defs.length ());
3885 FOR_EACH_VEC_ELT (new_phis, i, phi)
3886 {
3887 tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3888 gimple outer_phi = create_phi_node (new_result, exit_bb);
3889 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3890 PHI_RESULT (phi));
3891 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3892 loop_vinfo, NULL));
3893 inner_phis.quick_push (phi);
3894 new_phis[i] = outer_phi;
3895 prev_phi_info = vinfo_for_stmt (outer_phi);
3896 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3897 {
3898 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3899 new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3900 outer_phi = create_phi_node (new_result, exit_bb);
3901 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3902 PHI_RESULT (phi));
3903 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3904 loop_vinfo, NULL));
3905 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3906 prev_phi_info = vinfo_for_stmt (outer_phi);
3907 }
3908 }
3909 }
3910
3911 exit_gsi = gsi_after_labels (exit_bb);
3912
3913 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3914 (i.e. when reduc_code is not available) and in the final adjustment
3915 code (if needed). Also get the original scalar reduction variable as
3916 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3917 represents a reduction pattern), the tree-code and scalar-def are
3918 taken from the original stmt that the pattern-stmt (STMT) replaces.
3919 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3920 are taken from STMT. */
3921
3922 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3923 if (!orig_stmt)
3924 {
3925 /* Regular reduction */
3926 orig_stmt = stmt;
3927 }
3928 else
3929 {
3930 /* Reduction pattern */
3931 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3932 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3933 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3934 }
3935
3936 code = gimple_assign_rhs_code (orig_stmt);
3937 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3938 partial results are added and not subtracted. */
3939 if (code == MINUS_EXPR)
3940 code = PLUS_EXPR;
3941
3942 scalar_dest = gimple_assign_lhs (orig_stmt);
3943 scalar_type = TREE_TYPE (scalar_dest);
3944 scalar_results.create (group_size);
3945 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3946 bitsize = TYPE_SIZE (scalar_type);
3947
3948 /* In case this is a reduction in an inner-loop while vectorizing an outer
3949 loop - we don't need to extract a single scalar result at the end of the
3950 inner-loop (unless it is double reduction, i.e., the use of reduction is
3951 outside the outer-loop). The final vector of partial results will be used
3952 in the vectorized outer-loop, or reduced to a scalar result at the end of
3953 the outer-loop. */
3954 if (nested_in_vect_loop && !double_reduc)
3955 goto vect_finalize_reduction;
3956
3957 /* SLP reduction without reduction chain, e.g.,
3958 # a1 = phi <a2, a0>
3959 # b1 = phi <b2, b0>
3960 a2 = operation (a1)
3961 b2 = operation (b1) */
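   /* At the source level this corresponds, e.g., to several independent
      reductions in the same loop that SLP groups together:
        for (i=0; i<n; i++)
          {
            suma += a[i];
            sumb += b[i];
          }  */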
3962 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3963
3964 /* In case of reduction chain, e.g.,
3965 # a1 = phi <a3, a0>
3966 a2 = operation (a1)
3967 a3 = operation (a2),
3968
3969 we may end up with more than one vector result. Here we reduce them to
3970 one vector. */
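   /* At the source level a reduction chain corresponds, e.g., to a single
      reduction that is updated more than once per iteration:
        for (i=0; i<n; i++)
          {
            s += a[2*i];
            s += a[2*i+1];
          }  */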
3971 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3972 {
3973 tree first_vect = PHI_RESULT (new_phis[0]);
3974 tree tmp;
3975 gimple new_vec_stmt = NULL;
3976
3977 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3978 for (k = 1; k < new_phis.length (); k++)
3979 {
3980 gimple next_phi = new_phis[k];
3981 tree second_vect = PHI_RESULT (next_phi);
3982
3983 tmp = build2 (code, vectype, first_vect, second_vect);
3984 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3985 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3986 gimple_assign_set_lhs (new_vec_stmt, first_vect);
3987 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3988 }
3989
3990 new_phi_result = first_vect;
3991 if (new_vec_stmt)
3992 {
3993 new_phis.truncate (0);
3994 new_phis.safe_push (new_vec_stmt);
3995 }
3996 }
3997 else
3998 new_phi_result = PHI_RESULT (new_phis[0]);
3999
4000 /* 2.3 Create the reduction code, using one of the three schemes described
4001 above. In SLP we simply need to extract all the elements from the
4002 vector (without reducing them), so we use the scalar-code scheme (case 3). */
4003 if (reduc_code != ERROR_MARK && !slp_reduc)
4004 {
4005 tree tmp;
4006
4007 /*** Case 1: Create:
4008 v_out2 = reduc_expr <v_out1> */
4009
4010 if (dump_enabled_p ())
4011 dump_printf_loc (MSG_NOTE, vect_location,
4012 "Reduce using direct vector reduction.");
4013
4014 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4015 tmp = build1 (reduc_code, vectype, new_phi_result);
4016 epilog_stmt = gimple_build_assign (vec_dest, tmp);
4017 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4018 gimple_assign_set_lhs (epilog_stmt, new_temp);
4019 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4020
4021 extract_scalar_result = true;
4022 }
4023 else
4024 {
4025 enum tree_code shift_code = ERROR_MARK;
4026 bool have_whole_vector_shift = true;
4027 int bit_offset;
4028 int element_bitsize = tree_low_cst (bitsize, 1);
4029 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4030 tree vec_temp;
4031
4032 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4033 shift_code = VEC_RSHIFT_EXPR;
4034 else
4035 have_whole_vector_shift = false;
4036
4037 /* Regardless of whether we have a whole vector shift, if we're
4038 emulating the operation via tree-vect-generic, we don't want
4039 to use it. Only the first round of the reduction is likely
4040 to still be profitable via emulation. */
4041 /* ??? It might be better to emit a reduction tree code here, so that
4042 tree-vect-generic can expand the first round via bit tricks. */
4043 if (!VECTOR_MODE_P (mode))
4044 have_whole_vector_shift = false;
4045 else
4046 {
4047 optab optab = optab_for_tree_code (code, vectype, optab_default);
4048 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4049 have_whole_vector_shift = false;
4050 }
4051
4052 if (have_whole_vector_shift && !slp_reduc)
4053 {
4054 /*** Case 2: Create:
4055 for (offset = VS/2; offset >= element_size; offset/=2)
4056 {
4057 Create: va' = vec_shift <va, offset>
4058 Create: va = vop <va, va'>
4059 } */
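          /* For example, for V4SI addition (vec_size_in_bits == 128,
             element_bitsize == 32) this emits two shift/add pairs:
               va' = vec_shift <va, 64>;  va = va + va';
               va' = vec_shift <va, 32>;  va = va + va';
             after which a single element of va holds the sum of all four
             elements and is extracted in step 2.4 below.  */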
4060
4061 if (dump_enabled_p ())
4062 dump_printf_loc (MSG_NOTE, vect_location,
4063 "Reduce using vector shifts");
4064
4065 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4066 new_temp = new_phi_result;
4067 for (bit_offset = vec_size_in_bits/2;
4068 bit_offset >= element_bitsize;
4069 bit_offset /= 2)
4070 {
4071 tree bitpos = size_int (bit_offset);
4072
4073 epilog_stmt = gimple_build_assign_with_ops (shift_code,
4074 vec_dest, new_temp, bitpos);
4075 new_name = make_ssa_name (vec_dest, epilog_stmt);
4076 gimple_assign_set_lhs (epilog_stmt, new_name);
4077 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4078
4079 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4080 new_name, new_temp);
4081 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4082 gimple_assign_set_lhs (epilog_stmt, new_temp);
4083 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4084 }
4085
4086 extract_scalar_result = true;
4087 }
4088 else
4089 {
4090 tree rhs;
4091
4092 /*** Case 3: Create:
4093 s = extract_field <v_out2, 0>
4094 for (offset = element_size;
4095 offset < vector_size;
4096 offset += element_size;)
4097 {
4098 Create: s' = extract_field <v_out2, offset>
4099 Create: s = op <s, s'> // For non SLP cases
4100 } */
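          /* For example, for V4SI addition this amounts to:
               s = v_out2[0];
               s = s + v_out2[1];
               s = s + v_out2[2];
               s = s + v_out2[3];
             where each element is extracted with a BIT_FIELD_REF.  In the
             SLP case only the extractions are emitted and the extracted
             values are collected in SCALAR_RESULTS.  */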
4101
4102 if (dump_enabled_p ())
4103 dump_printf_loc (MSG_NOTE, vect_location,
4104 "Reduce using scalar code. ");
4105
4106 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4107 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4108 {
4109 if (gimple_code (new_phi) == GIMPLE_PHI)
4110 vec_temp = PHI_RESULT (new_phi);
4111 else
4112 vec_temp = gimple_assign_lhs (new_phi);
4113 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4114 bitsize_zero_node);
4115 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4116 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4117 gimple_assign_set_lhs (epilog_stmt, new_temp);
4118 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4119
4120 /* In SLP we don't need to apply the reduction operation, so we just
4121 collect s' values in SCALAR_RESULTS. */
4122 if (slp_reduc)
4123 scalar_results.safe_push (new_temp);
4124
4125 for (bit_offset = element_bitsize;
4126 bit_offset < vec_size_in_bits;
4127 bit_offset += element_bitsize)
4128 {
4129 tree bitpos = bitsize_int (bit_offset);
4130 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4131 bitsize, bitpos);
4132
4133 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4134 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4135 gimple_assign_set_lhs (epilog_stmt, new_name);
4136 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4137
4138 if (slp_reduc)
4139 {
4140 /* In SLP we don't need to apply the reduction operation, so
4141 we just collect s' values in SCALAR_RESULTS. */
4142 new_temp = new_name;
4143 scalar_results.safe_push (new_name);
4144 }
4145 else
4146 {
4147 epilog_stmt = gimple_build_assign_with_ops (code,
4148 new_scalar_dest, new_name, new_temp);
4149 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4150 gimple_assign_set_lhs (epilog_stmt, new_temp);
4151 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4152 }
4153 }
4154 }
4155
4156 /* The only case where we need to reduce scalar results in SLP is
4157 unrolling. If the size of SCALAR_RESULTS is greater than
4158 GROUP_SIZE, we reduce them by combining elements modulo
4159 GROUP_SIZE. */
4160 if (slp_reduc)
4161 {
4162 tree res, first_res, new_res;
4163 gimple new_stmt;
4164
4165 /* Reduce multiple scalar results in case of SLP unrolling. */
4166 for (j = group_size; scalar_results.iterate (j, &res);
4167 j++)
4168 {
4169 first_res = scalar_results[j % group_size];
4170 new_stmt = gimple_build_assign_with_ops (code,
4171 new_scalar_dest, first_res, res);
4172 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4173 gimple_assign_set_lhs (new_stmt, new_res);
4174 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4175 scalar_results[j % group_size] = new_res;
4176 }
4177 }
4178 else
4179 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4180 scalar_results.safe_push (new_temp);
4181
4182 extract_scalar_result = false;
4183 }
4184 }
4185
4186 /* 2.4 Extract the final scalar result. Create:
4187 s_out3 = extract_field <v_out2, bitpos> */
4188
4189 if (extract_scalar_result)
4190 {
4191 tree rhs;
4192
4193 if (dump_enabled_p ())
4194 dump_printf_loc (MSG_NOTE, vect_location,
4195 "extract scalar result");
4196
4197 if (BYTES_BIG_ENDIAN)
4198 bitpos = size_binop (MULT_EXPR,
4199 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4200 TYPE_SIZE (scalar_type));
4201 else
4202 bitpos = bitsize_zero_node;
4203
4204 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4205 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4206 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4207 gimple_assign_set_lhs (epilog_stmt, new_temp);
4208 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4209 scalar_results.safe_push (new_temp);
4210 }
4211
4212 vect_finalize_reduction:
4213
4214 if (double_reduc)
4215 loop = loop->inner;
4216
4217 /* 2.5 Adjust the final result by the initial value of the reduction
4218 variable. (When such adjustment is not needed, then
4219 'adjustment_def' is zero). For example, if code is PLUS we create:
4220 new_temp = loop_exit_def + adjustment_def */
4221
4222 if (adjustment_def)
4223 {
4224 gcc_assert (!slp_reduc);
4225 if (nested_in_vect_loop)
4226 {
4227 new_phi = new_phis[0];
4228 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4229 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4230 new_dest = vect_create_destination_var (scalar_dest, vectype);
4231 }
4232 else
4233 {
4234 new_temp = scalar_results[0];
4235 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4236 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4237 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4238 }
4239
4240 epilog_stmt = gimple_build_assign (new_dest, expr);
4241 new_temp = make_ssa_name (new_dest, epilog_stmt);
4242 gimple_assign_set_lhs (epilog_stmt, new_temp);
4243 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
4244 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4245 if (nested_in_vect_loop)
4246 {
4247 set_vinfo_for_stmt (epilog_stmt,
4248 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4249 NULL));
4250 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4251 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4252
4253 if (!double_reduc)
4254 scalar_results.quick_push (new_temp);
4255 else
4256 scalar_results[0] = new_temp;
4257 }
4258 else
4259 scalar_results[0] = new_temp;
4260
4261 new_phis[0] = epilog_stmt;
4262 }
4263
4264 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4265 phis with new adjusted scalar results, i.e., replace use <s_out0>
4266 with use <s_out4>.
4267
4268 Transform:
4269 loop_exit:
4270 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4271 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4272 v_out2 = reduce <v_out1>
4273 s_out3 = extract_field <v_out2, 0>
4274 s_out4 = adjust_result <s_out3>
4275 use <s_out0>
4276 use <s_out0>
4277
4278 into:
4279
4280 loop_exit:
4281 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4282 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4283 v_out2 = reduce <v_out1>
4284 s_out3 = extract_field <v_out2, 0>
4285 s_out4 = adjust_result <s_out3>
4286 use <s_out4>
4287 use <s_out4> */
4288
4289
4290 /* In an SLP reduction chain we reduce vector results into one vector if
4291 necessary, hence here we set GROUP_SIZE to 1. SCALAR_DEST is the LHS of
4292 the last stmt in the reduction chain, since we are looking for the loop
4293 exit phi node. */
4294 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4295 {
4296 scalar_dest = gimple_assign_lhs (
4297 SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4298 group_size = 1;
4299 }
4300
4301 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4302 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4303 need to match SCALAR_RESULTS with corresponding statements. The first
4304 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4305 the first vector stmt, etc.
4306 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
4307 if (group_size > new_phis.length ())
4308 {
4309 ratio = group_size / new_phis.length ();
4310 gcc_assert (!(group_size % new_phis.length ()));
4311 }
4312 else
4313 ratio = 1;
4314
4315 for (k = 0; k < group_size; k++)
4316 {
4317 if (k % ratio == 0)
4318 {
4319 epilog_stmt = new_phis[k / ratio];
4320 reduction_phi = reduction_phis[k / ratio];
4321 if (double_reduc)
4322 inner_phi = inner_phis[k / ratio];
4323 }
4324
4325 if (slp_reduc)
4326 {
4327 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4328
4329 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4330 /* SLP statements can't participate in patterns. */
4331 gcc_assert (!orig_stmt);
4332 scalar_dest = gimple_assign_lhs (current_stmt);
4333 }
4334
4335 phis.create (3);
4336 /* Find the loop-closed-use at the loop exit of the original scalar
4337 result. (The reduction result is expected to have two immediate uses -
4338 one at the latch block, and one at the loop exit). */
4339 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4340 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4341 phis.safe_push (USE_STMT (use_p));
4342
4343 /* We expect to have found an exit_phi because of loop-closed-ssa
4344 form. */
4345 gcc_assert (!phis.is_empty ());
4346
4347 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4348 {
4349 if (outer_loop)
4350 {
4351 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4352 gimple vect_phi;
4353
4354 /* FORNOW. Currently not supporting the case that an inner-loop
4355 reduction is not used in the outer-loop (but only outside the
4356 outer-loop), unless it is a double reduction. */
4357 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4358 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4359 || double_reduc);
4360
4361 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4362 if (!double_reduc
4363 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4364 != vect_double_reduction_def)
4365 continue;
4366
4367 /* Handle double reduction:
4368
4369 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4370 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4371 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4372 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4373
4374 At that point the regular reduction (stmt2 and stmt3) is
4375 already vectorized, as well as the exit phi node, stmt4.
4376 Here we vectorize the phi node of double reduction, stmt1, and
4377 update all relevant statements. */
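              /* Such a cycle typically comes from source code like:

                   s = s0;
                   for (j = 0; j < M; j++)      <-- outer loop
                     for (i = 0; i < N; i++)    <-- inner loop
                       s += a[j][i];
                   ... s used only after the loop nest ...

                 where the result of the inner-loop reduction is carried
                 around the outer loop and used only outside it.  */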
4378
4379 /* Go through all the uses of s2 to find double reduction phi
4380 node, i.e., stmt1 above. */
4381 orig_name = PHI_RESULT (exit_phi);
4382 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4383 {
4384 stmt_vec_info use_stmt_vinfo;
4385 stmt_vec_info new_phi_vinfo;
4386 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4387 basic_block bb = gimple_bb (use_stmt);
4388 gimple use;
4389
4390 /* Check that USE_STMT is really a double reduction phi
4391 node. */
4392 if (gimple_code (use_stmt) != GIMPLE_PHI
4393 || gimple_phi_num_args (use_stmt) != 2
4394 || bb->loop_father != outer_loop)
4395 continue;
4396 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4397 if (!use_stmt_vinfo
4398 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4399 != vect_double_reduction_def)
4400 continue;
4401
4402 /* Create vector phi node for double reduction:
4403 vs1 = phi <vs0, vs2>
4404 vs1 was created previously in this function by a call to
4405 vect_get_vec_def_for_operand and is stored in
4406 vec_initial_def;
4407 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4408 vs0 is created here. */
4409
4410 /* Create vector phi node. */
4411 vect_phi = create_phi_node (vec_initial_def, bb);
4412 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4413 loop_vec_info_for_loop (outer_loop), NULL);
4414 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4415
4416 /* Create vs0 - initial def of the double reduction phi. */
4417 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4418 loop_preheader_edge (outer_loop));
4419 init_def = get_initial_def_for_reduction (stmt,
4420 preheader_arg, NULL);
4421 vect_phi_init = vect_init_vector (use_stmt, init_def,
4422 vectype, NULL);
4423
4424 /* Update phi node arguments with vs0 and vs2. */
4425 add_phi_arg (vect_phi, vect_phi_init,
4426 loop_preheader_edge (outer_loop),
4427 UNKNOWN_LOCATION);
4428 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4429 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4430 if (dump_enabled_p ())
4431 {
4432 dump_printf_loc (MSG_NOTE, vect_location,
4433 "created double reduction phi node: ");
4434 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4435 }
4436
4437 vect_phi_res = PHI_RESULT (vect_phi);
4438
4439 /* Replace the use, i.e., set the correct vs1 in the regular
4440 reduction phi node. FORNOW, NCOPIES is always 1, so the
4441 loop is redundant. */
4442 use = reduction_phi;
4443 for (j = 0; j < ncopies; j++)
4444 {
4445 edge pr_edge = loop_preheader_edge (loop);
4446 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4447 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4448 }
4449 }
4450 }
4451 }
4452
4453 phis.release ();
4454 if (nested_in_vect_loop)
4455 {
4456 if (double_reduc)
4457 loop = outer_loop;
4458 else
4459 continue;
4460 }
4461
4462 phis.create (3);
4463 /* Find the loop-closed-use at the loop exit of the original scalar
4464 result. (The reduction result is expected to have two immediate uses,
4465 one at the latch block, and one at the loop exit). For double
4466 reductions we are looking for exit phis of the outer loop. */
4467 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4468 {
4469 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4470 phis.safe_push (USE_STMT (use_p));
4471 else
4472 {
4473 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4474 {
4475 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4476
4477 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4478 {
4479 if (!flow_bb_inside_loop_p (loop,
4480 gimple_bb (USE_STMT (phi_use_p))))
4481 phis.safe_push (USE_STMT (phi_use_p));
4482 }
4483 }
4484 }
4485 }
4486
4487 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4488 {
4489 /* Replace the uses: */
4490 orig_name = PHI_RESULT (exit_phi);
4491 scalar_result = scalar_results[k];
4492 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4493 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4494 SET_USE (use_p, scalar_result);
4495 }
4496
4497 phis.release ();
4498 }
4499
4500 scalar_results.release ();
4501 inner_phis.release ();
4502 new_phis.release ();
4503 }
4504
4505
4506 /* Function vectorizable_reduction.
4507
4508 Check if STMT performs a reduction operation that can be vectorized.
4509 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4510 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4511 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4512
4513 This function also handles reduction idioms (patterns) that have been
4514 recognized in advance during vect_pattern_recog. In this case, STMT may be
4515 of this form:
4516 X = pattern_expr (arg0, arg1, ..., X)
4517 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4518 sequence that had been detected and replaced by the pattern-stmt (STMT).
4519
4520 In some cases of reduction patterns, the type of the reduction variable X is
4521 different than the type of the other arguments of STMT.
4522 In such cases, the vectype that is used when transforming STMT into a vector
4523 stmt is different than the vectype that is used to determine the
4524 vectorization factor, because it consists of a different number of elements
4525 than the actual number of elements that are being operated upon in parallel.
4526
4527 For example, consider an accumulation of shorts into an int accumulator.
4528 On some targets it's possible to vectorize this pattern operating on 8
4529 shorts at a time (hence, the vectype for purposes of determining the
4530 vectorization factor should be V8HI); on the other hand, the vectype that
4531 is used to create the vector form is actually V4SI (the type of the result).
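
   At the source level this case corresponds to, e.g.:

     short a[N]; int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];

   which vect_pattern_recog may replace by a WIDEN_SUM_EXPR pattern stmt.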
4532
4533 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4534 indicates the actual level of parallelism (V8HI in the example), so
4535 that the right vectorization factor is derived. This vectype
4536 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4537 be used to create the vectorized stmt. The right vectype for the vectorized
4538 stmt is obtained from the type of the result X:
4539 get_vectype_for_scalar_type (TREE_TYPE (X))
4540
4541 This means that, contrary to "regular" reductions (or "regular" stmts in
4542 general), the following equation:
4543 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4544 does *NOT* necessarily hold for reduction patterns. */
4545
4546 bool
4547 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4548 gimple *vec_stmt, slp_tree slp_node)
4549 {
4550 tree vec_dest;
4551 tree scalar_dest;
4552 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4553 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4554 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4555 tree vectype_in = NULL_TREE;
4556 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4557 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4558 enum tree_code code, orig_code, epilog_reduc_code;
4559 enum machine_mode vec_mode;
4560 int op_type;
4561 optab optab, reduc_optab;
4562 tree new_temp = NULL_TREE;
4563 tree def;
4564 gimple def_stmt;
4565 enum vect_def_type dt;
4566 gimple new_phi = NULL;
4567 tree scalar_type;
4568 bool is_simple_use;
4569 gimple orig_stmt;
4570 stmt_vec_info orig_stmt_info;
4571 tree expr = NULL_TREE;
4572 int i;
4573 int ncopies;
4574 int epilog_copies;
4575 stmt_vec_info prev_stmt_info, prev_phi_info;
4576 bool single_defuse_cycle = false;
4577 tree reduc_def = NULL_TREE;
4578 gimple new_stmt = NULL;
4579 int j;
4580 tree ops[3];
4581 bool nested_cycle = false, found_nested_cycle_def = false;
4582 gimple reduc_def_stmt = NULL;
4583 /* The default is that the reduction variable is the last operand in the statement. */
4584 int reduc_index = 2;
4585 bool double_reduc = false, dummy;
4586 basic_block def_bb;
4587 struct loop * def_stmt_loop, *outer_loop = NULL;
4588 tree def_arg;
4589 gimple def_arg_stmt;
4590 vec<tree> vec_oprnds0 = vNULL;
4591 vec<tree> vec_oprnds1 = vNULL;
4592 vec<tree> vect_defs = vNULL;
4593 vec<gimple> phis = vNULL;
4594 int vec_num;
4595 tree def0, def1, tem, op0, op1 = NULL_TREE;
4596
4597 /* In case of reduction chain we switch to the first stmt in the chain, but
4598 we don't update STMT_INFO, since only the last stmt is marked as reduction
4599 and has reduction properties. */
4600 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4601 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4602
4603 if (nested_in_vect_loop_p (loop, stmt))
4604 {
4605 outer_loop = loop;
4606 loop = loop->inner;
4607 nested_cycle = true;
4608 }
4609
4610 /* 1. Is vectorizable reduction? */
4611 /* Not supportable if the reduction variable is used in the loop, unless
4612 it's a reduction chain. */
4613 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4614 && !GROUP_FIRST_ELEMENT (stmt_info))
4615 return false;
4616
4617 /* Reductions that are not used even in an enclosing outer-loop
4618 are expected to be "live" (used out of the loop). */
4619 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4620 && !STMT_VINFO_LIVE_P (stmt_info))
4621 return false;
4622
4623 /* Make sure it was already recognized as a reduction computation. */
4624 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4625 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4626 return false;
4627
4628 /* 2. Has this been recognized as a reduction pattern?
4629
4630 Check if STMT represents a pattern that has been recognized
4631 in earlier analysis stages. For stmts that represent a pattern,
4632 the STMT_VINFO_RELATED_STMT field records the last stmt in
4633 the original sequence that constitutes the pattern. */
4634
4635 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4636 if (orig_stmt)
4637 {
4638 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4639 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4640 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4641 }
4642
4643 /* 3. Check the operands of the operation. The first operands are defined
4644 inside the loop body. The last operand is the reduction variable,
4645 which is defined by the loop-header-phi. */
4646
4647 gcc_assert (is_gimple_assign (stmt));
4648
4649 /* Flatten RHS. */
4650 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4651 {
4652 case GIMPLE_SINGLE_RHS:
4653 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4654 if (op_type == ternary_op)
4655 {
4656 tree rhs = gimple_assign_rhs1 (stmt);
4657 ops[0] = TREE_OPERAND (rhs, 0);
4658 ops[1] = TREE_OPERAND (rhs, 1);
4659 ops[2] = TREE_OPERAND (rhs, 2);
4660 code = TREE_CODE (rhs);
4661 }
4662 else
4663 return false;
4664 break;
4665
4666 case GIMPLE_BINARY_RHS:
4667 code = gimple_assign_rhs_code (stmt);
4668 op_type = TREE_CODE_LENGTH (code);
4669 gcc_assert (op_type == binary_op);
4670 ops[0] = gimple_assign_rhs1 (stmt);
4671 ops[1] = gimple_assign_rhs2 (stmt);
4672 break;
4673
4674 case GIMPLE_TERNARY_RHS:
4675 code = gimple_assign_rhs_code (stmt);
4676 op_type = TREE_CODE_LENGTH (code);
4677 gcc_assert (op_type == ternary_op);
4678 ops[0] = gimple_assign_rhs1 (stmt);
4679 ops[1] = gimple_assign_rhs2 (stmt);
4680 ops[2] = gimple_assign_rhs3 (stmt);
4681 break;
4682
4683 case GIMPLE_UNARY_RHS:
4684 return false;
4685
4686 default:
4687 gcc_unreachable ();
4688 }
4689
4690 if (code == COND_EXPR && slp_node)
4691 return false;
4692
4693 scalar_dest = gimple_assign_lhs (stmt);
4694 scalar_type = TREE_TYPE (scalar_dest);
4695 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4696 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4697 return false;
4698
4699 /* Do not try to vectorize bit-precision reductions. */
4700 if ((TYPE_PRECISION (scalar_type)
4701 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4702 return false;
4703
4704 /* All uses but the last are expected to be defined in the loop.
4705 The last use is the reduction variable. In case of nested cycle this
4706 assumption is not true: we use reduc_index to record the index of the
4707 reduction variable. */
4708 for (i = 0; i < op_type - 1; i++)
4709 {
4710 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4711 if (i == 0 && code == COND_EXPR)
4712 continue;
4713
4714 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4715 &def_stmt, &def, &dt, &tem);
4716 if (!vectype_in)
4717 vectype_in = tem;
4718 gcc_assert (is_simple_use);
4719
4720 if (dt != vect_internal_def
4721 && dt != vect_external_def
4722 && dt != vect_constant_def
4723 && dt != vect_induction_def
4724 && !(dt == vect_nested_cycle && nested_cycle))
4725 return false;
4726
4727 if (dt == vect_nested_cycle)
4728 {
4729 found_nested_cycle_def = true;
4730 reduc_def_stmt = def_stmt;
4731 reduc_index = i;
4732 }
4733 }
4734
4735 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4736 &def_stmt, &def, &dt, &tem);
4737 if (!vectype_in)
4738 vectype_in = tem;
4739 gcc_assert (is_simple_use);
4740 if (!(dt == vect_reduction_def
4741 || dt == vect_nested_cycle
4742 || ((dt == vect_internal_def || dt == vect_external_def
4743 || dt == vect_constant_def || dt == vect_induction_def)
4744 && nested_cycle && found_nested_cycle_def)))
4745 {
4746 /* For pattern recognized stmts, orig_stmt might be a reduction,
4747 but some helper statements for the pattern might not, or
4748 might be COND_EXPRs with reduction uses in the condition. */
4749 gcc_assert (orig_stmt);
4750 return false;
4751 }
4752 if (!found_nested_cycle_def)
4753 reduc_def_stmt = def_stmt;
4754
4755 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4756 if (orig_stmt)
4757 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4758 reduc_def_stmt,
4759 !nested_cycle,
4760 &dummy));
4761 else
4762 {
4763 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4764 !nested_cycle, &dummy);
4765 /* We changed STMT to be the first stmt in reduction chain, hence we
4766 check that in this case the first element in the chain is STMT. */
4767 gcc_assert (stmt == tmp
4768 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4769 }
4770
4771 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4772 return false;
4773
4774 if (slp_node || PURE_SLP_STMT (stmt_info))
4775 ncopies = 1;
4776 else
4777 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4778 / TYPE_VECTOR_SUBPARTS (vectype_in));
4779
4780 gcc_assert (ncopies >= 1);
4781
4782 vec_mode = TYPE_MODE (vectype_in);
4783
4784 if (code == COND_EXPR)
4785 {
4786 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4787 {
4788 if (dump_enabled_p ())
4789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4790 "unsupported condition in reduction");
4791
4792 return false;
4793 }
4794 }
4795 else
4796 {
4797 /* 4. Supportable by target? */
4798
4799 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4800 || code == LROTATE_EXPR || code == RROTATE_EXPR)
4801 {
4802 /* Shifts and rotates are only supported by vectorizable_shifts,
4803 not vectorizable_reduction. */
4804 if (dump_enabled_p ())
4805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4806 "unsupported shift or rotation.");
4807 return false;
4808 }
4809
4810 /* 4.1. check support for the operation in the loop */
4811 optab = optab_for_tree_code (code, vectype_in, optab_default);
4812 if (!optab)
4813 {
4814 if (dump_enabled_p ())
4815 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4816 "no optab.");
4817
4818 return false;
4819 }
4820
4821 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4822 {
4823 if (dump_enabled_p ())
4824 dump_printf (MSG_NOTE, "op not supported by target.");
4825
4826 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4827 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4828 < vect_min_worthwhile_factor (code))
4829 return false;
4830
4831 if (dump_enabled_p ())
4832 dump_printf (MSG_NOTE, "proceeding using word mode.");
4833 }
4834
4835 /* Worthwhile without SIMD support? */
4836 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4837 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4838 < vect_min_worthwhile_factor (code))
4839 {
4840 if (dump_enabled_p ())
4841 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4842 "not worthwhile without SIMD support.");
4843
4844 return false;
4845 }
4846 }
4847
4848 /* 4.2. Check support for the epilog operation.
4849
4850 If STMT represents a reduction pattern, then the type of the
4851 reduction variable may be different than the type of the rest
4852 of the arguments. For example, consider the case of accumulation
4853 of shorts into an int accumulator; The original code:
4854 S1: int_a = (int) short_a;
4855 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4856
4857 was replaced with:
4858 STMT: int_acc = widen_sum <short_a, int_acc>
4859
4860 This means that:
4861 1. The tree-code that is used to create the vector operation in the
4862 epilog code (that reduces the partial results) is not the
4863 tree-code of STMT, but is rather the tree-code of the original
4864 stmt from the pattern that STMT is replacing. I.e, in the example
4865 above we want to use 'widen_sum' in the loop, but 'plus' in the
4866 epilog.
4867 2. The type (mode) we use to check available target support
4868 for the vector operation to be created in the *epilog*, is
4869 determined by the type of the reduction variable (in the example
4870 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4871 However the type (mode) we use to check available target support
4872 for the vector operation to be created *inside the loop*, is
4873 determined by the type of the other arguments to STMT (in the
4874 example we'd check this: optab_handler (widen_sum_optab,
4875 vect_short_mode)).
4876
4877 This is contrary to "regular" reductions, in which the types of all
4878 the arguments are the same as the type of the reduction variable.
4879 For "regular" reductions we can therefore use the same vector type
4880 (and also the same tree-code) when generating the epilog code and
4881 when generating the code inside the loop. */
4882
4883 if (orig_stmt)
4884 {
4885 /* This is a reduction pattern: get the vectype from the type of the
4886 reduction variable, and get the tree-code from orig_stmt. */
4887 orig_code = gimple_assign_rhs_code (orig_stmt);
4888 gcc_assert (vectype_out);
4889 vec_mode = TYPE_MODE (vectype_out);
4890 }
4891 else
4892 {
4893 /* Regular reduction: the same vectype and tree-code that are used for
4894 the vector code inside the loop can be used for the epilog code. */
4895 orig_code = code;
4896 }
4897
4898 if (nested_cycle)
4899 {
4900 def_bb = gimple_bb (reduc_def_stmt);
4901 def_stmt_loop = def_bb->loop_father;
4902 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4903 loop_preheader_edge (def_stmt_loop));
4904 if (TREE_CODE (def_arg) == SSA_NAME
4905 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4906 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4907 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4908 && vinfo_for_stmt (def_arg_stmt)
4909 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4910 == vect_double_reduction_def)
4911 double_reduc = true;
4912 }
4913
4914 epilog_reduc_code = ERROR_MARK;
4915 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4916 {
4917 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4918 optab_default);
4919 if (!reduc_optab)
4920 {
4921 if (dump_enabled_p ())
4922 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4923 "no optab for reduction.");
4924
4925 epilog_reduc_code = ERROR_MARK;
4926 }
4927
4928 if (reduc_optab
4929 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4930 {
4931 if (dump_enabled_p ())
4932 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4933 "reduc op not supported by target.");
4934
4935 epilog_reduc_code = ERROR_MARK;
4936 }
4937 }
4938 else
4939 {
4940 if (!nested_cycle || double_reduc)
4941 {
4942 if (dump_enabled_p ())
4943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4944 "no reduc code for scalar code.");
4945
4946 return false;
4947 }
4948 }
4949
4950 if (double_reduc && ncopies > 1)
4951 {
4952 if (dump_enabled_p ())
4953 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4954 "multiple types in double reduction");
4955
4956 return false;
4957 }
4958
4959 /* In case of widening multiplication by a constant, we update the type
4960 of the constant to be the type of the other operand. We check that the
4961 constant fits the type in the pattern recognition pass. */
4962 if (code == DOT_PROD_EXPR
4963 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4964 {
4965 if (TREE_CODE (ops[0]) == INTEGER_CST)
4966 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
4967 else if (TREE_CODE (ops[1]) == INTEGER_CST)
4968 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
4969 else
4970 {
4971 if (dump_enabled_p ())
4972 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4973 "invalid types in dot-prod");
4974
4975 return false;
4976 }
4977 }
4978
4979 if (!vec_stmt) /* transformation not required. */
4980 {
4981 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4982 return false;
4983 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4984 return true;
4985 }
4986
4987 /** Transform. **/
4988
4989 if (dump_enabled_p ())
4990 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.");
4991
4992 /* FORNOW: Multiple types are not supported for condition. */
4993 if (code == COND_EXPR)
4994 gcc_assert (ncopies == 1);
4995
4996 /* Create the destination vector */
4997 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4998
4999 /* In case the vectorization factor (VF) is bigger than the number
5000 of elements that we can fit in a vectype (nunits), we have to generate
5001 more than one vector stmt - i.e - we need to "unroll" the
5002 vector stmt by a factor VF/nunits. For more details see documentation
5003 in vectorizable_operation. */
5004
5005 /* If the reduction is used in an outer loop we need to generate
5006 VF intermediate results, like so (e.g. for ncopies=2):
5007 r0 = phi (init, r0)
5008 r1 = phi (init, r1)
5009 r0 = x0 + r0;
5010 r1 = x1 + r1;
5011 (i.e. we generate VF results in 2 registers).
5012 In this case we have a separate def-use cycle for each copy, and therefore
5013 for each copy we get the vector def for the reduction variable from the
5014 respective phi node created for this copy.
5015
5016 Otherwise (the reduction is unused in the loop nest), we can combine
5017 together intermediate results, like so (e.g. for ncopies=2):
5018 r = phi (init, r)
5019 r = x0 + r;
5020 r = x1 + r;
5021 (i.e. we generate VF/2 results in a single register).
5022 In this case for each copy we get the vector def for the reduction variable
5023 from the vectorized reduction operation generated in the previous iteration.
5024 */
5025
5026 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5027 {
5028 single_defuse_cycle = true;
5029 epilog_copies = 1;
5030 }
5031 else
5032 epilog_copies = ncopies;
5033
5034 prev_stmt_info = NULL;
5035 prev_phi_info = NULL;
5036 if (slp_node)
5037 {
5038 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5039 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5040 == TYPE_VECTOR_SUBPARTS (vectype_in));
5041 }
5042 else
5043 {
5044 vec_num = 1;
5045 vec_oprnds0.create (1);
5046 if (op_type == ternary_op)
5047 vec_oprnds1.create (1);
5048 }
5049
5050 phis.create (vec_num);
5051 vect_defs.create (vec_num);
5052 if (!slp_node)
5053 vect_defs.quick_push (NULL_TREE);
5054
5055 for (j = 0; j < ncopies; j++)
5056 {
5057 if (j == 0 || !single_defuse_cycle)
5058 {
5059 for (i = 0; i < vec_num; i++)
5060 {
5061 /* Create the reduction-phi that defines the reduction
5062 operand. */
5063 new_phi = create_phi_node (vec_dest, loop->header);
5064 set_vinfo_for_stmt (new_phi,
5065 new_stmt_vec_info (new_phi, loop_vinfo,
5066 NULL));
5067 if (j == 0 || slp_node)
5068 phis.quick_push (new_phi);
5069 }
5070 }
5071
5072 if (code == COND_EXPR)
5073 {
5074 gcc_assert (!slp_node);
5075 vectorizable_condition (stmt, gsi, vec_stmt,
5076 PHI_RESULT (phis[0]),
5077 reduc_index, NULL);
5078 /* Multiple types are not supported for condition. */
5079 break;
5080 }
5081
5082 /* Handle uses. */
5083 if (j == 0)
5084 {
5085 op0 = ops[!reduc_index];
5086 if (op_type == ternary_op)
5087 {
5088 if (reduc_index == 0)
5089 op1 = ops[2];
5090 else
5091 op1 = ops[1];
5092 }
5093
5094 if (slp_node)
5095 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5096 slp_node, -1);
5097 else
5098 {
5099 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5100 stmt, NULL);
5101 vec_oprnds0.quick_push (loop_vec_def0);
5102 if (op_type == ternary_op)
5103 {
5104 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5105 NULL);
5106 vec_oprnds1.quick_push (loop_vec_def1);
5107 }
5108 }
5109 }
5110 else
5111 {
5112 if (!slp_node)
5113 {
5114 enum vect_def_type dt;
5115 gimple dummy_stmt;
5116 tree dummy;
5117
5118 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5119 &dummy_stmt, &dummy, &dt);
5120 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5121 loop_vec_def0);
5122 vec_oprnds0[0] = loop_vec_def0;
5123 if (op_type == ternary_op)
5124 {
5125 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5126 &dummy, &dt);
5127 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5128 loop_vec_def1);
5129 vec_oprnds1[0] = loop_vec_def1;
5130 }
5131 }
5132
5133 if (single_defuse_cycle)
5134 reduc_def = gimple_assign_lhs (new_stmt);
5135
5136 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5137 }
5138
5139 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5140 {
5141 if (slp_node)
5142 reduc_def = PHI_RESULT (phis[i]);
5143 else
5144 {
5145 if (!single_defuse_cycle || j == 0)
5146 reduc_def = PHI_RESULT (new_phi);
5147 }
5148
5149 def1 = ((op_type == ternary_op)
5150 ? vec_oprnds1[i] : NULL);
5151 if (op_type == binary_op)
5152 {
5153 if (reduc_index == 0)
5154 expr = build2 (code, vectype_out, reduc_def, def0);
5155 else
5156 expr = build2 (code, vectype_out, def0, reduc_def);
5157 }
5158 else
5159 {
5160 if (reduc_index == 0)
5161 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5162 else
5163 {
5164 if (reduc_index == 1)
5165 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5166 else
5167 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5168 }
5169 }
5170
5171 new_stmt = gimple_build_assign (vec_dest, expr);
5172 new_temp = make_ssa_name (vec_dest, new_stmt);
5173 gimple_assign_set_lhs (new_stmt, new_temp);
5174 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5175
5176 if (slp_node)
5177 {
5178 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5179 vect_defs.quick_push (new_temp);
5180 }
5181 else
5182 vect_defs[0] = new_temp;
5183 }
5184
5185 if (slp_node)
5186 continue;
5187
5188 if (j == 0)
5189 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5190 else
5191 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5192
5193 prev_stmt_info = vinfo_for_stmt (new_stmt);
5194 prev_phi_info = vinfo_for_stmt (new_phi);
5195 }
5196
5197 /* Finalize the reduction-phi (set its arguments) and create the
5198 epilog reduction code. */
5199 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5200 {
5201 new_temp = gimple_assign_lhs (*vec_stmt);
5202 vect_defs[0] = new_temp;
5203 }
5204
5205 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5206 epilog_reduc_code, phis, reduc_index,
5207 double_reduc, slp_node);
5208
5209 phis.release ();
5210 vect_defs.release ();
5211 vec_oprnds0.release ();
5212 vec_oprnds1.release ();
5213
5214 return true;
5215 }
5216
5217 /* Function vect_min_worthwhile_factor.
5218
5219 For a loop where we could vectorize the operation indicated by CODE,
5220 return the minimum vectorization factor that makes it worthwhile
5221 to use generic vectors. */
5222 int
5223 vect_min_worthwhile_factor (enum tree_code code)
5224 {
5225 switch (code)
5226 {
5227 case PLUS_EXPR:
5228 case MINUS_EXPR:
5229 case NEGATE_EXPR:
5230 return 4;
5231
5232 case BIT_AND_EXPR:
5233 case BIT_IOR_EXPR:
5234 case BIT_XOR_EXPR:
5235 case BIT_NOT_EXPR:
5236 return 2;
5237
5238 default:
5239 return INT_MAX;
5240 }
5241 }
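/* For example, when the target provides no vector instruction for the
   operation and vectorization would have to fall back to generic
   word_mode vectors, a PLUS_EXPR is only considered worthwhile with a
   vectorization factor of at least 4, while a BIT_AND_EXPR already pays
   off at a factor of 2 (an illustrative reading of the table above).  */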
5242
5243
5244 /* Function vectorizable_induction
5245
5246 Check if PHI performs an induction computation that can be vectorized.
5247 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5248 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5249 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
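/* For example, in

     for (i = 0; i < n; i++)
       a[i] = i;

   the loop-header PHI carrying i is such an induction: its value advances
   by a constant step each iteration, so it can be replaced by a vector of
   consecutive values updated by a vector increment (a sketch only; the
   actual code is produced via get_initial_def_for_induction below).  */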
5250
5251 bool
5252 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5253 gimple *vec_stmt)
5254 {
5255 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5256 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5257 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5258 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5259 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5260 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5261 tree vec_def;
5262
5263 gcc_assert (ncopies >= 1);
5264 /* FORNOW. These restrictions should be relaxed. */
5265 if (nested_in_vect_loop_p (loop, phi))
5266 {
5267 imm_use_iterator imm_iter;
5268 use_operand_p use_p;
5269 gimple exit_phi;
5270 edge latch_e;
5271 tree loop_arg;
5272
5273 if (ncopies > 1)
5274 {
5275 if (dump_enabled_p ())
5276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5277 "multiple types in nested loop.");
5278 return false;
5279 }
5280
5281 exit_phi = NULL;
5282 latch_e = loop_latch_edge (loop->inner);
5283 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5284 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5285 {
5286 if (!flow_bb_inside_loop_p (loop->inner,
5287 gimple_bb (USE_STMT (use_p))))
5288 {
5289 exit_phi = USE_STMT (use_p);
5290 break;
5291 }
5292 }
5293 if (exit_phi)
5294 {
5295 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5296 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5297 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5298 {
5299 if (dump_enabled_p ())
5300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5301 "inner-loop induction only used outside "
5302 "of the outer vectorized loop.");
5303 return false;
5304 }
5305 }
5306 }
5307
5308 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5309 return false;
5310
5311 /* FORNOW: SLP not supported. */
5312 if (STMT_SLP_TYPE (stmt_info))
5313 return false;
5314
5315 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5316
5317 if (gimple_code (phi) != GIMPLE_PHI)
5318 return false;
5319
5320 if (!vec_stmt) /* transformation not required. */
5321 {
5322 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5323 if (dump_enabled_p ())
5324 dump_printf_loc (MSG_NOTE, vect_location,
5325 "=== vectorizable_induction ===");
5326 vect_model_induction_cost (stmt_info, ncopies);
5327 return true;
5328 }
5329
5330 /** Transform. **/
5331
5332 if (dump_enabled_p ())
5333 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.");
5334
5335 vec_def = get_initial_def_for_induction (phi);
5336 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5337 return true;
5338 }
5339
5340 /* Function vectorizable_live_operation.
5341
5342 STMT computes a value that is used outside the loop. Check if
5343 it can be supported. */
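/* For example, an assignment such as t_5 = x_3 + 1 inside the loop whose
   result is only used after the loop is "live".  Since, FORNOW, only the
   case where every operand is loop invariant or constant is accepted, the
   scalar statement can simply stay in place and the last value it computes
   is what the out-of-loop uses see (the names here are illustrative).  */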
5344
5345 bool
5346 vectorizable_live_operation (gimple stmt,
5347 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5348 gimple *vec_stmt ATTRIBUTE_UNUSED)
5349 {
5350 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5351 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5352 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5353 int i;
5354 int op_type;
5355 tree op;
5356 tree def;
5357 gimple def_stmt;
5358 enum vect_def_type dt;
5359 enum tree_code code;
5360 enum gimple_rhs_class rhs_class;
5361
5362 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5363
5364 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5365 return false;
5366
5367 if (!is_gimple_assign (stmt))
5368 return false;
5369
5370 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5371 return false;
5372
5373 /* FORNOW. CHECKME. */
5374 if (nested_in_vect_loop_p (loop, stmt))
5375 return false;
5376
5377 code = gimple_assign_rhs_code (stmt);
5378 op_type = TREE_CODE_LENGTH (code);
5379 rhs_class = get_gimple_rhs_class (code);
5380 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5381 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5382
5383 /* FORNOW: support only if all uses are invariant. This means
5384 that the scalar operations can remain in place, unvectorized.
5385 The original last scalar value that they compute will be used. */
5386
5387 for (i = 0; i < op_type; i++)
5388 {
5389 if (rhs_class == GIMPLE_SINGLE_RHS)
5390 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5391 else
5392 op = gimple_op (stmt, i + 1);
5393 if (op
5394 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5395 &dt))
5396 {
5397 if (dump_enabled_p ())
5398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5399 "use not simple.");
5400 return false;
5401 }
5402
5403 if (dt != vect_external_def && dt != vect_constant_def)
5404 return false;
5405 }
5406
5407 /* No transformation is required for the cases we currently support. */
5408 return true;
5409 }
5410
5411 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
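/* After vectorization the scalar SSA names defined by such statements may
   never receive their final values, so a debug bind outside the loop that
   still refers to them is reset here; the debugger then reports the
   variable as optimized out instead of showing a stale value.  */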
5412
5413 static void
5414 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5415 {
5416 ssa_op_iter op_iter;
5417 imm_use_iterator imm_iter;
5418 def_operand_p def_p;
5419 gimple ustmt;
5420
5421 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5422 {
5423 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5424 {
5425 basic_block bb;
5426
5427 if (!is_gimple_debug (ustmt))
5428 continue;
5429
5430 bb = gimple_bb (ustmt);
5431
5432 if (!flow_bb_inside_loop_p (loop, bb))
5433 {
5434 if (gimple_debug_bind_p (ustmt))
5435 {
5436 if (dump_enabled_p ())
5437 dump_printf_loc (MSG_NOTE, vect_location,
5438 "killing debug use");
5439
5440 gimple_debug_bind_reset_value (ustmt);
5441 update_stmt (ustmt);
5442 }
5443 else
5444 gcc_unreachable ();
5445 }
5446 }
5447 }
5448 }
5449
5450 /* Function vect_transform_loop.
5451
5452 The analysis phase has determined that the loop is vectorizable.
5453 Vectorize the loop - create vectorized stmts to replace the scalar
5454 stmts in the loop, and update the loop exit condition. */
5455
5456 void
5457 vect_transform_loop (loop_vec_info loop_vinfo)
5458 {
5459 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5460 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5461 int nbbs = loop->num_nodes;
5462 gimple_stmt_iterator si;
5463 int i;
5464 tree ratio = NULL;
5465 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5466 bool grouped_store;
5467 bool slp_scheduled = false;
5468 unsigned int nunits;
5469 gimple stmt, pattern_stmt;
5470 gimple_seq pattern_def_seq = NULL;
5471 gimple_stmt_iterator pattern_def_si = gsi_none ();
5472 bool transform_pattern_stmt = false;
5473 bool check_profitability = false;
5474 int th;
5475 /* Record number of iterations before we started tampering with the profile. */
5476 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5477
5478 if (dump_enabled_p ())
5479 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_transform_loop ===");
5480
5481 /* If the profile is imprecise, we have a chance to fix it up. */
5482 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5483 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5484
5485 /* Use the more conservative vectorization threshold. If the number
5486 of iterations is constant assume the cost check has been performed
5487 by our caller. If the threshold makes all loops profitable that
5488 run at least the vectorization factor number of times, checking
5489 is pointless, too. */
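/* For example, with --param min-vect-loop-bound=1 and a vectorization
   factor of 4, th starts out as 1 * 4 - 1 = 3 and is then raised to the
   cost model's minimum iteration count if that is larger; the runtime
   profitability check below is only kept when that threshold is at least
   VF - 1 and the iteration count is not known at compile time (the
   numbers here are purely illustrative).  */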
5490 th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5491 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5492 th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5493 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5494 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5495 {
5496 if (dump_enabled_p ())
5497 dump_printf_loc (MSG_NOTE, vect_location,
5498 "Profitability threshold is %d loop iterations.", th);
5499 check_profitability = true;
5500 }
5501
5502 /* Version the loop first, if required, so the profitability check
5503 comes first. */
5504
5505 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5506 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5507 {
5508 vect_loop_versioning (loop_vinfo, th, check_profitability);
5509 check_profitability = false;
5510 }
5511
5512 /* Peel the loop if there are data refs with unknown alignment.
5513 Only one store data ref with unknown alignment is allowed. */
5514
5515 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5516 {
5517 vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
5518 check_profitability = false;
5519 }
5520
5521 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5522 compile time constant), or it is a constant that is not divisible by the
5523 vectorization factor, then an epilog loop needs to be created.
5524 We therefore duplicate the loop: the original loop will be vectorized,
5525 and will compute the first (n/VF) iterations. The second copy of the loop
5526 will remain scalar and will compute the remaining (n%VF) iterations.
5527 (VF is the vectorization factor). */
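/* For example, assuming VF == 4 and a symbolic trip count n, the result
   looks roughly like this (a sketch, not the exact generated code):

     for (i = 0; i < (n / 4) * 4; i += 4)
       ... vectorized body, 4 scalar iterations at a time ...
     for (; i < n; i++)
       ... original scalar body, the remaining n % 4 iterations ...  */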
5528
5529 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5530 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5531 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5532 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5533 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5534 th, check_profitability);
5535 else
5536 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5537 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5538
5539 /* 1) Make sure the loop header has exactly two entries
5540 2) Make sure we have a preheader basic block. */
5541
5542 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5543
5544 split_edge (loop_preheader_edge (loop));
5545
5546 /* FORNOW: the vectorizer supports only loops whose body consists
5547 of one basic block (header + empty latch). When the vectorizer
5548 supports more involved loop forms, the order in which the BBs are
5549 traversed needs to be reconsidered. */
5550
5551 for (i = 0; i < nbbs; i++)
5552 {
5553 basic_block bb = bbs[i];
5554 stmt_vec_info stmt_info;
5555 gimple phi;
5556
5557 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5558 {
5559 phi = gsi_stmt (si);
5560 if (dump_enabled_p ())
5561 {
5562 dump_printf_loc (MSG_NOTE, vect_location,
5563 "------>vectorizing phi: ");
5564 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5565 }
5566 stmt_info = vinfo_for_stmt (phi);
5567 if (!stmt_info)
5568 continue;
5569
5570 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5571 vect_loop_kill_debug_uses (loop, phi);
5572
5573 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5574 && !STMT_VINFO_LIVE_P (stmt_info))
5575 continue;
5576
5577 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5578 != (unsigned HOST_WIDE_INT) vectorization_factor)
5579 && dump_enabled_p ())
5580 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.");
5581
5582 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5583 {
5584 if (dump_enabled_p ())
5585 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.");
5586 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5587 }
5588 }
5589
5590 pattern_stmt = NULL;
5591 for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5592 {
5593 bool is_store;
5594
5595 if (transform_pattern_stmt)
5596 stmt = pattern_stmt;
5597 else
5598 stmt = gsi_stmt (si);
5599
5600 if (dump_enabled_p ())
5601 {
5602 dump_printf_loc (MSG_NOTE, vect_location,
5603 "------>vectorizing statement: ");
5604 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5605 }
5606
5607 stmt_info = vinfo_for_stmt (stmt);
5608
5609 /* vector stmts created in the outer-loop during vectorization of
5610 stmts in an inner-loop may not have a stmt_info, and do not
5611 need to be vectorized. */
5612 if (!stmt_info)
5613 {
5614 gsi_next (&si);
5615 continue;
5616 }
5617
5618 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5619 vect_loop_kill_debug_uses (loop, stmt);
5620
5621 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5622 && !STMT_VINFO_LIVE_P (stmt_info))
5623 {
5624 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5625 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5626 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5627 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5628 {
5629 stmt = pattern_stmt;
5630 stmt_info = vinfo_for_stmt (stmt);
5631 }
5632 else
5633 {
5634 gsi_next (&si);
5635 continue;
5636 }
5637 }
5638 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5639 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5640 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5641 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5642 transform_pattern_stmt = true;
5643
5644 /* If pattern statement has def stmts, vectorize them too. */
5645 if (is_pattern_stmt_p (stmt_info))
5646 {
5647 if (pattern_def_seq == NULL)
5648 {
5649 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5650 pattern_def_si = gsi_start (pattern_def_seq);
5651 }
5652 else if (!gsi_end_p (pattern_def_si))
5653 gsi_next (&pattern_def_si);
5654 if (pattern_def_seq != NULL)
5655 {
5656 gimple pattern_def_stmt = NULL;
5657 stmt_vec_info pattern_def_stmt_info = NULL;
5658
5659 while (!gsi_end_p (pattern_def_si))
5660 {
5661 pattern_def_stmt = gsi_stmt (pattern_def_si);
5662 pattern_def_stmt_info
5663 = vinfo_for_stmt (pattern_def_stmt);
5664 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5665 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5666 break;
5667 gsi_next (&pattern_def_si);
5668 }
5669
5670 if (!gsi_end_p (pattern_def_si))
5671 {
5672 if (dump_enabled_p ())
5673 {
5674 dump_printf_loc (MSG_NOTE, vect_location,
5675 "==> vectorizing pattern def "
5676 "stmt: ");
5677 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5678 pattern_def_stmt, 0);
5679 }
5680
5681 stmt = pattern_def_stmt;
5682 stmt_info = pattern_def_stmt_info;
5683 }
5684 else
5685 {
5686 pattern_def_si = gsi_none ();
5687 transform_pattern_stmt = false;
5688 }
5689 }
5690 else
5691 transform_pattern_stmt = false;
5692 }
5693
5694 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5695 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5696 STMT_VINFO_VECTYPE (stmt_info));
5697 if (!STMT_SLP_TYPE (stmt_info)
5698 && nunits != (unsigned int) vectorization_factor
5699 && dump_enabled_p ())
5700 /* For SLP, VF is set according to the unrolling factor, and not to
5701 the vector size, hence for SLP this print is not valid. */
5702 dump_printf_loc (MSG_NOTE, vect_location,
5703 "multiple-types.");
5704
5705 /* SLP. Schedule all the SLP instances when the first SLP stmt is
5706 reached. */
5707 if (STMT_SLP_TYPE (stmt_info))
5708 {
5709 if (!slp_scheduled)
5710 {
5711 slp_scheduled = true;
5712
5713 if (dump_enabled_p ())
5714 dump_printf_loc (MSG_NOTE, vect_location,
5715 "=== scheduling SLP instances ===");
5716
5717 vect_schedule_slp (loop_vinfo, NULL);
5718 }
5719
5720 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
5721 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5722 {
5723 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5724 {
5725 pattern_def_seq = NULL;
5726 gsi_next (&si);
5727 }
5728 continue;
5729 }
5730 }
5731
5732 /* -------- vectorize statement ------------ */
5733 if (dump_enabled_p ())
5734 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.");
5735
5736 grouped_store = false;
5737 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
5738 if (is_store)
5739 {
5740 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5741 {
5742 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5743 interleaving chain was completed - free all the stores in
5744 the chain. */
5745 gsi_next (&si);
5746 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5747 continue;
5748 }
5749 else
5750 {
5751 /* Free the attached stmt_vec_info and remove the stmt. */
5752 gimple store = gsi_stmt (si);
5753 free_stmt_vec_info (store);
5754 unlink_stmt_vdef (store);
5755 gsi_remove (&si, true);
5756 release_defs (store);
5757 continue;
5758 }
5759 }
5760
5761 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5762 {
5763 pattern_def_seq = NULL;
5764 gsi_next (&si);
5765 }
5766 } /* stmts in BB */
5767 } /* BBs in loop */
5768
5769 slpeel_make_loop_iterate_ntimes (loop, ratio);
5770
5771 /* Reduce loop iterations by the vectorization factor. */
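/* For example, with a vectorization factor of 4 and a recorded upper bound
   of 103 iterations, the vector loop's bound becomes 103 / 4 = 25; when
   peeling for gaps is required the last vector iteration is instead done by
   the scalar epilog loop, so one more iteration is subtracted below (the
   numbers here are illustrative).  */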
5772 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
5773 expected_iterations / vectorization_factor);
5774 loop->nb_iterations_upper_bound
5775 = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
5776 FLOOR_DIV_EXPR);
5777 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5778 && loop->nb_iterations_upper_bound != double_int_zero)
5779 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - double_int_one;
5780 if (loop->any_estimate)
5781 {
5782 loop->nb_iterations_estimate
5783 = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (vectorization_factor),
5784 FLOOR_DIV_EXPR);
5785 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5786 && loop->nb_iterations_estimate != double_int_zero)
5787 loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
5788 }
5789
5790 if (dump_enabled_p ())
5791 {
5792 dump_printf_loc (MSG_NOTE, vect_location,
5793 "LOOP VECTORIZED\n");
5794 if (loop->inner)
5795 dump_printf_loc (MSG_NOTE, vect_location,
5796 "OUTER LOOP VECTORIZED\n");
5797 }
5798 }