tree-vect-loop.c (get_initial_def_for_induction): Properly generate vector constants.
1 /* Loop Vectorization
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "basic-block.h"
30 #include "gimple-pretty-print.h"
31 #include "tree-flow.h"
32 #include "tree-pass.h"
33 #include "cfgloop.h"
34 #include "expr.h"
35 #include "recog.h"
36 #include "optabs.h"
37 #include "params.h"
38 #include "diagnostic-core.h"
39 #include "tree-chrec.h"
40 #include "tree-scalar-evolution.h"
41 #include "tree-vectorizer.h"
42 #include "target.h"
43
44 /* Loop Vectorization Pass.
45
46 This pass tries to vectorize loops.
47
48 For example, the vectorizer transforms the following simple loop:
49
50 short a[N]; short b[N]; short c[N]; int i;
51
52 for (i=0; i<N; i++){
53 a[i] = b[i] + c[i];
54 }
55
 56 as if it had been manually vectorized by rewriting the source code into:
57
58 typedef int __attribute__((mode(V8HI))) v8hi;
59 short a[N]; short b[N]; short c[N]; int i;
60 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
61 v8hi va, vb, vc;
62
63 for (i=0; i<N/8; i++){
64 vb = pb[i];
65 vc = pc[i];
66 va = vb + vc;
67 pa[i] = va;
68 }
69
70 The main entry to this pass is vectorize_loops(), in which
71 the vectorizer applies a set of analyses on a given set of loops,
72 followed by the actual vectorization transformation for the loops that
73 had successfully passed the analysis phase.
74 Throughout this pass we make a distinction between two types of
75 data: scalars (which are represented by SSA_NAMES), and memory references
76 ("data-refs"). These two types of data require different handling both
77 during analysis and transformation. The types of data-refs that the
 78 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
79 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
80 accesses are required to have a simple (consecutive) access pattern.
81
82 Analysis phase:
83 ===============
84 The driver for the analysis phase is vect_analyze_loop().
85 It applies a set of analyses, some of which rely on the scalar evolution
86 analyzer (scev) developed by Sebastian Pop.
87
88 During the analysis phase the vectorizer records some information
89 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
90 loop, as well as general information about the loop as a whole, which is
91 recorded in a "loop_vec_info" struct attached to each loop.
92
93 Transformation phase:
94 =====================
95 The loop transformation phase scans all the stmts in the loop, and
96 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
97 the loop that needs to be vectorized. It inserts the vector code sequence
98 just before the scalar stmt S, and records a pointer to the vector code
99 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
100 attached to S). This pointer will be used for the vectorization of following
101 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
102 otherwise, we rely on dead code elimination for removing it.
103
104 For example, say stmt S1 was vectorized into stmt VS1:
105
106 VS1: vb = px[i];
107 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
108 S2: a = b;
109
110 To vectorize stmt S2, the vectorizer first finds the stmt that defines
111 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
112 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
113 resulting sequence would be:
114
115 VS1: vb = px[i];
116 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
117 VS2: va = vb;
118 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
119
 120 Operands that are not SSA_NAMEs are data-refs that appear in
121 load/store operations (like 'x[i]' in S1), and are handled differently.
122
123 Target modeling:
124 =================
125 Currently the only target specific information that is used is the
126 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 127 Targets that can support different sizes of vectors will, for now, need
128 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
129 flexibility will be added in the future.
130
 131 Since we only vectorize operations whose vector form can be
132 expressed using existing tree codes, to verify that an operation is
133 supported, the vectorizer checks the relevant optab at the relevant
134 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
135 the value found is CODE_FOR_nothing, then there's no target support, and
136 we can't vectorize the stmt.
137
138 For additional information on this project see:
139 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
140 */
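/* For illustration only: a minimal sketch of the optab query described
   in the "Target modeling" section above (this is not code taken from
   the pass itself, just the general shape of such a check):

     enum insn_code icode = optab_handler (add_optab, V8HImode);
     if (icode == CODE_FOR_nothing)
       {
         /* No target support - the addition cannot be vectorized
            with 8 x 16-bit lanes.  */
       }
*/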
141
142 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
143
144 /* Function vect_determine_vectorization_factor
145
146 Determine the vectorization factor (VF). VF is the number of data elements
147 that are operated upon in parallel in a single iteration of the vectorized
 148 loop. For example, when vectorizing a loop that operates on 4-byte elements,
 149 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
150 elements can fit in a single vector register.
151
152 We currently support vectorization of loops in which all types operated upon
153 are of the same size. Therefore this function currently sets VF according to
154 the size of the types operated upon, and fails if there are multiple sizes
155 in the loop.
156
157 VF is also the factor by which the loop iterations are strip-mined, e.g.:
158 original loop:
159 for (i=0; i<N; i++){
160 a[i] = b[i] + c[i];
161 }
162
163 vectorized loop:
164 for (i=0; i<N; i+=VF){
165 a[i:VF] = b[i:VF] + c[i:VF];
166 }
167 */
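/* Worked example (hypothetical 16-byte vector target; the numbers are
   illustrative, not a statement about any particular port):

     int a[N]; short b[N];
     ...
     a[i] = a[i] + b[i];

   The smallest scalar type in the stmt is 'short' (2 bytes), giving a
   V8HI vectype with 8 units, while 'int' gives V4SI with 4 units.  VF
   is taken as the maximum number of units seen in the loop, here 8, so
   one vector iteration covers 8 scalar iterations (the 4-byte parts
   are handled with two V4SI operations per vector iteration).  */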
168
169 static bool
170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
171 {
172 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
173 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
174 int nbbs = loop->num_nodes;
175 gimple_stmt_iterator si;
176 unsigned int vectorization_factor = 0;
177 tree scalar_type;
178 gimple phi;
179 tree vectype;
180 unsigned int nunits;
181 stmt_vec_info stmt_info;
182 int i;
183 HOST_WIDE_INT dummy;
184 gimple stmt, pattern_stmt = NULL;
185 gimple_seq pattern_def_seq = NULL;
186 gimple_stmt_iterator pattern_def_si = gsi_none ();
187 bool analyze_pattern_stmt = false;
188
189 if (dump_enabled_p ())
190 dump_printf_loc (MSG_NOTE, vect_location,
191 "=== vect_determine_vectorization_factor ===");
192
193 for (i = 0; i < nbbs; i++)
194 {
195 basic_block bb = bbs[i];
196
197 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
198 {
199 phi = gsi_stmt (si);
200 stmt_info = vinfo_for_stmt (phi);
201 if (dump_enabled_p ())
202 {
203 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
204 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
205 }
206
207 gcc_assert (stmt_info);
208
209 if (STMT_VINFO_RELEVANT_P (stmt_info))
210 {
211 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
212 scalar_type = TREE_TYPE (PHI_RESULT (phi));
213
214 if (dump_enabled_p ())
215 {
216 dump_printf_loc (MSG_NOTE, vect_location,
217 "get vectype for scalar type: ");
218 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
219 }
220
221 vectype = get_vectype_for_scalar_type (scalar_type);
222 if (!vectype)
223 {
224 if (dump_enabled_p ())
225 {
226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
227 "not vectorized: unsupported "
228 "data-type ");
229 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
230 scalar_type);
231 }
232 return false;
233 }
234 STMT_VINFO_VECTYPE (stmt_info) = vectype;
235
236 if (dump_enabled_p ())
237 {
238 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
239 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
240 }
241
242 nunits = TYPE_VECTOR_SUBPARTS (vectype);
243 if (dump_enabled_p ())
244 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
245
246 if (!vectorization_factor
247 || (nunits > vectorization_factor))
248 vectorization_factor = nunits;
249 }
250 }
251
252 for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
253 {
254 tree vf_vectype;
255
256 if (analyze_pattern_stmt)
257 stmt = pattern_stmt;
258 else
259 stmt = gsi_stmt (si);
260
261 stmt_info = vinfo_for_stmt (stmt);
262
263 if (dump_enabled_p ())
264 {
265 dump_printf_loc (MSG_NOTE, vect_location,
266 "==> examining statement: ");
267 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
268 }
269
270 gcc_assert (stmt_info);
271
272 /* Skip stmts which do not need to be vectorized. */
273 if (!STMT_VINFO_RELEVANT_P (stmt_info)
274 && !STMT_VINFO_LIVE_P (stmt_info))
275 {
276 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
277 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
278 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
279 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
280 {
281 stmt = pattern_stmt;
282 stmt_info = vinfo_for_stmt (pattern_stmt);
283 if (dump_enabled_p ())
284 {
285 dump_printf_loc (MSG_NOTE, vect_location,
286 "==> examining pattern statement: ");
287 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
288 }
289 }
290 else
291 {
292 if (dump_enabled_p ())
293 dump_printf_loc (MSG_NOTE, vect_location, "skip.");
294 gsi_next (&si);
295 continue;
296 }
297 }
298 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
299 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
300 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
301 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
302 analyze_pattern_stmt = true;
303
304 /* If a pattern statement has def stmts, analyze them too. */
305 if (is_pattern_stmt_p (stmt_info))
306 {
307 if (pattern_def_seq == NULL)
308 {
309 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
310 pattern_def_si = gsi_start (pattern_def_seq);
311 }
312 else if (!gsi_end_p (pattern_def_si))
313 gsi_next (&pattern_def_si);
314 if (pattern_def_seq != NULL)
315 {
316 gimple pattern_def_stmt = NULL;
317 stmt_vec_info pattern_def_stmt_info = NULL;
318
319 while (!gsi_end_p (pattern_def_si))
320 {
321 pattern_def_stmt = gsi_stmt (pattern_def_si);
322 pattern_def_stmt_info
323 = vinfo_for_stmt (pattern_def_stmt);
324 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
325 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
326 break;
327 gsi_next (&pattern_def_si);
328 }
329
330 if (!gsi_end_p (pattern_def_si))
331 {
332 if (dump_enabled_p ())
333 {
334 dump_printf_loc (MSG_NOTE, vect_location,
335 "==> examining pattern def stmt: ");
336 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
337 pattern_def_stmt, 0);
338 }
339
340 stmt = pattern_def_stmt;
341 stmt_info = pattern_def_stmt_info;
342 }
343 else
344 {
345 pattern_def_si = gsi_none ();
346 analyze_pattern_stmt = false;
347 }
348 }
349 else
350 analyze_pattern_stmt = false;
351 }
352
353 if (gimple_get_lhs (stmt) == NULL_TREE)
354 {
355 if (dump_enabled_p ())
356 {
357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
358 "not vectorized: irregular stmt.");
359 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
360 0);
361 }
362 return false;
363 }
364
365 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
366 {
367 if (dump_enabled_p ())
368 {
369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
370 "not vectorized: vector stmt in loop:");
371 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
372 }
373 return false;
374 }
375
376 if (STMT_VINFO_VECTYPE (stmt_info))
377 {
 378 /* The only case when a vectype has already been set is for stmts
379 that contain a dataref, or for "pattern-stmts" (stmts
380 generated by the vectorizer to represent/replace a certain
381 idiom). */
382 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
383 || is_pattern_stmt_p (stmt_info)
384 || !gsi_end_p (pattern_def_si));
385 vectype = STMT_VINFO_VECTYPE (stmt_info);
386 }
387 else
388 {
389 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
390 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
391 if (dump_enabled_p ())
392 {
393 dump_printf_loc (MSG_NOTE, vect_location,
394 "get vectype for scalar type: ");
395 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
396 }
397 vectype = get_vectype_for_scalar_type (scalar_type);
398 if (!vectype)
399 {
400 if (dump_enabled_p ())
401 {
402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
403 "not vectorized: unsupported "
404 "data-type ");
405 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
406 scalar_type);
407 }
408 return false;
409 }
410
411 STMT_VINFO_VECTYPE (stmt_info) = vectype;
412
413 if (dump_enabled_p ())
414 {
415 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
416 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
417 }
418 }
419
 420 /* The vectorization factor is determined by the smallest
421 scalar type (or the largest vector size, but we only
422 support one vector size per loop). */
423 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
424 &dummy);
425 if (dump_enabled_p ())
426 {
427 dump_printf_loc (MSG_NOTE, vect_location,
428 "get vectype for scalar type: ");
429 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
430 }
431 vf_vectype = get_vectype_for_scalar_type (scalar_type);
432 if (!vf_vectype)
433 {
434 if (dump_enabled_p ())
435 {
436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
437 "not vectorized: unsupported data-type ");
438 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
439 scalar_type);
440 }
441 return false;
442 }
443
444 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
445 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
446 {
447 if (dump_enabled_p ())
448 {
449 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
450 "not vectorized: different sized vector "
451 "types in statement, ");
452 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
453 vectype);
454 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
455 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
456 vf_vectype);
457 }
458 return false;
459 }
460
461 if (dump_enabled_p ())
462 {
463 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
464 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
465 }
466
467 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
470 if (!vectorization_factor
471 || (nunits > vectorization_factor))
472 vectorization_factor = nunits;
473
474 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
475 {
476 pattern_def_seq = NULL;
477 gsi_next (&si);
478 }
479 }
480 }
481
482 /* TODO: Analyze cost. Decide if worth while to vectorize. */
483 if (dump_enabled_p ())
484 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d",
485 vectorization_factor);
486 if (vectorization_factor <= 1)
487 {
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
490 "not vectorized: unsupported data-type");
491 return false;
492 }
493 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
494
495 return true;
496 }
497
498
499 /* Function vect_is_simple_iv_evolution.
500
 501 FORNOW: A simple evolution of an induction variable in the loop is
502 considered a polynomial evolution with constant step. */
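/* For example (illustrative), for the loop

     for (i = k; i < n; i += 3)

   the evolution of 'i' is the polynomial chrec {k, +, 3}_1: the initial
   value is 'k' and the step is the constant 3, so the evolution is
   "simple".  If the step were not a compile-time constant (e.g.
   'i += s'), or the evolution were a chrec of degree >= 2, the function
   below would reject it.  */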
503
504 static bool
505 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
506 tree * step)
507 {
508 tree init_expr;
509 tree step_expr;
510 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
511
512 /* When there is no evolution in this loop, the evolution function
513 is not "simple". */
514 if (evolution_part == NULL_TREE)
515 return false;
516
517 /* When the evolution is a polynomial of degree >= 2
518 the evolution function is not "simple". */
519 if (tree_is_chrec (evolution_part))
520 return false;
521
522 step_expr = evolution_part;
523 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
524
525 if (dump_enabled_p ())
526 {
527 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
528 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
529 dump_printf (MSG_NOTE, ", init: ");
530 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
531 }
532
533 *init = init_expr;
534 *step = step_expr;
535
536 if (TREE_CODE (step_expr) != INTEGER_CST)
537 {
538 if (dump_enabled_p ())
539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
540 "step unknown.");
541 return false;
542 }
543
544 return true;
545 }
546
547 /* Function vect_analyze_scalar_cycles_1.
548
549 Examine the cross iteration def-use cycles of scalar variables
550 in LOOP. LOOP_VINFO represents the loop that is now being
551 considered for vectorization (can be LOOP, or an outer-loop
552 enclosing LOOP). */
553
554 static void
555 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
556 {
557 basic_block bb = loop->header;
558 tree dumy;
559 vec<gimple> worklist;
560 worklist.create (64);
561 gimple_stmt_iterator gsi;
562 bool double_reduc;
563
564 if (dump_enabled_p ())
565 dump_printf_loc (MSG_NOTE, vect_location,
566 "=== vect_analyze_scalar_cycles ===");
567
568 /* First - identify all inductions. Reduction detection assumes that all the
569 inductions have been identified, therefore, this order must not be
570 changed. */
571 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
572 {
573 gimple phi = gsi_stmt (gsi);
574 tree access_fn = NULL;
575 tree def = PHI_RESULT (phi);
576 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
577
578 if (dump_enabled_p ())
579 {
580 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
581 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
582 }
583
 584 /* Skip virtual phis. The data dependences that are associated with
585 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
586 if (virtual_operand_p (def))
587 continue;
588
589 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
590
591 /* Analyze the evolution function. */
592 access_fn = analyze_scalar_evolution (loop, def);
593 if (access_fn)
594 {
595 STRIP_NOPS (access_fn);
596 if (dump_enabled_p ())
597 {
598 dump_printf_loc (MSG_NOTE, vect_location,
599 "Access function of PHI: ");
600 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
601 }
602 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
603 = evolution_part_in_loop_num (access_fn, loop->num);
604 }
605
606 if (!access_fn
607 || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
608 {
609 worklist.safe_push (phi);
610 continue;
611 }
612
613 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
614
615 if (dump_enabled_p ())
616 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.");
617 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
618 }
619
620
621 /* Second - identify all reductions and nested cycles. */
622 while (worklist.length () > 0)
623 {
624 gimple phi = worklist.pop ();
625 tree def = PHI_RESULT (phi);
626 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
627 gimple reduc_stmt;
628 bool nested_cycle;
629
630 if (dump_enabled_p ())
631 {
632 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
633 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
634 }
635
636 gcc_assert (!virtual_operand_p (def)
637 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
638
639 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
640 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
641 &double_reduc);
642 if (reduc_stmt)
643 {
644 if (double_reduc)
645 {
646 if (dump_enabled_p ())
647 dump_printf_loc (MSG_NOTE, vect_location,
648 "Detected double reduction.");
649
650 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
651 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
652 vect_double_reduction_def;
653 }
654 else
655 {
656 if (nested_cycle)
657 {
658 if (dump_enabled_p ())
659 dump_printf_loc (MSG_NOTE, vect_location,
660 "Detected vectorizable nested cycle.");
661
662 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
663 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
664 vect_nested_cycle;
665 }
666 else
667 {
668 if (dump_enabled_p ())
669 dump_printf_loc (MSG_NOTE, vect_location,
670 "Detected reduction.");
671
672 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
673 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
674 vect_reduction_def;
675 /* Store the reduction cycles for possible vectorization in
676 loop-aware SLP. */
677 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
678 }
679 }
680 }
681 else
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
684 "Unknown def-use cycle pattern.");
685 }
686
687 worklist.release ();
688 }
689
690
691 /* Function vect_analyze_scalar_cycles.
692
693 Examine the cross iteration def-use cycles of scalar variables, by
694 analyzing the loop-header PHIs of scalar variables. Classify each
695 cycle as one of the following: invariant, induction, reduction, unknown.
 696 We do that for the loop represented by LOOP_VINFO, and also for its
 697 inner-loop, if it exists.
698 Examples for scalar cycles:
699
700 Example1: reduction:
701
702 loop1:
703 for (i=0; i<N; i++)
704 sum += a[i];
705
706 Example2: induction:
707
708 loop2:
709 for (i=0; i<N; i++)
710 a[i] = i; */
711
712 static void
713 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
714 {
715 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
716
717 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
718
719 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
720 Reductions in such inner-loop therefore have different properties than
721 the reductions in the nest that gets vectorized:
722 1. When vectorized, they are executed in the same order as in the original
723 scalar loop, so we can't change the order of computation when
724 vectorizing them.
725 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
726 current checks are too strict. */
727
728 if (loop->inner)
729 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
730 }
731
732 /* Function vect_get_loop_niters.
733
 734 Determine how many iterations the loop executes.
735 If an expression that represents the number of iterations
736 can be constructed, place it in NUMBER_OF_ITERATIONS.
737 Return the loop exit condition. */
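/* For illustration: for a simple counted loop

     for (i = 0; i < n; i++)
       a[i] = b[i];

   the expression stored in NUMBER_OF_ITERATIONS is (equivalent to) 'n',
   and the statement returned is the GIMPLE_COND that controls the loop
   exit.  (Illustrative only; the exact expression depends on how the
   loop bound is written.)  */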
738
739 static gimple
740 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
741 {
742 tree niters;
743
744 if (dump_enabled_p ())
745 dump_printf_loc (MSG_NOTE, vect_location,
746 "=== get_loop_niters ===");
747 niters = number_of_exit_cond_executions (loop);
748
749 if (niters != NULL_TREE
750 && niters != chrec_dont_know)
751 {
752 *number_of_iterations = niters;
753
754 if (dump_enabled_p ())
755 {
756 dump_printf_loc (MSG_NOTE, vect_location, "==> get_loop_niters:");
757 dump_generic_expr (MSG_NOTE, TDF_SLIM, *number_of_iterations);
758 }
759 }
760
761 return get_loop_exit_condition (loop);
762 }
763
764
765 /* Function bb_in_loop_p
766
767 Used as predicate for dfs order traversal of the loop bbs. */
768
769 static bool
770 bb_in_loop_p (const_basic_block bb, const void *data)
771 {
772 const struct loop *const loop = (const struct loop *)data;
773 if (flow_bb_inside_loop_p (loop, bb))
774 return true;
775 return false;
776 }
777
778
779 /* Function new_loop_vec_info.
780
781 Create and initialize a new loop_vec_info struct for LOOP, as well as
782 stmt_vec_info structs for all the stmts in LOOP. */
783
784 static loop_vec_info
785 new_loop_vec_info (struct loop *loop)
786 {
787 loop_vec_info res;
788 basic_block *bbs;
789 gimple_stmt_iterator si;
790 unsigned int i, nbbs;
791
792 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
793 LOOP_VINFO_LOOP (res) = loop;
794
795 bbs = get_loop_body (loop);
796
797 /* Create/Update stmt_info for all stmts in the loop. */
798 for (i = 0; i < loop->num_nodes; i++)
799 {
800 basic_block bb = bbs[i];
801
802 /* BBs in a nested inner-loop will have been already processed (because
803 we will have called vect_analyze_loop_form for any nested inner-loop).
804 Therefore, for stmts in an inner-loop we just want to update the
805 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
806 loop_info of the outer-loop we are currently considering to vectorize
807 (instead of the loop_info of the inner-loop).
808 For stmts in other BBs we need to create a stmt_info from scratch. */
809 if (bb->loop_father != loop)
810 {
811 /* Inner-loop bb. */
812 gcc_assert (loop->inner && bb->loop_father == loop->inner);
813 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
814 {
815 gimple phi = gsi_stmt (si);
816 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
817 loop_vec_info inner_loop_vinfo =
818 STMT_VINFO_LOOP_VINFO (stmt_info);
819 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
820 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
821 }
822 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
823 {
824 gimple stmt = gsi_stmt (si);
825 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
826 loop_vec_info inner_loop_vinfo =
827 STMT_VINFO_LOOP_VINFO (stmt_info);
828 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
829 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
830 }
831 }
832 else
833 {
834 /* bb in current nest. */
835 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
836 {
837 gimple phi = gsi_stmt (si);
838 gimple_set_uid (phi, 0);
839 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
840 }
841
842 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
843 {
844 gimple stmt = gsi_stmt (si);
845 gimple_set_uid (stmt, 0);
846 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
847 }
848 }
849 }
850
851 /* CHECKME: We want to visit all BBs before their successors (except for
852 latch blocks, for which this assertion wouldn't hold). In the simple
 853 case of the loop forms we allow, a dfs order of the BBs would be the same
854 as reversed postorder traversal, so we are safe. */
855
856 free (bbs);
857 bbs = XCNEWVEC (basic_block, loop->num_nodes);
858 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
859 bbs, loop->num_nodes, loop);
860 gcc_assert (nbbs == loop->num_nodes);
861
862 LOOP_VINFO_BBS (res) = bbs;
863 LOOP_VINFO_NITERS (res) = NULL;
864 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
865 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
866 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
867 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
868 LOOP_VINFO_VECT_FACTOR (res) = 0;
869 LOOP_VINFO_LOOP_NEST (res).create (3);
870 LOOP_VINFO_DATAREFS (res).create (10);
871 LOOP_VINFO_DDRS (res).create (10 * 10);
872 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
873 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
874 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
875 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
876 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
877 LOOP_VINFO_GROUPED_STORES (res).create (10);
878 LOOP_VINFO_REDUCTIONS (res).create (10);
879 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
880 LOOP_VINFO_SLP_INSTANCES (res).create (10);
881 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
882 LOOP_VINFO_PEELING_HTAB (res) = NULL;
883 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
884 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
885 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
886
887 return res;
888 }
889
890
891 /* Function destroy_loop_vec_info.
892
893 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
894 stmts in the loop. */
895
896 void
897 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
898 {
899 struct loop *loop;
900 basic_block *bbs;
901 int nbbs;
902 gimple_stmt_iterator si;
903 int j;
904 vec<slp_instance> slp_instances;
905 slp_instance instance;
906 bool swapped;
907
908 if (!loop_vinfo)
909 return;
910
911 loop = LOOP_VINFO_LOOP (loop_vinfo);
912
913 bbs = LOOP_VINFO_BBS (loop_vinfo);
914 nbbs = clean_stmts ? loop->num_nodes : 0;
915 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
916
917 for (j = 0; j < nbbs; j++)
918 {
919 basic_block bb = bbs[j];
920 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
921 free_stmt_vec_info (gsi_stmt (si));
922
923 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
924 {
925 gimple stmt = gsi_stmt (si);
926
927 /* We may have broken canonical form by moving a constant
928 into RHS1 of a commutative op. Fix such occurrences. */
929 if (swapped && is_gimple_assign (stmt))
930 {
931 enum tree_code code = gimple_assign_rhs_code (stmt);
932
933 if ((code == PLUS_EXPR
934 || code == POINTER_PLUS_EXPR
935 || code == MULT_EXPR)
936 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
937 swap_tree_operands (stmt,
938 gimple_assign_rhs1_ptr (stmt),
939 gimple_assign_rhs2_ptr (stmt));
940 }
941
942 /* Free stmt_vec_info. */
943 free_stmt_vec_info (stmt);
944 gsi_next (&si);
945 }
946 }
947
948 free (LOOP_VINFO_BBS (loop_vinfo));
949 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
950 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
951 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
952 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
953 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
954 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
955 FOR_EACH_VEC_ELT (slp_instances, j, instance)
956 vect_free_slp_instance (instance);
957
958 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
959 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
960 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
961 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
962
963 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
964 htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
965
966 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
967
968 free (loop_vinfo);
969 loop->aux = NULL;
970 }
971
972
973 /* Function vect_analyze_loop_1.
974
975 Apply a set of analyses on LOOP, and create a loop_vec_info struct
976 for it. The different analyses will record information in the
977 loop_vec_info struct. This is a subset of the analyses applied in
978 vect_analyze_loop, to be applied on an inner-loop nested in the loop
979 that is now considered for (outer-loop) vectorization. */
980
981 static loop_vec_info
982 vect_analyze_loop_1 (struct loop *loop)
983 {
984 loop_vec_info loop_vinfo;
985
986 if (dump_enabled_p ())
987 dump_printf_loc (MSG_NOTE, vect_location,
988 "===== analyze_loop_nest_1 =====");
989
 990 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
991
992 loop_vinfo = vect_analyze_loop_form (loop);
993 if (!loop_vinfo)
994 {
995 if (dump_enabled_p ())
996 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
997 "bad inner-loop form.");
998 return NULL;
999 }
1000
1001 return loop_vinfo;
1002 }
1003
1004
1005 /* Function vect_analyze_loop_form.
1006
1007 Verify that certain CFG restrictions hold, including:
1008 - the loop has a pre-header
1009 - the loop has a single entry and exit
1010 - the loop exit condition is simple enough, and the number of iterations
1011 can be analyzed (a countable loop). */
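/* For illustration, a loop such as

     for (i = 0; i < n; i++)
       {
         if (a[i] == 0)
           break;      /* second exit */
         a[i] = b[i];
       }

   is rejected below because it has multiple exits, whereas the same
   loop without the 'break' satisfies the single-entry/single-exit
   requirement.  (Hypothetical example, not from a testcase.)  */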
1012
1013 loop_vec_info
1014 vect_analyze_loop_form (struct loop *loop)
1015 {
1016 loop_vec_info loop_vinfo;
1017 gimple loop_cond;
1018 tree number_of_iterations = NULL;
1019 loop_vec_info inner_loop_vinfo = NULL;
1020
1021 if (dump_enabled_p ())
1022 dump_printf_loc (MSG_NOTE, vect_location,
1023 "=== vect_analyze_loop_form ===");
1024
1025 /* Different restrictions apply when we are considering an inner-most loop,
1026 vs. an outer (nested) loop.
1027 (FORNOW. May want to relax some of these restrictions in the future). */
1028
1029 if (!loop->inner)
1030 {
1031 /* Inner-most loop. We currently require that the number of BBs is
1032 exactly 2 (the header and latch). Vectorizable inner-most loops
1033 look like this:
1034
1035 (pre-header)
1036 |
1037 header <--------+
1038 | | |
1039 | +--> latch --+
1040 |
1041 (exit-bb) */
1042
1043 if (loop->num_nodes != 2)
1044 {
1045 if (dump_enabled_p ())
1046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1047 "not vectorized: control flow in loop.");
1048 return NULL;
1049 }
1050
1051 if (empty_block_p (loop->header))
1052 {
1053 if (dump_enabled_p ())
1054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1055 "not vectorized: empty loop.");
1056 return NULL;
1057 }
1058 }
1059 else
1060 {
1061 struct loop *innerloop = loop->inner;
1062 edge entryedge;
1063
1064 /* Nested loop. We currently require that the loop is doubly-nested,
1065 contains a single inner loop, and the number of BBs is exactly 5.
1066 Vectorizable outer-loops look like this:
1067
1068 (pre-header)
1069 |
1070 header <---+
1071 | |
1072 inner-loop |
1073 | |
1074 tail ------+
1075 |
1076 (exit-bb)
1077
1078 The inner-loop has the properties expected of inner-most loops
1079 as described above. */
1080
1081 if ((loop->inner)->inner || (loop->inner)->next)
1082 {
1083 if (dump_enabled_p ())
1084 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1085 "not vectorized: multiple nested loops.");
1086 return NULL;
1087 }
1088
1089 /* Analyze the inner-loop. */
1090 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1091 if (!inner_loop_vinfo)
1092 {
1093 if (dump_enabled_p ())
1094 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1095 "not vectorized: Bad inner loop.");
1096 return NULL;
1097 }
1098
1099 if (!expr_invariant_in_loop_p (loop,
1100 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1101 {
1102 if (dump_enabled_p ())
1103 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1104 "not vectorized: inner-loop count not invariant.");
1105 destroy_loop_vec_info (inner_loop_vinfo, true);
1106 return NULL;
1107 }
1108
1109 if (loop->num_nodes != 5)
1110 {
1111 if (dump_enabled_p ())
1112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1113 "not vectorized: control flow in loop.");
1114 destroy_loop_vec_info (inner_loop_vinfo, true);
1115 return NULL;
1116 }
1117
1118 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1119 entryedge = EDGE_PRED (innerloop->header, 0);
1120 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1121 entryedge = EDGE_PRED (innerloop->header, 1);
1122
1123 if (entryedge->src != loop->header
1124 || !single_exit (innerloop)
1125 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1126 {
1127 if (dump_enabled_p ())
1128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1129 "not vectorized: unsupported outerloop form.");
1130 destroy_loop_vec_info (inner_loop_vinfo, true);
1131 return NULL;
1132 }
1133
1134 if (dump_enabled_p ())
1135 dump_printf_loc (MSG_NOTE, vect_location,
1136 "Considering outer-loop vectorization.");
1137 }
1138
1139 if (!single_exit (loop)
1140 || EDGE_COUNT (loop->header->preds) != 2)
1141 {
1142 if (dump_enabled_p ())
1143 {
1144 if (!single_exit (loop))
1145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1146 "not vectorized: multiple exits.");
1147 else if (EDGE_COUNT (loop->header->preds) != 2)
1148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1149 "not vectorized: too many incoming edges.");
1150 }
1151 if (inner_loop_vinfo)
1152 destroy_loop_vec_info (inner_loop_vinfo, true);
1153 return NULL;
1154 }
1155
 1156 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1157 that the loop is represented as a do-while (with a proper if-guard
1158 before the loop if needed), where the loop header contains all the
1159 executable statements, and the latch is empty. */
1160 if (!empty_block_p (loop->latch)
1161 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1162 {
1163 if (dump_enabled_p ())
1164 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1165 "not vectorized: latch block not empty.");
1166 if (inner_loop_vinfo)
1167 destroy_loop_vec_info (inner_loop_vinfo, true);
1168 return NULL;
1169 }
1170
1171 /* Make sure there exists a single-predecessor exit bb: */
1172 if (!single_pred_p (single_exit (loop)->dest))
1173 {
1174 edge e = single_exit (loop);
1175 if (!(e->flags & EDGE_ABNORMAL))
1176 {
1177 split_loop_exit_edge (e);
1178 if (dump_enabled_p ())
1179 dump_printf (MSG_NOTE, "split exit edge.");
1180 }
1181 else
1182 {
1183 if (dump_enabled_p ())
1184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1185 "not vectorized: abnormal loop exit edge.");
1186 if (inner_loop_vinfo)
1187 destroy_loop_vec_info (inner_loop_vinfo, true);
1188 return NULL;
1189 }
1190 }
1191
1192 loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1193 if (!loop_cond)
1194 {
1195 if (dump_enabled_p ())
1196 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1197 "not vectorized: complicated exit condition.");
1198 if (inner_loop_vinfo)
1199 destroy_loop_vec_info (inner_loop_vinfo, true);
1200 return NULL;
1201 }
1202
1203 if (!number_of_iterations)
1204 {
1205 if (dump_enabled_p ())
1206 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1207 "not vectorized: number of iterations cannot be "
1208 "computed.");
1209 if (inner_loop_vinfo)
1210 destroy_loop_vec_info (inner_loop_vinfo, true);
1211 return NULL;
1212 }
1213
1214 if (chrec_contains_undetermined (number_of_iterations))
1215 {
1216 if (dump_enabled_p ())
1217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1218 "Infinite number of iterations.");
1219 if (inner_loop_vinfo)
1220 destroy_loop_vec_info (inner_loop_vinfo, true);
1221 return NULL;
1222 }
1223
1224 if (!NITERS_KNOWN_P (number_of_iterations))
1225 {
1226 if (dump_enabled_p ())
1227 {
1228 dump_printf_loc (MSG_NOTE, vect_location,
1229 "Symbolic number of iterations is ");
1230 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1231 }
1232 }
1233 else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1234 {
1235 if (dump_enabled_p ())
1236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1237 "not vectorized: number of iterations = 0.");
1238 if (inner_loop_vinfo)
1239 destroy_loop_vec_info (inner_loop_vinfo, true);
1240 return NULL;
1241 }
1242
1243 loop_vinfo = new_loop_vec_info (loop);
1244 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1245 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1246
1247 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1248
 1249 /* CHECKME: May want to keep it around in the future. */
1250 if (inner_loop_vinfo)
1251 destroy_loop_vec_info (inner_loop_vinfo, false);
1252
1253 gcc_assert (!loop->aux);
1254 loop->aux = loop_vinfo;
1255 return loop_vinfo;
1256 }
1257
1258
1259 /* Function vect_analyze_loop_operations.
1260
1261 Scan the loop stmts and make sure they are all vectorizable. */
1262
1263 static bool
1264 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1265 {
1266 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1267 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1268 int nbbs = loop->num_nodes;
1269 gimple_stmt_iterator si;
1270 unsigned int vectorization_factor = 0;
1271 int i;
1272 gimple phi;
1273 stmt_vec_info stmt_info;
1274 bool need_to_vectorize = false;
1275 int min_profitable_iters;
1276 int min_scalar_loop_bound;
1277 unsigned int th;
1278 bool only_slp_in_loop = true, ok;
1279 HOST_WIDE_INT max_niter;
1280 HOST_WIDE_INT estimated_niter;
1281 int min_profitable_estimate;
1282
1283 if (dump_enabled_p ())
1284 dump_printf_loc (MSG_NOTE, vect_location,
1285 "=== vect_analyze_loop_operations ===");
1286
1287 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1288 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1289 if (slp)
1290 {
1291 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1292 vectorization factor of the loop is the unrolling factor required by
 1293 the SLP instances. If that unrolling factor is 1, we say that we
 1294 perform pure SLP on the loop - cross iteration parallelism is not
1295 exploited. */
1296 for (i = 0; i < nbbs; i++)
1297 {
1298 basic_block bb = bbs[i];
1299 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1300 {
1301 gimple stmt = gsi_stmt (si);
1302 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1303 gcc_assert (stmt_info);
1304 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1305 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1306 && !PURE_SLP_STMT (stmt_info))
1307 /* STMT needs both SLP and loop-based vectorization. */
1308 only_slp_in_loop = false;
1309 }
1310 }
1311
1312 if (only_slp_in_loop)
1313 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1314 else
1315 vectorization_factor = least_common_multiple (vectorization_factor,
1316 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1317
1318 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1319 if (dump_enabled_p ())
1320 dump_printf_loc (MSG_NOTE, vect_location,
1321 "Updating vectorization factor to %d ",
1322 vectorization_factor);
1323 }
1324
1325 for (i = 0; i < nbbs; i++)
1326 {
1327 basic_block bb = bbs[i];
1328
1329 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1330 {
1331 phi = gsi_stmt (si);
1332 ok = true;
1333
1334 stmt_info = vinfo_for_stmt (phi);
1335 if (dump_enabled_p ())
1336 {
1337 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1338 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1339 }
1340
1341 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1342 (i.e., a phi in the tail of the outer-loop). */
1343 if (! is_loop_header_bb_p (bb))
1344 {
1345 /* FORNOW: we currently don't support the case that these phis
1346 are not used in the outerloop (unless it is double reduction,
 1347 i.e., this phi is vect_reduction_def), because this case
 1348 requires us to actually do something here. */
1349 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1350 || STMT_VINFO_LIVE_P (stmt_info))
1351 && STMT_VINFO_DEF_TYPE (stmt_info)
1352 != vect_double_reduction_def)
1353 {
1354 if (dump_enabled_p ())
1355 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1356 "Unsupported loop-closed phi in "
1357 "outer-loop.");
1358 return false;
1359 }
1360
1361 /* If PHI is used in the outer loop, we check that its operand
1362 is defined in the inner loop. */
1363 if (STMT_VINFO_RELEVANT_P (stmt_info))
1364 {
1365 tree phi_op;
1366 gimple op_def_stmt;
1367
1368 if (gimple_phi_num_args (phi) != 1)
1369 return false;
1370
1371 phi_op = PHI_ARG_DEF (phi, 0);
1372 if (TREE_CODE (phi_op) != SSA_NAME)
1373 return false;
1374
1375 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1376 if (!op_def_stmt
1377 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1378 || !vinfo_for_stmt (op_def_stmt))
1379 return false;
1380
1381 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1382 != vect_used_in_outer
1383 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1384 != vect_used_in_outer_by_reduction)
1385 return false;
1386 }
1387
1388 continue;
1389 }
1390
1391 gcc_assert (stmt_info);
1392
1393 if (STMT_VINFO_LIVE_P (stmt_info))
1394 {
1395 /* FORNOW: not yet supported. */
1396 if (dump_enabled_p ())
1397 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1398 "not vectorized: value used after loop.");
1399 return false;
1400 }
1401
1402 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1403 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1404 {
1405 /* A scalar-dependence cycle that we don't support. */
1406 if (dump_enabled_p ())
1407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1408 "not vectorized: scalar dependence cycle.");
1409 return false;
1410 }
1411
1412 if (STMT_VINFO_RELEVANT_P (stmt_info))
1413 {
1414 need_to_vectorize = true;
1415 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1416 ok = vectorizable_induction (phi, NULL, NULL);
1417 }
1418
1419 if (!ok)
1420 {
1421 if (dump_enabled_p ())
1422 {
1423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1424 "not vectorized: relevant phi not "
1425 "supported: ");
1426 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1427 }
1428 return false;
1429 }
1430 }
1431
1432 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1433 {
1434 gimple stmt = gsi_stmt (si);
1435 if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1436 return false;
1437 }
1438 } /* bbs */
1439
 1440 /* All operations in the loop are either irrelevant (they deal with loop
 1441 control, or are dead), or only used outside the loop and can be moved
1442 out of the loop (e.g. invariants, inductions). The loop can be
1443 optimized away by scalar optimizations. We're better off not
1444 touching this loop. */
1445 if (!need_to_vectorize)
1446 {
1447 if (dump_enabled_p ())
1448 dump_printf_loc (MSG_NOTE, vect_location,
1449 "All the computation can be taken out of the loop.");
1450 if (dump_enabled_p ())
1451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1452 "not vectorized: redundant loop. no profit to "
1453 "vectorize.");
1454 return false;
1455 }
1456
1457 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1458 dump_printf_loc (MSG_NOTE, vect_location,
1459 "vectorization_factor = %d, niters = "
1460 HOST_WIDE_INT_PRINT_DEC, vectorization_factor,
1461 LOOP_VINFO_INT_NITERS (loop_vinfo));
1462
1463 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1464 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1465 || ((max_niter = max_stmt_executions_int (loop)) != -1
1466 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1467 {
1468 if (dump_enabled_p ())
1469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1470 "not vectorized: iteration count too small.");
1471 if (dump_enabled_p ())
1472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1473 "not vectorized: iteration count smaller than "
1474 "vectorization factor.");
1475 return false;
1476 }
1477
1478 /* Analyze cost. Decide if worth while to vectorize. */
1479
1480 /* Once VF is set, SLP costs should be updated since the number of created
1481 vector stmts depends on VF. */
1482 vect_update_slp_costs_according_to_vf (loop_vinfo);
1483
1484 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1485 &min_profitable_estimate);
1486 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1487
1488 if (min_profitable_iters < 0)
1489 {
1490 if (dump_enabled_p ())
1491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1492 "not vectorized: vectorization not profitable.");
1493 if (dump_enabled_p ())
1494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1495 "not vectorized: vector version will never be "
1496 "profitable.");
1497 return false;
1498 }
1499
1500 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1501 * vectorization_factor) - 1);
1502
1503
 1504 /* Use the cost model only if it is more conservative than the user-specified
1505 threshold. */
1506
1507 th = (unsigned) min_scalar_loop_bound;
1508 if (min_profitable_iters
1509 && (!min_scalar_loop_bound
1510 || min_profitable_iters > min_scalar_loop_bound))
1511 th = (unsigned) min_profitable_iters;
1512
1513 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1514 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1515 {
1516 if (dump_enabled_p ())
1517 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1518 "not vectorized: vectorization not profitable.");
1519 if (dump_enabled_p ())
1520 dump_printf_loc (MSG_NOTE, vect_location,
1521 "not vectorized: iteration count smaller than user "
1522 "specified loop bound parameter or minimum profitable "
1523 "iterations (whichever is more conservative).");
1524 return false;
1525 }
1526
1527 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1528 && ((unsigned HOST_WIDE_INT) estimated_niter
1529 <= MAX (th, (unsigned)min_profitable_estimate)))
1530 {
1531 if (dump_enabled_p ())
1532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1533 "not vectorized: estimated iteration count too "
1534 "small.");
1535 if (dump_enabled_p ())
1536 dump_printf_loc (MSG_NOTE, vect_location,
1537 "not vectorized: estimated iteration count smaller "
1538 "than specified loop bound parameter or minimum "
1539 "profitable iterations (whichever is more "
1540 "conservative).");
1541 return false;
1542 }
1543
1544 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1545 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1546 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1547 {
1548 if (dump_enabled_p ())
1549 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.");
1550 if (!vect_can_advance_ivs_p (loop_vinfo))
1551 {
1552 if (dump_enabled_p ())
1553 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1554 "not vectorized: can't create epilog loop 1.");
1555 return false;
1556 }
1557 if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1558 {
1559 if (dump_enabled_p ())
1560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1561 "not vectorized: can't create epilog loop 2.");
1562 return false;
1563 }
1564 }
1565
1566 return true;
1567 }
1568
1569
1570 /* Function vect_analyze_loop_2.
1571
1572 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1573 for it. The different analyses will record information in the
1574 loop_vec_info struct. */
1575 static bool
1576 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1577 {
1578 bool ok, slp = false;
1579 int max_vf = MAX_VECTORIZATION_FACTOR;
1580 int min_vf = 2;
1581
1582 /* Find all data references in the loop (which correspond to vdefs/vuses)
1583 and analyze their evolution in the loop. Also adjust the minimal
1584 vectorization factor according to the loads and stores.
1585
 1586 FORNOW: Handle only simple array references whose
 1587 alignment can be forced, and aligned pointer-references. */
1588
1589 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1590 if (!ok)
1591 {
1592 if (dump_enabled_p ())
1593 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1594 "bad data references.");
1595 return false;
1596 }
1597
1598 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1599 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1600
1601 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1602 if (!ok)
1603 {
1604 if (dump_enabled_p ())
1605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1606 "bad data access.");
1607 return false;
1608 }
1609
1610 /* Classify all cross-iteration scalar data-flow cycles.
1611 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1612
1613 vect_analyze_scalar_cycles (loop_vinfo);
1614
1615 vect_pattern_recog (loop_vinfo, NULL);
1616
1617 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1618
1619 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1620 if (!ok)
1621 {
1622 if (dump_enabled_p ())
1623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1624 "unexpected pattern.");
1625 return false;
1626 }
1627
1628 /* Analyze data dependences between the data-refs in the loop
1629 and adjust the maximum vectorization factor according to
1630 the dependences.
1631 FORNOW: fail at the first data dependence that we encounter. */
1632
1633 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1634 if (!ok
1635 || max_vf < min_vf)
1636 {
1637 if (dump_enabled_p ())
1638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1639 "bad data dependence.");
1640 return false;
1641 }
1642
1643 ok = vect_determine_vectorization_factor (loop_vinfo);
1644 if (!ok)
1645 {
1646 if (dump_enabled_p ())
1647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1648 "can't determine vectorization factor.");
1649 return false;
1650 }
1651 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1652 {
1653 if (dump_enabled_p ())
1654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1655 "bad data dependence.");
1656 return false;
1657 }
1658
1659 /* Analyze the alignment of the data-refs in the loop.
1660 Fail if a data reference is found that cannot be vectorized. */
1661
1662 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1663 if (!ok)
1664 {
1665 if (dump_enabled_p ())
1666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1667 "bad data alignment.");
1668 return false;
1669 }
1670
1671 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1672 It is important to call pruning after vect_analyze_data_ref_accesses,
1673 since we use grouping information gathered by interleaving analysis. */
1674 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1675 if (!ok)
1676 {
1677 if (dump_enabled_p ())
1678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1679 "too long list of versioning for alias "
1680 "run-time tests.");
1681 return false;
1682 }
1683
1684 /* This pass will decide on using loop versioning and/or loop peeling in
1685 order to enhance the alignment of data references in the loop. */
1686
1687 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1688 if (!ok)
1689 {
1690 if (dump_enabled_p ())
1691 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1692 "bad data alignment.");
1693 return false;
1694 }
1695
1696 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1697 ok = vect_analyze_slp (loop_vinfo, NULL);
1698 if (ok)
1699 {
1700 /* Decide which possible SLP instances to SLP. */
1701 slp = vect_make_slp_decision (loop_vinfo);
1702
1703 /* Find stmts that need to be both vectorized and SLPed. */
1704 vect_detect_hybrid_slp (loop_vinfo);
1705 }
1706 else
1707 return false;
1708
1709 /* Scan all the operations in the loop and make sure they are
1710 vectorizable. */
1711
1712 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1713 if (!ok)
1714 {
1715 if (dump_enabled_p ())
1716 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1717 "bad operation or unsupported loop bound.");
1718 return false;
1719 }
1720
1721 return true;
1722 }
1723
1724 /* Function vect_analyze_loop.
1725
1726 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1727 for it. The different analyses will record information in the
1728 loop_vec_info struct. */
1729 loop_vec_info
1730 vect_analyze_loop (struct loop *loop)
1731 {
1732 loop_vec_info loop_vinfo;
1733 unsigned int vector_sizes;
1734
1735 /* Autodetect first vector size we try. */
1736 current_vector_size = 0;
1737 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1738
1739 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_NOTE, vect_location,
1741 "===== analyze_loop_nest =====");
1742
1743 if (loop_outer (loop)
1744 && loop_vec_info_for_loop (loop_outer (loop))
1745 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1746 {
1747 if (dump_enabled_p ())
1748 dump_printf_loc (MSG_NOTE, vect_location,
1749 "outer-loop already vectorized.");
1750 return NULL;
1751 }
1752
1753 while (1)
1754 {
1755 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1756 loop_vinfo = vect_analyze_loop_form (loop);
1757 if (!loop_vinfo)
1758 {
1759 if (dump_enabled_p ())
1760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1761 "bad loop form.");
1762 return NULL;
1763 }
1764
1765 if (vect_analyze_loop_2 (loop_vinfo))
1766 {
1767 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1768
1769 return loop_vinfo;
1770 }
1771
1772 destroy_loop_vec_info (loop_vinfo, true);
1773
1774 vector_sizes &= ~current_vector_size;
1775 if (vector_sizes == 0
1776 || current_vector_size == 0)
1777 return NULL;
1778
1779 /* Try the next biggest vector size. */
1780 current_vector_size = 1 << floor_log2 (vector_sizes);
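      /* For example (illustrative numbers): if the target advertises
         32-byte and 16-byte vectors, vector_sizes starts as 0x30; after a
         failed 32-byte attempt the mask becomes 0x10 and we retry with
         current_vector_size == 16.  */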
1781 if (dump_enabled_p ())
1782 dump_printf_loc (MSG_NOTE, vect_location,
1783 "***** Re-trying analysis with "
1784 "vector size %d\n", current_vector_size);
1785 }
1786 }
1787
1788
1789 /* Function reduction_code_for_scalar_code
1790
1791 Input:
 1792 CODE - tree_code of a reduction operation.
1793
1794 Output:
1795 REDUC_CODE - the corresponding tree-code to be used to reduce the
1796 vector of partial results into a single scalar result (which
1797 will also reside in a vector) or ERROR_MARK if the operation is
 1798 a supported reduction operation, but does not have such a tree-code.
1799
 1800 Return FALSE if CODE currently cannot be vectorized as a reduction. */
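/* For example (illustrative): a sum reduction vectorized with V4SI
   produces a vector of partial sums, say {3, 1, 4, 2}; REDUC_PLUS_EXPR
   is then used in the epilogue to fold them into the scalar result 10.
   For codes such as MINUS_EXPR there is no single reduction tree-code,
   so ERROR_MARK is returned and the epilogue instead reduces the vector
   with a sequence of element extractions/shifts.  */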
1801
1802 static bool
1803 reduction_code_for_scalar_code (enum tree_code code,
1804 enum tree_code *reduc_code)
1805 {
1806 switch (code)
1807 {
1808 case MAX_EXPR:
1809 *reduc_code = REDUC_MAX_EXPR;
1810 return true;
1811
1812 case MIN_EXPR:
1813 *reduc_code = REDUC_MIN_EXPR;
1814 return true;
1815
1816 case PLUS_EXPR:
1817 *reduc_code = REDUC_PLUS_EXPR;
1818 return true;
1819
1820 case MULT_EXPR:
1821 case MINUS_EXPR:
1822 case BIT_IOR_EXPR:
1823 case BIT_XOR_EXPR:
1824 case BIT_AND_EXPR:
1825 *reduc_code = ERROR_MARK;
1826 return true;
1827
1828 default:
1829 return false;
1830 }
1831 }
1832
1833
1834 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1835 STMT is printed with a message MSG. */
1836
1837 static void
1838 report_vect_op (int msg_type, gimple stmt, const char *msg)
1839 {
1840 dump_printf_loc (msg_type, vect_location, "%s", msg);
1841 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1842 }
1843
1844
1845 /* Detect SLP reduction of the form:
1846
1847 #a1 = phi <a5, a0>
1848 a2 = operation (a1)
1849 a3 = operation (a2)
1850 a4 = operation (a3)
1851 a5 = operation (a4)
1852
1853 #a = phi <a5>
1854
1855 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1856 FIRST_STMT is the first reduction stmt in the chain
1857 (a2 = operation (a1)).
1858
1859 Return TRUE if a reduction chain was detected. */
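
   An example of a scalar source loop that gives rise to such a chain
   (illustrative only):

     int sum = 0;
     for (i = 0; i < n; i++)
       sum = sum + a[4*i] + a[4*i+1] + a[4*i+2] + a[4*i+3];

   After gimplification the four additions become the statements a2..a5
   above, each one feeding the next, and only a5 reaches the reduction
   and loop-exit phis.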
1860
1861 static bool
1862 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1863 {
1864 struct loop *loop = (gimple_bb (phi))->loop_father;
1865 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1866 enum tree_code code;
1867 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1868 stmt_vec_info use_stmt_info, current_stmt_info;
1869 tree lhs;
1870 imm_use_iterator imm_iter;
1871 use_operand_p use_p;
1872 int nloop_uses, size = 0, n_out_of_loop_uses;
1873 bool found = false;
1874
1875 if (loop != vect_loop)
1876 return false;
1877
1878 lhs = PHI_RESULT (phi);
1879 code = gimple_assign_rhs_code (first_stmt);
1880 while (1)
1881 {
1882 nloop_uses = 0;
1883 n_out_of_loop_uses = 0;
1884 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1885 {
1886 gimple use_stmt = USE_STMT (use_p);
1887 if (is_gimple_debug (use_stmt))
1888 continue;
1889
1892 /* Check if we got back to the reduction phi. */
1893 if (use_stmt == phi)
1894 {
1895 loop_use_stmt = use_stmt;
1896 found = true;
1897 break;
1898 }
1899
1900 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1901 {
1902 if (vinfo_for_stmt (use_stmt)
1903 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1904 {
1905 loop_use_stmt = use_stmt;
1906 nloop_uses++;
1907 }
1908 }
1909 else
1910 n_out_of_loop_uses++;
1911
1912 /* There can be either a single use in the loop or two uses in
1913 phi nodes. */
1914 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1915 return false;
1916 }
1917
1918 if (found)
1919 break;
1920
1921 /* We reached a statement with no loop uses. */
1922 if (nloop_uses == 0)
1923 return false;
1924
1925 /* This is a loop exit phi, and we haven't reached the reduction phi. */
1926 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1927 return false;
1928
1929 if (!is_gimple_assign (loop_use_stmt)
1930 || code != gimple_assign_rhs_code (loop_use_stmt)
1931 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1932 return false;
1933
1934 /* Insert USE_STMT into reduction chain. */
1935 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1936 if (current_stmt)
1937 {
1938 current_stmt_info = vinfo_for_stmt (current_stmt);
1939 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1940 GROUP_FIRST_ELEMENT (use_stmt_info)
1941 = GROUP_FIRST_ELEMENT (current_stmt_info);
1942 }
1943 else
1944 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1945
1946 lhs = gimple_assign_lhs (loop_use_stmt);
1947 current_stmt = loop_use_stmt;
1948 size++;
1949 }
1950
1951 if (!found || loop_use_stmt != phi || size < 2)
1952 return false;
1953
1954 /* Swap the operands, if needed, to make the reduction operand be the second
1955 operand. */
1956 lhs = PHI_RESULT (phi);
1957 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1958 while (next_stmt)
1959 {
1960 if (gimple_assign_rhs2 (next_stmt) == lhs)
1961 {
1962 tree op = gimple_assign_rhs1 (next_stmt);
1963 gimple def_stmt = NULL;
1964
1965 if (TREE_CODE (op) == SSA_NAME)
1966 def_stmt = SSA_NAME_DEF_STMT (op);
1967
1968 /* Check that the other def is either defined in the loop
1969 ("vect_internal_def"), or it's an induction (defined by a
1970 loop-header phi-node). */
1971 if (def_stmt
1972 && gimple_bb (def_stmt)
1973 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1974 && (is_gimple_assign (def_stmt)
1975 || is_gimple_call (def_stmt)
1976 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1977 == vect_induction_def
1978 || (gimple_code (def_stmt) == GIMPLE_PHI
1979 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1980 == vect_internal_def
1981 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1982 {
1983 lhs = gimple_assign_lhs (next_stmt);
1984 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1985 continue;
1986 }
1987
1988 return false;
1989 }
1990 else
1991 {
1992 tree op = gimple_assign_rhs2 (next_stmt);
1993 gimple def_stmt = NULL;
1994
1995 if (TREE_CODE (op) == SSA_NAME)
1996 def_stmt = SSA_NAME_DEF_STMT (op);
1997
1998 /* Check that the other def is either defined in the loop
1999 ("vect_internal_def"), or it's an induction (defined by a
2000 loop-header phi-node). */
2001 if (def_stmt
2002 && gimple_bb (def_stmt)
2003 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2004 && (is_gimple_assign (def_stmt)
2005 || is_gimple_call (def_stmt)
2006 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2007 == vect_induction_def
2008 || (gimple_code (def_stmt) == GIMPLE_PHI
2009 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2010 == vect_internal_def
2011 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2012 {
2013 if (dump_enabled_p ())
2014 {
2015 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2016 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2017 }
2018
2019 swap_tree_operands (next_stmt,
2020 gimple_assign_rhs1_ptr (next_stmt),
2021 gimple_assign_rhs2_ptr (next_stmt));
2022 update_stmt (next_stmt);
2023
2024 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2025 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2026 }
2027 else
2028 return false;
2029 }
2030
2031 lhs = gimple_assign_lhs (next_stmt);
2032 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2033 }
2034
2035 /* Save the chain for further analysis in SLP detection. */
2036 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2037 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2038 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2039
2040 return true;
2041 }
2042
2043
2044 /* Function vect_is_simple_reduction_1
2045
2046 (1) Detect a cross-iteration def-use cycle that represents a simple
2047 reduction computation. We look for the following pattern:
2048
2049 loop_header:
2050 a1 = phi < a0, a2 >
2051 a3 = ...
2052 a2 = operation (a3, a1)
2053
2054 such that:
2055 1. operation is commutative and associative and it is safe to
2056 change the order of the computation (if CHECK_REDUCTION is true)
2057 2. no uses for a2 in the loop (a2 is used out of the loop)
2058 3. no uses of a1 in the loop besides the reduction operation
2059 4. no uses of a1 outside the loop.
2060
2061 Conditions 1,4 are tested here.
2062 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2063
2064 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2065 nested cycles, if CHECK_REDUCTION is false.
2066
2067 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2068 reductions:
2069
2070 a1 = phi < a0, a2 >
2071 inner loop (def of a3)
2072 a2 = phi < a3 >
2073
2074 If MODIFY is true it also tries to rework the code in-place to enable
2075 detection of more reduction patterns. For the time being we rewrite
2076 "res -= RHS" into "res += -RHS" when it seems worthwhile.
2077 */
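
/* As an illustration of the MODIFY rewrite: for a loop doing "res -= a[i]"
   the statement

     res_2 = res_1 - a_elt;

   is reworked further below into

     neg_a = -a_elt;
     res_2 = res_1 + neg_a;

   so that the usual PLUS_EXPR reduction handling applies (the names here
   are illustrative only).  */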
2078
2079 static gimple
2080 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2081 bool check_reduction, bool *double_reduc,
2082 bool modify)
2083 {
2084 struct loop *loop = (gimple_bb (phi))->loop_father;
2085 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2086 edge latch_e = loop_latch_edge (loop);
2087 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2088 gimple def_stmt, def1 = NULL, def2 = NULL;
2089 enum tree_code orig_code, code;
2090 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2091 tree type;
2092 int nloop_uses;
2093 tree name;
2094 imm_use_iterator imm_iter;
2095 use_operand_p use_p;
2096 bool phi_def;
2097
2098 *double_reduc = false;
2099
2100 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2101 otherwise, we assume outer loop vectorization. */
2102 gcc_assert ((check_reduction && loop == vect_loop)
2103 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2104
2105 name = PHI_RESULT (phi);
2106 nloop_uses = 0;
2107 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2108 {
2109 gimple use_stmt = USE_STMT (use_p);
2110 if (is_gimple_debug (use_stmt))
2111 continue;
2112
2113 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2114 {
2115 if (dump_enabled_p ())
2116 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2117 "intermediate value used outside loop.");
2118
2119 return NULL;
2120 }
2121
2122 if (vinfo_for_stmt (use_stmt)
2123 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2124 nloop_uses++;
2125 if (nloop_uses > 1)
2126 {
2127 if (dump_enabled_p ())
2128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2129 "reduction used in loop.");
2130 return NULL;
2131 }
2132 }
2133
2134 if (TREE_CODE (loop_arg) != SSA_NAME)
2135 {
2136 if (dump_enabled_p ())
2137 {
2138 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2139 "reduction: not ssa_name: ");
2140 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2141 }
2142 return NULL;
2143 }
2144
2145 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2146 if (!def_stmt)
2147 {
2148 if (dump_enabled_p ())
2149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2150 "reduction: no def_stmt.");
2151 return NULL;
2152 }
2153
2154 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2155 {
2156 if (dump_enabled_p ())
2157 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2158 return NULL;
2159 }
2160
2161 if (is_gimple_assign (def_stmt))
2162 {
2163 name = gimple_assign_lhs (def_stmt);
2164 phi_def = false;
2165 }
2166 else
2167 {
2168 name = PHI_RESULT (def_stmt);
2169 phi_def = true;
2170 }
2171
2172 nloop_uses = 0;
2173 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2174 {
2175 gimple use_stmt = USE_STMT (use_p);
2176 if (is_gimple_debug (use_stmt))
2177 continue;
2178 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2179 && vinfo_for_stmt (use_stmt)
2180 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2181 nloop_uses++;
2182 if (nloop_uses > 1)
2183 {
2184 if (dump_enabled_p ())
2185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2186 "reduction used in loop.");
2187 return NULL;
2188 }
2189 }
2190
2191 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2192 defined in the inner loop. */
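  /* An illustrative double-reduction source:

       sum = 0;
       for (i = 0; i < n; i++)
         for (j = 0; j < m; j++)
           sum += a[i][j];

     Here DEF_STMT is the loop-closed phi of the inner loop's partial sum,
     and the cycle is recorded as a double reduction when the outer loop
     is being vectorized.  */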
2193 if (phi_def)
2194 {
2195 op1 = PHI_ARG_DEF (def_stmt, 0);
2196
2197 if (gimple_phi_num_args (def_stmt) != 1
2198 || TREE_CODE (op1) != SSA_NAME)
2199 {
2200 if (dump_enabled_p ())
2201 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2202 "unsupported phi node definition.");
2203
2204 return NULL;
2205 }
2206
2207 def1 = SSA_NAME_DEF_STMT (op1);
2208 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2209 && loop->inner
2210 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2211 && is_gimple_assign (def1))
2212 {
2213 if (dump_enabled_p ())
2214 report_vect_op (MSG_NOTE, def_stmt,
2215 "detected double reduction: ");
2216
2217 *double_reduc = true;
2218 return def_stmt;
2219 }
2220
2221 return NULL;
2222 }
2223
2224 code = orig_code = gimple_assign_rhs_code (def_stmt);
2225
2226 /* We can handle "res -= x[i]", which is non-associative, by
2227 simply rewriting it into "res += -x[i]". Avoid changing the
2228 gimple instruction for the first simple tests and only do this
2229 if we're allowed to change code at all. */
2230 if (code == MINUS_EXPR
2231 && modify
2232 && (op1 = gimple_assign_rhs1 (def_stmt))
2233 && TREE_CODE (op1) == SSA_NAME
2234 && SSA_NAME_DEF_STMT (op1) == phi)
2235 code = PLUS_EXPR;
2236
2237 if (check_reduction
2238 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2239 {
2240 if (dump_enabled_p ())
2241 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2242 "reduction: not commutative/associative: ");
2243 return NULL;
2244 }
2245
2246 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2247 {
2248 if (code != COND_EXPR)
2249 {
2250 if (dump_enabled_p ())
2251 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2252 "reduction: not binary operation: ");
2253
2254 return NULL;
2255 }
2256
2257 op3 = gimple_assign_rhs1 (def_stmt);
2258 if (COMPARISON_CLASS_P (op3))
2259 {
2260 op4 = TREE_OPERAND (op3, 1);
2261 op3 = TREE_OPERAND (op3, 0);
2262 }
2263
2264 op1 = gimple_assign_rhs2 (def_stmt);
2265 op2 = gimple_assign_rhs3 (def_stmt);
2266
2267 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2268 {
2269 if (dump_enabled_p ())
2270 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2271 "reduction: uses not ssa_names: ");
2272
2273 return NULL;
2274 }
2275 }
2276 else
2277 {
2278 op1 = gimple_assign_rhs1 (def_stmt);
2279 op2 = gimple_assign_rhs2 (def_stmt);
2280
2281 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2282 {
2283 if (dump_enabled_p ())
2284 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2285 "reduction: uses not ssa_names: ");
2286
2287 return NULL;
2288 }
2289 }
2290
2291 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2292 if ((TREE_CODE (op1) == SSA_NAME
2293 && !types_compatible_p (type, TREE_TYPE (op1)))
2294 || (TREE_CODE (op2) == SSA_NAME
2295 && !types_compatible_p (type, TREE_TYPE (op2)))
2296 || (op3 && TREE_CODE (op3) == SSA_NAME
2297 && !types_compatible_p (type, TREE_TYPE (op3)))
2298 || (op4 && TREE_CODE (op4) == SSA_NAME
2299 && !types_compatible_p (type, TREE_TYPE (op4))))
2300 {
2301 if (dump_enabled_p ())
2302 {
2303 dump_printf_loc (MSG_NOTE, vect_location,
2304 "reduction: multiple types: operation type: ");
2305 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2306 dump_printf (MSG_NOTE, ", operands types: ");
2307 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2308 TREE_TYPE (op1));
2309 dump_printf (MSG_NOTE, ",");
2310 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2311 TREE_TYPE (op2));
2312 if (op3)
2313 {
2314 dump_printf (MSG_NOTE, ",");
2315 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2316 TREE_TYPE (op3));
2317 }
2318
2319 if (op4)
2320 {
2321 dump_printf (MSG_NOTE, ",");
2322 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2323 TREE_TYPE (op4));
2324 }
2325 }
2326
2327 return NULL;
2328 }
2329
2330 /* Check that it's ok to change the order of the computation.
2331 Generally, when vectorizing a reduction we change the order of the
2332 computation. This may change the behavior of the program in some
2333 cases, so we need to check that this is ok. One exception is when
2334 vectorizing an outer-loop: the inner-loop is executed sequentially,
2335 and therefore vectorizing reductions in the inner-loop during
2336 outer-loop vectorization is safe. */
2337
2338 /* CHECKME: check for !flag_finite_math_only too? */
2339 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2340 && check_reduction)
2341 {
2342 /* Changing the order of operations changes the semantics. */
2343 if (dump_enabled_p ())
2344 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2345 "reduction: unsafe fp math optimization: ");
2346 return NULL;
2347 }
2348 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2349 && check_reduction)
2350 {
2351 /* Changing the order of operations changes the semantics. */
2352 if (dump_enabled_p ())
2353 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2354 "reduction: unsafe int math optimization: ");
2355 return NULL;
2356 }
2357 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2358 {
2359 /* Changing the order of operations changes the semantics. */
2360 if (dump_enabled_p ())
2361 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2362 "reduction: unsafe fixed-point math optimization: ");
2363 return NULL;
2364 }
2365
2366 /* If we detected "res -= x[i]" earlier, rewrite it into
2367 "res += -x[i]" now. If this turns out to be useless reassoc
2368 will clean it up again. */
2369 if (orig_code == MINUS_EXPR)
2370 {
2371 tree rhs = gimple_assign_rhs2 (def_stmt);
2372 tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2373 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2374 rhs, NULL);
2375 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2376 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2377 loop_info, NULL));
2378 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2379 gimple_assign_set_rhs2 (def_stmt, negrhs);
2380 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2381 update_stmt (def_stmt);
2382 }
2383
2384 /* Reduction is safe. We're dealing with one of the following:
2385 1) integer arithmetic and no trapv
2386 2) floating point arithmetic, and special flags permit this optimization
2387 3) nested cycle (i.e., outer loop vectorization). */
2388 if (TREE_CODE (op1) == SSA_NAME)
2389 def1 = SSA_NAME_DEF_STMT (op1);
2390
2391 if (TREE_CODE (op2) == SSA_NAME)
2392 def2 = SSA_NAME_DEF_STMT (op2);
2393
2394 if (code != COND_EXPR
2395 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2396 {
2397 if (dump_enabled_p ())
2398 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2399 return NULL;
2400 }
2401
2402 /* Check that one def is the reduction def, defined by PHI,
2403 the other def is either defined in the loop ("vect_internal_def"),
2404 or it's an induction (defined by a loop-header phi-node). */
2405
2406 if (def2 && def2 == phi
2407 && (code == COND_EXPR
2408 || !def1 || gimple_nop_p (def1)
2409 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2410 && (is_gimple_assign (def1)
2411 || is_gimple_call (def1)
2412 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2413 == vect_induction_def
2414 || (gimple_code (def1) == GIMPLE_PHI
2415 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2416 == vect_internal_def
2417 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2418 {
2419 if (dump_enabled_p ())
2420 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2421 return def_stmt;
2422 }
2423
2424 if (def1 && def1 == phi
2425 && (code == COND_EXPR
2426 || !def2 || gimple_nop_p (def2)
2427 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2428 && (is_gimple_assign (def2)
2429 || is_gimple_call (def2)
2430 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2431 == vect_induction_def
2432 || (gimple_code (def2) == GIMPLE_PHI
2433 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2434 == vect_internal_def
2435 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2436 {
2437 if (check_reduction)
2438 {
2439 /* Swap operands (just for simplicity - so that the rest of the code
2440 can assume that the reduction variable is always the last (second)
2441 argument). */
2442 if (dump_enabled_p ())
2443 report_vect_op (MSG_NOTE, def_stmt,
2444 "detected reduction: need to swap operands: ");
2445
2446 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2447 gimple_assign_rhs2_ptr (def_stmt));
2448
2449 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2450 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2451 }
2452 else
2453 {
2454 if (dump_enabled_p ())
2455 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2456 }
2457
2458 return def_stmt;
2459 }
2460
2461 /* Try to find SLP reduction chain. */
2462 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2463 {
2464 if (dump_enabled_p ())
2465 report_vect_op (MSG_NOTE, def_stmt,
2466 "reduction: detected reduction chain: ");
2467
2468 return def_stmt;
2469 }
2470
2471 if (dump_enabled_p ())
2472 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2473 "reduction: unknown pattern: ");
2474
2475 return NULL;
2476 }
2477
2478 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2479 in-place. Arguments as there. */
2480
2481 static gimple
2482 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2483 bool check_reduction, bool *double_reduc)
2484 {
2485 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2486 double_reduc, false);
2487 }
2488
2489 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2490 in-place if it enables detection of more reductions. Arguments
2491 as there. */
2492
2493 gimple
2494 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2495 bool check_reduction, bool *double_reduc)
2496 {
2497 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2498 double_reduc, true);
2499 }
2500
2501 /* Calculate the cost of one scalar iteration of the loop. */
2502 int
2503 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2504 {
2505 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2506 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2507 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2508 int innerloop_iters, i, stmt_cost;
2509
2510 /* Count statements in scalar loop. Using this as scalar cost for a single
2511 iteration for now.
2512
2513 TODO: Add outer loop support.
2514
2515 TODO: Consider assigning different costs to different scalar
2516 statements. */
2517
2518 /* FORNOW. */
2519 innerloop_iters = 1;
2520 if (loop->inner)
2521 innerloop_iters = 50; /* FIXME */
2522
2523 for (i = 0; i < nbbs; i++)
2524 {
2525 gimple_stmt_iterator si;
2526 basic_block bb = bbs[i];
2527
2528 if (bb->loop_father == loop->inner)
2529 factor = innerloop_iters;
2530 else
2531 factor = 1;
2532
2533 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2534 {
2535 gimple stmt = gsi_stmt (si);
2536 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2537
2538 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2539 continue;
2540
2541 /* Skip stmts that are not vectorized inside the loop. */
2542 if (stmt_info
2543 && !STMT_VINFO_RELEVANT_P (stmt_info)
2544 && (!STMT_VINFO_LIVE_P (stmt_info)
2545 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2546 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2547 continue;
2548
2549 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2550 {
2551 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2552 stmt_cost = vect_get_stmt_cost (scalar_load);
2553 else
2554 stmt_cost = vect_get_stmt_cost (scalar_store);
2555 }
2556 else
2557 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2558
2559 scalar_single_iter_cost += stmt_cost * factor;
2560 }
2561 }
2562 return scalar_single_iter_cost;
2563 }
2564
2565 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2566 int
2567 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2568 int *peel_iters_epilogue,
2569 int scalar_single_iter_cost,
2570 stmt_vector_for_cost *prologue_cost_vec,
2571 stmt_vector_for_cost *epilogue_cost_vec)
2572 {
2573 int retval = 0;
2574 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2575
2576 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2577 {
2578 *peel_iters_epilogue = vf/2;
2579 if (dump_enabled_p ())
2580 dump_printf_loc (MSG_NOTE, vect_location,
2581 "cost model: epilogue peel iters set to vf/2 "
2582 "because loop iterations are unknown .");
2583
2584 /* If peeled iterations are known but the number of scalar loop
2585 iterations is unknown, count a taken branch per peeled loop. */
2586 retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2587 NULL, 0, vect_prologue);
2588 }
2589 else
2590 {
2591 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2592 peel_iters_prologue = niters < peel_iters_prologue ?
2593 niters : peel_iters_prologue;
2594 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2595 /* If we need to peel for gaps, but no peeling is required, we have to
2596 peel VF iterations. */
2597 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2598 *peel_iters_epilogue = vf;
2599 }
2600
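  /* Worked example with illustrative numbers: for niters == 100,
     peel_iters_prologue == 3 and vf == 4, the epilogue gets
     (100 - 3) % 4 == 1 peeled iteration; had the remainder been 0 while
     peeling for gaps is required, a full vf == 4 iterations would be
     peeled instead.  */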
2601 if (peel_iters_prologue)
2602 retval += record_stmt_cost (prologue_cost_vec,
2603 peel_iters_prologue * scalar_single_iter_cost,
2604 scalar_stmt, NULL, 0, vect_prologue);
2605 if (*peel_iters_epilogue)
2606 retval += record_stmt_cost (epilogue_cost_vec,
2607 *peel_iters_epilogue * scalar_single_iter_cost,
2608 scalar_stmt, NULL, 0, vect_epilogue);
2609 return retval;
2610 }
2611
2612 /* Function vect_estimate_min_profitable_iters
2613
2614 Return the number of iterations required for the vector version of the
2615 loop to be profitable relative to the cost of the scalar version of the
2616 loop. */
2617
2618 static void
2619 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2620 int *ret_min_profitable_niters,
2621 int *ret_min_profitable_estimate)
2622 {
2623 int min_profitable_iters;
2624 int min_profitable_estimate;
2625 int peel_iters_prologue;
2626 int peel_iters_epilogue;
2627 unsigned vec_inside_cost = 0;
2628 int vec_outside_cost = 0;
2629 unsigned vec_prologue_cost = 0;
2630 unsigned vec_epilogue_cost = 0;
2631 int scalar_single_iter_cost = 0;
2632 int scalar_outside_cost = 0;
2633 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2634 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2635 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2636
2637 /* Cost model disabled. */
2638 if (!flag_vect_cost_model)
2639 {
2640 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.");
2641 *ret_min_profitable_niters = 0;
2642 *ret_min_profitable_estimate = 0;
2643 return;
2644 }
2645
2646 /* Requires loop versioning tests to handle misalignment. */
2647 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2648 {
2649 /* FIXME: Make cost depend on complexity of individual check. */
2650 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2651 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2652 vect_prologue);
2653 dump_printf (MSG_NOTE,
2654 "cost model: Adding cost of checks for loop "
2655 "versioning to treat misalignment.\n");
2656 }
2657
2658 /* Requires loop versioning with alias checks. */
2659 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2660 {
2661 /* FIXME: Make cost depend on complexity of individual check. */
2662 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2663 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2664 vect_prologue);
2665 dump_printf (MSG_NOTE,
2666 "cost model: Adding cost of checks for loop "
2667 "versioning aliasing.\n");
2668 }
2669
2670 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2671 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2672 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2673 vect_prologue);
2674
2675 /* Count statements in scalar loop. Using this as scalar cost for a single
2676 iteration for now.
2677
2678 TODO: Add outer loop support.
2679
2680 TODO: Consider assigning different costs to different scalar
2681 statements. */
2682
2683 scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2684
2685 /* Add additional cost for the peeled instructions in prologue and epilogue
2686 loop.
2687
2688 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2689 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2690
2691 TODO: Build an expression that represents peel_iters for prologue and
2692 epilogue to be used in a run-time test. */
2693
2694 if (npeel < 0)
2695 {
2696 peel_iters_prologue = vf/2;
2697 dump_printf (MSG_NOTE, "cost model: "
2698 "prologue peel iters set to vf/2.");
2699
2700 /* If peeling for alignment is unknown, the loop bound of the main loop
2701 becomes unknown. */
2702 peel_iters_epilogue = vf/2;
2703 dump_printf (MSG_NOTE, "cost model: "
2704 "epilogue peel iters set to vf/2 because "
2705 "peeling for alignment is unknown.");
2706
2707 /* If peeled iterations are unknown, count a taken branch and a not taken
2708 branch per peeled loop. Even if scalar loop iterations are known,
2709 vector iterations are not known since peeled prologue iterations are
2710 not known. Hence guards remain the same. */
2711 (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2712 NULL, 0, vect_prologue);
2713 (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2714 NULL, 0, vect_prologue);
2715 /* FORNOW: Don't attempt to pass individual scalar instructions to
2716 the model; just assume linear cost for scalar iterations. */
2717 (void) add_stmt_cost (target_cost_data,
2718 peel_iters_prologue * scalar_single_iter_cost,
2719 scalar_stmt, NULL, 0, vect_prologue);
2720 (void) add_stmt_cost (target_cost_data,
2721 peel_iters_epilogue * scalar_single_iter_cost,
2722 scalar_stmt, NULL, 0, vect_epilogue);
2723 }
2724 else
2725 {
2726 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2727 stmt_info_for_cost *si;
2728 int j;
2729 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2730
2731 prologue_cost_vec.create (2);
2732 epilogue_cost_vec.create (2);
2733 peel_iters_prologue = npeel;
2734
2735 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2736 &peel_iters_epilogue,
2737 scalar_single_iter_cost,
2738 &prologue_cost_vec,
2739 &epilogue_cost_vec);
2740
2741 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2742 {
2743 struct _stmt_vec_info *stmt_info
2744 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2745 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2746 si->misalign, vect_prologue);
2747 }
2748
2749 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2750 {
2751 struct _stmt_vec_info *stmt_info
2752 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2753 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2754 si->misalign, vect_epilogue);
2755 }
2756
2757 prologue_cost_vec.release ();
2758 epilogue_cost_vec.release ();
2759 }
2760
2761 /* FORNOW: The scalar outside cost is incremented in one of the
2762 following ways:
2763
2764 1. The vectorizer checks for alignment and aliasing and generates
2765 a condition that allows dynamic vectorization. A cost model
2766 check is ANDED with the versioning condition. Hence scalar code
2767 path now has the added cost of the versioning check.
2768
2769 if (cost > th & versioning_check)
2770 jmp to vector code
2771
2772 Hence the run-time scalar cost is incremented by a not-taken branch cost.
2773
2774 2. The vectorizer then checks if a prologue is required. If the
2775 cost model check was not done before during versioning, it has to
2776 be done before the prologue check.
2777
2778 if (cost <= th)
2779 prologue = scalar_iters
2780 if (prologue == 0)
2781 jmp to vector code
2782 else
2783 execute prologue
2784 if (prologue == num_iters)
2785 go to exit
2786
2787 Hence the run-time scalar cost is incremented by a taken branch,
2788 plus a not-taken branch, plus a taken branch cost.
2789
2790 3. The vectorizer then checks if an epilogue is required. If the
2791 cost model check was not done before during prologue check, it
2792 has to be done with the epilogue check.
2793
2794 if (prologue == 0)
2795 jmp to vector code
2796 else
2797 execute prologue
2798 if (prologue == num_iters)
2799 go to exit
2800 vector code:
2801 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2802 jmp to epilogue
2803
2804 Hence the run-time scalar cost should be incremented by 2 taken
2805 branches.
2806
2807 TODO: The back end may reorder the BBS's differently and reverse
2808 conditions/branch directions. Change the estimates below to
2809 something more reasonable. */
2810
2811 /* If the number of iterations is known and we do not do versioning, we can
2812 decide whether to vectorize at compile time. Hence the scalar version
2813 does not carry cost model guard costs. */
2814 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2815 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2816 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2817 {
2818 /* Cost model check occurs at versioning. */
2819 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2820 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2821 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2822 else
2823 {
2824 /* Cost model check occurs at prologue generation. */
2825 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2826 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2827 + vect_get_stmt_cost (cond_branch_not_taken);
2828 /* Cost model check occurs at epilogue generation. */
2829 else
2830 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2831 }
2832 }
2833
2834 /* Complete the target-specific cost calculations. */
2835 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2836 &vec_inside_cost, &vec_epilogue_cost);
2837
2838 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2839
2840 /* Calculate number of iterations required to make the vector version
2841 profitable, relative to the loop bodies only. The following condition
2842 must hold true:
2843 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2844 where
2845 SIC = scalar iteration cost, VIC = vector iteration cost,
2846 VOC = vector outside cost, VF = vectorization factor,
2847 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2848 SOC = scalar outside cost for run time cost model check. */
2849
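  /* Worked example, with illustrative costs only: for SIC = 4, VIC = 6,
     VOC = 24, SOC = 6, VF = 4 and PL_ITERS = EP_ITERS = 2 the division
     below gives ((24 - 6) * 4 - 6 * 2 - 6 * 2) / (4 * 4 - 6) = 48 / 10 = 4,
     and the follow-up check bumps this to 5 because
     4 * 4 * 4 <= 6 * 4 + (24 - 6) * 4 still holds.  */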
2850 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2851 {
2852 if (vec_outside_cost <= 0)
2853 min_profitable_iters = 1;
2854 else
2855 {
2856 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2857 - vec_inside_cost * peel_iters_prologue
2858 - vec_inside_cost * peel_iters_epilogue)
2859 / ((scalar_single_iter_cost * vf)
2860 - vec_inside_cost);
2861
2862 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2863 <= (((int) vec_inside_cost * min_profitable_iters)
2864 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2865 min_profitable_iters++;
2866 }
2867 }
2868 /* vector version will never be profitable. */
2869 else
2870 {
2871 if (dump_enabled_p ())
2872 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2873 "cost model: the vector iteration cost = %d "
2874 "divided by the scalar iteration cost = %d "
2875 "is greater or equal to the vectorization factor = %d.",
2876 vec_inside_cost, scalar_single_iter_cost, vf);
2877 *ret_min_profitable_niters = -1;
2878 *ret_min_profitable_estimate = -1;
2879 return;
2880 }
2881
2882 if (dump_enabled_p ())
2883 {
2884 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2885 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
2886 vec_inside_cost);
2887 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
2888 vec_prologue_cost);
2889 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
2890 vec_epilogue_cost);
2891 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
2892 scalar_single_iter_cost);
2893 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
2894 scalar_outside_cost);
2895 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
2896 vec_outside_cost);
2897 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
2898 peel_iters_prologue);
2899 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
2900 peel_iters_epilogue);
2901 dump_printf (MSG_NOTE,
2902 " Calculated minimum iters for profitability: %d\n",
2903 min_profitable_iters);
2904 }
2905
2906 min_profitable_iters =
2907 min_profitable_iters < vf ? vf : min_profitable_iters;
2908
2909 /* Because the condition we create is:
2910 if (niters <= min_profitable_iters)
2911 then skip the vectorized loop. */
2912 min_profitable_iters--;
2913
2914 if (dump_enabled_p ())
2915 dump_printf_loc (MSG_NOTE, vect_location,
2916 " Runtime profitability threshold = %d\n", min_profitable_iters);
2917
2918 *ret_min_profitable_niters = min_profitable_iters;
2919
2920 /* Calculate number of iterations required to make the vector version
2921 profitable, relative to the loop bodies only.
2922
2923 Non-vectorized variant is SIC * niters and it must win over vector
2924 variant on the expected loop trip count. The following condition must hold true:
2925 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
2926
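  /* With the same illustrative costs as in the example above, this gives
     ((24 + 6) * 4 - 6 * 2 - 6 * 2) / (4 * 4 - 6) = 96 / 10 = 9; the
     decrement and the MAX against the runtime threshold below leave an
     estimate of 8.  */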
2927 if (vec_outside_cost <= 0)
2928 min_profitable_estimate = 1;
2929 else
2930 {
2931 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
2932 - vec_inside_cost * peel_iters_prologue
2933 - vec_inside_cost * peel_iters_epilogue)
2934 / ((scalar_single_iter_cost * vf)
2935 - vec_inside_cost);
2936 }
2937 min_profitable_estimate--;
2938 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
2939 if (dump_enabled_p ())
2940 dump_printf_loc (MSG_NOTE, vect_location,
2941 " Static estimate profitability threshold = %d\n",
2942 min_profitable_estimate);
2943
2944 *ret_min_profitable_estimate = min_profitable_estimate;
2945 }
2946
2947
2948 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2949 functions. Design better to avoid maintenance issues. */
2950
2951 /* Function vect_model_reduction_cost.
2952
2953 Models cost for a reduction operation, including the vector ops
2954 generated within the strip-mine loop, the initial definition before
2955 the loop, and the epilogue code that must be generated. */
2956
2957 static bool
2958 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2959 int ncopies)
2960 {
2961 int prologue_cost = 0, epilogue_cost = 0;
2962 enum tree_code code;
2963 optab optab;
2964 tree vectype;
2965 gimple stmt, orig_stmt;
2966 tree reduction_op;
2967 enum machine_mode mode;
2968 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2969 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2970 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2971
2972 /* Cost of reduction op inside loop. */
2973 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
2974 stmt_info, 0, vect_body);
2975 stmt = STMT_VINFO_STMT (stmt_info);
2976
2977 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2978 {
2979 case GIMPLE_SINGLE_RHS:
2980 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2981 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2982 break;
2983 case GIMPLE_UNARY_RHS:
2984 reduction_op = gimple_assign_rhs1 (stmt);
2985 break;
2986 case GIMPLE_BINARY_RHS:
2987 reduction_op = gimple_assign_rhs2 (stmt);
2988 break;
2989 case GIMPLE_TERNARY_RHS:
2990 reduction_op = gimple_assign_rhs3 (stmt);
2991 break;
2992 default:
2993 gcc_unreachable ();
2994 }
2995
2996 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2997 if (!vectype)
2998 {
2999 if (dump_enabled_p ())
3000 {
3001 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3002 "unsupported data-type ");
3003 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3004 TREE_TYPE (reduction_op));
3005 }
3006 return false;
3007 }
3008
3009 mode = TYPE_MODE (vectype);
3010 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3011
3012 if (!orig_stmt)
3013 orig_stmt = STMT_VINFO_STMT (stmt_info);
3014
3015 code = gimple_assign_rhs_code (orig_stmt);
3016
3017 /* Add in cost for initial definition. */
3018 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3019 stmt_info, 0, vect_prologue);
3020
3021 /* Determine cost of epilogue code.
3022
3023 We have a reduction operator that will reduce the vector in one statement.
3024 Also requires scalar extract. */
3025
3026 if (!nested_in_vect_loop_p (loop, orig_stmt))
3027 {
3028 if (reduc_code != ERROR_MARK)
3029 {
3030 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3031 stmt_info, 0, vect_epilogue);
3032 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3033 stmt_info, 0, vect_epilogue);
3034 }
3035 else
3036 {
3037 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3038 tree bitsize =
3039 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3040 int element_bitsize = tree_low_cst (bitsize, 1);
3041 int nelements = vec_size_in_bits / element_bitsize;
3042
3043 optab = optab_for_tree_code (code, vectype, optab_default);
3044
3045 /* We have a whole vector shift available. */
3046 if (VECTOR_MODE_P (mode)
3047 && optab_handler (optab, mode) != CODE_FOR_nothing
3048 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3049 {
3050 /* Final reduction via vector shifts and the reduction operator.
3051 Also requires scalar extract. */
3052 epilogue_cost += add_stmt_cost (target_cost_data,
3053 exact_log2 (nelements) * 2,
3054 vector_stmt, stmt_info, 0,
3055 vect_epilogue);
3056 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3057 vec_to_scalar, stmt_info, 0,
3058 vect_epilogue);
3059 }
3060 else
3061 /* Use extracts and reduction op for final reduction. For N
3062 elements, we have N extracts and N-1 reduction ops. */
3063 epilogue_cost += add_stmt_cost (target_cost_data,
3064 nelements + nelements - 1,
3065 vector_stmt, stmt_info, 0,
3066 vect_epilogue);
3067 }
3068 }
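
  /* For example (illustrative), with four elements per vector and no
     REDUC_* pattern available, the shift-based variant above costs
     exact_log2 (4) * 2 == 4 vector statements plus one vec_to_scalar,
     whereas the extract-based fallback costs 4 + 3 == 7 vector
     statements.  */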
3069
3070 if (dump_enabled_p ())
3071 dump_printf (MSG_NOTE,
3072 "vect_model_reduction_cost: inside_cost = %d, "
3073 "prologue_cost = %d, epilogue_cost = %d .", inside_cost,
3074 prologue_cost, epilogue_cost);
3075
3076 return true;
3077 }
3078
3079
3080 /* Function vect_model_induction_cost.
3081
3082 Models cost for induction operations. */
3083
3084 static void
3085 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3086 {
3087 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3088 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3089 unsigned inside_cost, prologue_cost;
3090
3091 /* loop cost for vec_loop. */
3092 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3093 stmt_info, 0, vect_body);
3094
3095 /* prologue cost for vec_init and vec_step. */
3096 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3097 stmt_info, 0, vect_prologue);
3098
3099 if (dump_enabled_p ())
3100 dump_printf_loc (MSG_NOTE, vect_location,
3101 "vect_model_induction_cost: inside_cost = %d, "
3102 "prologue_cost = %d .", inside_cost, prologue_cost);
3103 }
3104
3105
3106 /* Function get_initial_def_for_induction
3107
3108 Input:
3109 STMT - a stmt that performs an induction operation in the loop.
3110 IV_PHI - the initial value of the induction variable
3111
3112 Output:
3113 Return a vector variable, initialized with the first VF values of
3114 the induction variable. E.g., for an iv with IV_PHI='X' and
3115 evolution S, for a vector of 4 units, we want to return:
3116 [X, X + S, X + 2*S, X + 3*S]. */
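
   Concretely, with illustrative values X == 5, S == 3 and a 4-unit vector,
   the returned vector is {5, 8, 11, 14}; the loop body then advances it by
   the step vector {12, 12, 12, 12} (VF * S, built further below), giving
   {17, 20, 23, 26} for the next vector iteration.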
3117
3118 static tree
3119 get_initial_def_for_induction (gimple iv_phi)
3120 {
3121 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3122 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3123 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3124 tree scalar_type;
3125 tree vectype;
3126 int nunits;
3127 edge pe = loop_preheader_edge (loop);
3128 struct loop *iv_loop;
3129 basic_block new_bb;
3130 tree new_vec, vec_init, vec_step, t;
3131 tree access_fn;
3132 tree new_var;
3133 tree new_name;
3134 gimple init_stmt, induction_phi, new_stmt;
3135 tree induc_def, vec_def, vec_dest;
3136 tree init_expr, step_expr;
3137 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3138 int i;
3139 bool ok;
3140 int ncopies;
3141 tree expr;
3142 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3143 bool nested_in_vect_loop = false;
3144 gimple_seq stmts = NULL;
3145 imm_use_iterator imm_iter;
3146 use_operand_p use_p;
3147 gimple exit_phi;
3148 edge latch_e;
3149 tree loop_arg;
3150 gimple_stmt_iterator si;
3151 basic_block bb = gimple_bb (iv_phi);
3152 tree stepvectype;
3153 tree resvectype;
3154
3155 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3156 if (nested_in_vect_loop_p (loop, iv_phi))
3157 {
3158 nested_in_vect_loop = true;
3159 iv_loop = loop->inner;
3160 }
3161 else
3162 iv_loop = loop;
3163 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3164
3165 latch_e = loop_latch_edge (iv_loop);
3166 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3167
3168 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
3169 gcc_assert (access_fn);
3170 STRIP_NOPS (access_fn);
3171 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
3172 &init_expr, &step_expr);
3173 gcc_assert (ok);
3174 pe = loop_preheader_edge (iv_loop);
3175
3176 scalar_type = TREE_TYPE (init_expr);
3177 vectype = get_vectype_for_scalar_type (scalar_type);
3178 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3179 gcc_assert (vectype);
3180 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3181 ncopies = vf / nunits;
3182
3183 gcc_assert (phi_info);
3184 gcc_assert (ncopies >= 1);
3185
3186 /* Find the first insertion point in the BB. */
3187 si = gsi_after_labels (bb);
3188
3189 /* Create the vector that holds the initial_value of the induction. */
3190 if (nested_in_vect_loop)
3191 {
3192 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3193 been created during vectorization of previous stmts. We obtain it
3194 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3195 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3196 loop_preheader_edge (iv_loop));
3197 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
3198 /* If the initial value is not of proper type, convert it. */
3199 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3200 {
3201 new_stmt = gimple_build_assign_with_ops
3202 (VIEW_CONVERT_EXPR,
3203 vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3204 build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3205 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3206 gimple_assign_set_lhs (new_stmt, vec_init);
3207 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3208 new_stmt);
3209 gcc_assert (!new_bb);
3210 set_vinfo_for_stmt (new_stmt,
3211 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3212 }
3213 }
3214 else
3215 {
3216 vec<constructor_elt, va_gc> *v;
3217
3218 /* iv_loop is the loop to be vectorized. Create:
3219 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3220 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
3221 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
3222 if (stmts)
3223 {
3224 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3225 gcc_assert (!new_bb);
3226 }
3227
3228 vec_alloc (v, nunits);
3229 bool constant_p = is_gimple_min_invariant (new_name);
3230 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3231 for (i = 1; i < nunits; i++)
3232 {
3233 /* Create: new_name_i = new_name + step_expr */
3234 enum tree_code code = POINTER_TYPE_P (scalar_type)
3235 ? POINTER_PLUS_EXPR : PLUS_EXPR;
3236 new_name = fold_build2 (code, scalar_type, new_name, step_expr);
3237 if (!is_gimple_min_invariant (new_name))
3238 {
3239 init_stmt = gimple_build_assign (new_var, new_name);
3240 new_name = make_ssa_name (new_var, init_stmt);
3241 gimple_assign_set_lhs (init_stmt, new_name);
3242 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3243 gcc_assert (!new_bb);
3244 if (dump_enabled_p ())
3245 {
3246 dump_printf_loc (MSG_NOTE, vect_location,
3247 "created new init_stmt: ");
3248 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3249 }
3250 constant_p = false;
3251 }
3252 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3253 }
3254 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3255 if (constant_p)
3256 new_vec = build_vector_from_ctor (vectype, v);
3257 else
3258 new_vec = build_constructor (vectype, v);
3259 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3260 }
3261
3262
3263 /* Create the vector that holds the step of the induction. */
3264 if (nested_in_vect_loop)
3265 /* iv_loop is nested in the loop to be vectorized. Generate:
3266 vec_step = [S, S, S, S] */
3267 new_name = step_expr;
3268 else
3269 {
3270 /* iv_loop is the loop to be vectorized. Generate:
3271 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3272 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3273 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3274 expr, step_expr);
3275 }
3276
3277 t = unshare_expr (new_name);
3278 gcc_assert (CONSTANT_CLASS_P (new_name));
3279 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3280 gcc_assert (stepvectype);
3281 new_vec = build_vector_from_val (stepvectype, t);
3282 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3283
3284
3285 /* Create the following def-use cycle:
3286 loop prolog:
3287 vec_init = ...
3288 vec_step = ...
3289 loop:
3290 vec_iv = PHI <vec_init, vec_loop>
3291 ...
3292 STMT
3293 ...
3294 vec_loop = vec_iv + vec_step; */
3295
3296 /* Create the induction-phi that defines the induction-operand. */
3297 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3298 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3299 set_vinfo_for_stmt (induction_phi,
3300 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3301 induc_def = PHI_RESULT (induction_phi);
3302
3303 /* Create the iv update inside the loop */
3304 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3305 induc_def, vec_step);
3306 vec_def = make_ssa_name (vec_dest, new_stmt);
3307 gimple_assign_set_lhs (new_stmt, vec_def);
3308 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3309 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3310 NULL));
3311
3312 /* Set the arguments of the phi node: */
3313 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3314 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3315 UNKNOWN_LOCATION);
3316
3317
3318 /* In case the vectorization factor (VF) is bigger than the number
3319 of elements that we can fit in a vectype (nunits), we have to generate
3320 more than one vector stmt - i.e. - we need to "unroll" the
3321 vector stmt by a factor of VF/nunits. For more details see the
3322 documentation in vectorizable_operation. */
3323
3324 if (ncopies > 1)
3325 {
3326 stmt_vec_info prev_stmt_vinfo;
3327 /* FORNOW. This restriction should be relaxed. */
3328 gcc_assert (!nested_in_vect_loop);
3329
3330 /* Create the vector that holds the step of the induction. */
3331 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3332 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3333 expr, step_expr);
3334 t = unshare_expr (new_name);
3335 gcc_assert (CONSTANT_CLASS_P (new_name));
3336 new_vec = build_vector_from_val (stepvectype, t);
3337 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3338
3339 vec_def = induc_def;
3340 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3341 for (i = 1; i < ncopies; i++)
3342 {
3343 /* vec_i = vec_prev + vec_step */
3344 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3345 vec_def, vec_step);
3346 vec_def = make_ssa_name (vec_dest, new_stmt);
3347 gimple_assign_set_lhs (new_stmt, vec_def);
3348
3349 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3350 if (!useless_type_conversion_p (resvectype, vectype))
3351 {
3352 new_stmt = gimple_build_assign_with_ops
3353 (VIEW_CONVERT_EXPR,
3354 vect_get_new_vect_var (resvectype, vect_simple_var,
3355 "vec_iv_"),
3356 build1 (VIEW_CONVERT_EXPR, resvectype,
3357 gimple_assign_lhs (new_stmt)), NULL_TREE);
3358 gimple_assign_set_lhs (new_stmt,
3359 make_ssa_name
3360 (gimple_assign_lhs (new_stmt), new_stmt));
3361 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3362 }
3363 set_vinfo_for_stmt (new_stmt,
3364 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3365 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3366 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3367 }
3368 }
3369
3370 if (nested_in_vect_loop)
3371 {
3372 /* Find the loop-closed exit-phi of the induction, and record
3373 the final vector of induction results: */
3374 exit_phi = NULL;
3375 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3376 {
3377 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3378 {
3379 exit_phi = USE_STMT (use_p);
3380 break;
3381 }
3382 }
3383 if (exit_phi)
3384 {
3385 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3386 /* FORNOW. Currently not supporting the case that an inner-loop induction
3387 is not used in the outer-loop (i.e. only outside the outer-loop). */
3388 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3389 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3390
3391 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3392 if (dump_enabled_p ())
3393 {
3394 dump_printf_loc (MSG_NOTE, vect_location,
3395 "vector of inductions after inner-loop:");
3396 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3397 }
3398 }
3399 }
3400
3401
3402 if (dump_enabled_p ())
3403 {
3404 dump_printf_loc (MSG_NOTE, vect_location,
3405 "transform induction: created def-use cycle: ");
3406 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3407 dump_printf (MSG_NOTE, "\n");
3408 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3409 SSA_NAME_DEF_STMT (vec_def), 0);
3410 }
3411
3412 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3413 if (!useless_type_conversion_p (resvectype, vectype))
3414 {
3415 new_stmt = gimple_build_assign_with_ops
3416 (VIEW_CONVERT_EXPR,
3417 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3418 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3419 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3420 gimple_assign_set_lhs (new_stmt, induc_def);
3421 si = gsi_after_labels (bb);
3422 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3423 set_vinfo_for_stmt (new_stmt,
3424 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3425 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3426 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3427 }
3428
3429 return induc_def;
3430 }
3431
3432
3433 /* Function get_initial_def_for_reduction
3434
3435 Input:
3436 STMT - a stmt that performs a reduction operation in the loop.
3437 INIT_VAL - the initial value of the reduction variable
3438
3439 Output:
3440 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3441 of the reduction (used for adjusting the epilog - see below).
3442 Return a vector variable, initialized according to the operation that STMT
3443 performs. This vector will be used as the initial value of the
3444 vector of partial results.
3445
3446 Option1 (adjust in epilog): Initialize the vector as follows:
3447 add/bit or/xor: [0,0,...,0,0]
3448 mult/bit and: [1,1,...,1,1]
3449 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3450 and when necessary (e.g. add/mult case) let the caller know
3451 that it needs to adjust the result by init_val.
3452
3453 Option2: Initialize the vector as follows:
3454 add/bit or/xor: [init_val,0,0,...,0]
3455 mult/bit and: [init_val,1,1,...,1]
3456 min/max/cond_expr: [init_val,init_val,...,init_val]
3457 and no adjustments are needed.
3458
3459 For example, for the following code:
3460
3461 s = init_val;
3462 for (i=0;i<n;i++)
3463 s = s + a[i];
3464
3465 STMT is 's = s + a[i]', and the reduction variable is 's'.
3466 For a vector of 4 units, we want to return either [0,0,0,init_val],
3467 or [0,0,0,0] and let the caller know that it needs to adjust
3468 the result at the end by 'init_val'.
3469
3470 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3471 is not NULL, because its initialization vector is simpler (same element
3472 in all entries), and Option2 otherwise.
3473
3474 A cost model should help decide between these two schemes. */
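
/* As a further illustration, for a product reduction

     s = init_val;
     for (i = 0; i < n; i++)
       s = s * a[i];

   and a vector of 4 units, Option1 returns [1,1,1,1] and passes init_val
   back through ADJUSTMENT_DEF so the caller folds it into the final
   result, while Option2 returns [init_val,1,1,1] and needs no
   adjustment.  */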
3475
3476 tree
3477 get_initial_def_for_reduction (gimple stmt, tree init_val,
3478 tree *adjustment_def)
3479 {
3480 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3481 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3482 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3483 tree scalar_type = TREE_TYPE (init_val);
3484 tree vectype = get_vectype_for_scalar_type (scalar_type);
3485 int nunits;
3486 enum tree_code code = gimple_assign_rhs_code (stmt);
3487 tree def_for_init;
3488 tree init_def;
3489 tree *elts;
3490 int i;
3491 bool nested_in_vect_loop = false;
3492 tree init_value;
3493 REAL_VALUE_TYPE real_init_val = dconst0;
3494 int int_init_val = 0;
3495 gimple def_stmt = NULL;
3496
3497 gcc_assert (vectype);
3498 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3499
3500 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3501 || SCALAR_FLOAT_TYPE_P (scalar_type));
3502
3503 if (nested_in_vect_loop_p (loop, stmt))
3504 nested_in_vect_loop = true;
3505 else
3506 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3507
3508 /* In case of double reduction we only create a vector variable to be put
3509 in the reduction phi node. The actual statement creation is done in
3510 vect_create_epilog_for_reduction. */
3511 if (adjustment_def && nested_in_vect_loop
3512 && TREE_CODE (init_val) == SSA_NAME
3513 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3514 && gimple_code (def_stmt) == GIMPLE_PHI
3515 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3516 && vinfo_for_stmt (def_stmt)
3517 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3518 == vect_double_reduction_def)
3519 {
3520 *adjustment_def = NULL;
3521 return vect_create_destination_var (init_val, vectype);
3522 }
3523
3524 if (TREE_CONSTANT (init_val))
3525 {
3526 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3527 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3528 else
3529 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3530 }
3531 else
3532 init_value = init_val;
3533
3534 switch (code)
3535 {
3536 case WIDEN_SUM_EXPR:
3537 case DOT_PROD_EXPR:
3538 case PLUS_EXPR:
3539 case MINUS_EXPR:
3540 case BIT_IOR_EXPR:
3541 case BIT_XOR_EXPR:
3542 case MULT_EXPR:
3543 case BIT_AND_EXPR:
3544 /* ADJUSTMENT_DEF is NULL when called from
3545 vect_create_epilog_for_reduction to vectorize double reduction. */
3546 if (adjustment_def)
3547 {
3548 if (nested_in_vect_loop)
3549 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3550 NULL);
3551 else
3552 *adjustment_def = init_val;
3553 }
3554
3555 if (code == MULT_EXPR)
3556 {
3557 real_init_val = dconst1;
3558 int_init_val = 1;
3559 }
3560
3561 if (code == BIT_AND_EXPR)
3562 int_init_val = -1;
3563
3564 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3565 def_for_init = build_real (scalar_type, real_init_val);
3566 else
3567 def_for_init = build_int_cst (scalar_type, int_init_val);
3568
3569 /* Create a vector of '0' or '1' in all elements but the first. */
3570 elts = XALLOCAVEC (tree, nunits);
3571 for (i = nunits - 2; i >= 0; --i)
3572 elts[i + 1] = def_for_init;
3573
3574 /* Option1: the first element is '0' or '1' as well. */
3575 if (adjustment_def)
3576 {
3577 elts[0] = def_for_init;
3578 init_def = build_vector (vectype, elts);
3579 break;
3580 }
3581
3582 /* Option2: the first element is INIT_VAL. */
3583 elts[0] = init_val;
3584 if (TREE_CONSTANT (init_val))
3585 init_def = build_vector (vectype, elts);
3586 else
3587 {
3588 vec<constructor_elt, va_gc> *v;
3589 vec_alloc (v, nunits);
3590 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3591 for (i = 1; i < nunits; ++i)
3592 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3593 init_def = build_constructor (vectype, v);
3594 }
3595
3596 break;
3597
3598 case MIN_EXPR:
3599 case MAX_EXPR:
3600 case COND_EXPR:
3601 if (adjustment_def)
3602 {
3603 *adjustment_def = NULL_TREE;
3604 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3605 break;
3606 }
3607
3608 init_def = build_vector_from_val (vectype, init_value);
3609 break;
3610
3611 default:
3612 gcc_unreachable ();
3613 }
3614
3615 return init_def;
3616 }
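/* For illustration only (hypothetical code, not part of the vectorizer):
   the scalar analogue of the "adjust in epilog" scheme above, for a
   PLUS_EXPR reduction with a 4-element vector, assuming n is a multiple
   of 4:

     int
     sum_reduction (const int *a, int n, int init_val)
     {
       int v[4] = { 0, 0, 0, 0 };          // Option1: neutral element.
       for (int i = 0; i < n; i += 4)      // vectorized loop body
         for (int k = 0; k < 4; k++)
           v[k] += a[i + k];               // vector of partial sums
       int s = v[0] + v[1] + v[2] + v[3];  // epilog: reduce the vector
       return s + init_val;                // adjust by ADJUSTMENT_DEF
     }

   With Option2 the initial vector would instead be { init_val, 0, 0, 0 }
   and no final adjustment would be needed.  */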
3617
3618
3619 /* Function vect_create_epilog_for_reduction
3620
3621 Create code at the loop-epilog to finalize the result of a reduction
3622 computation.
3623
3624 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3625 reduction statements.
3626 STMT is the scalar reduction stmt that is being vectorized.
3627 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3628 number of elements that we can fit in a vectype (nunits). In this case
3629 we have to generate more than one vector stmt - i.e - we need to "unroll"
3630 the vector stmt by a factor VF/nunits. For more details see documentation
3631 in vectorizable_operation.
3632 REDUC_CODE is the tree-code for the epilog reduction.
3633 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3634 computation.
3635 REDUC_INDEX is the index of the operand in the right hand side of the
3636 statement that is defined by REDUCTION_PHI.
3637 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3638 SLP_NODE is an SLP node containing a group of reduction statements. The
3639 first one in this group is STMT.
3640
3641 This function:
3642 1. Creates the reduction def-use cycles: sets the arguments for
3643 REDUCTION_PHIS:
3644 The loop-entry argument is the vectorized initial-value of the reduction.
3645 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3646 sums.
3647 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3648 by applying the operation specified by REDUC_CODE if available, or by
3649 other means (whole-vector shifts or a scalar loop).
3650 The function also creates a new phi node at the loop exit to preserve
3651 loop-closed form, as illustrated below.
3652
3653 The flow at the entry to this function:
3654
3655 loop:
3656 vec_def = phi <null, null> # REDUCTION_PHI
3657 VECT_DEF = vector_stmt # vectorized form of STMT
3658 s_loop = scalar_stmt # (scalar) STMT
3659 loop_exit:
3660 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3661 use <s_out0>
3662 use <s_out0>
3663
3664 The above is transformed by this function into:
3665
3666 loop:
3667 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3668 VECT_DEF = vector_stmt # vectorized form of STMT
3669 s_loop = scalar_stmt # (scalar) STMT
3670 loop_exit:
3671 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3672 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3673 v_out2 = reduce <v_out1>
3674 s_out3 = extract_field <v_out2, 0>
3675 s_out4 = adjust_result <s_out3>
3676 use <s_out4>
3677 use <s_out4>
3678 */
3679
3680 static void
3681 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3682 int ncopies, enum tree_code reduc_code,
3683 vec<gimple> reduction_phis,
3684 int reduc_index, bool double_reduc,
3685 slp_tree slp_node)
3686 {
3687 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3688 stmt_vec_info prev_phi_info;
3689 tree vectype;
3690 enum machine_mode mode;
3691 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3692 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3693 basic_block exit_bb;
3694 tree scalar_dest;
3695 tree scalar_type;
3696 gimple new_phi = NULL, phi;
3697 gimple_stmt_iterator exit_gsi;
3698 tree vec_dest;
3699 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3700 gimple epilog_stmt = NULL;
3701 enum tree_code code = gimple_assign_rhs_code (stmt);
3702 gimple exit_phi;
3703 tree bitsize, bitpos;
3704 tree adjustment_def = NULL;
3705 tree vec_initial_def = NULL;
3706 tree reduction_op, expr, def;
3707 tree orig_name, scalar_result;
3708 imm_use_iterator imm_iter, phi_imm_iter;
3709 use_operand_p use_p, phi_use_p;
3710 bool extract_scalar_result = false;
3711 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3712 bool nested_in_vect_loop = false;
3713 vec<gimple> new_phis = vNULL;
3714 vec<gimple> inner_phis = vNULL;
3715 enum vect_def_type dt = vect_unknown_def_type;
3716 int j, i;
3717 vec<tree> scalar_results = vNULL;
3718 unsigned int group_size = 1, k, ratio;
3719 vec<tree> vec_initial_defs = vNULL;
3720 vec<gimple> phis;
3721 bool slp_reduc = false;
3722 tree new_phi_result;
3723 gimple inner_phi = NULL;
3724
3725 if (slp_node)
3726 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3727
3728 if (nested_in_vect_loop_p (loop, stmt))
3729 {
3730 outer_loop = loop;
3731 loop = loop->inner;
3732 nested_in_vect_loop = true;
3733 gcc_assert (!slp_node);
3734 }
3735
3736 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3737 {
3738 case GIMPLE_SINGLE_RHS:
3739 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3740 == ternary_op);
3741 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3742 break;
3743 case GIMPLE_UNARY_RHS:
3744 reduction_op = gimple_assign_rhs1 (stmt);
3745 break;
3746 case GIMPLE_BINARY_RHS:
3747 reduction_op = reduc_index ?
3748 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3749 break;
3750 case GIMPLE_TERNARY_RHS:
3751 reduction_op = gimple_op (stmt, reduc_index + 1);
3752 break;
3753 default:
3754 gcc_unreachable ();
3755 }
3756
3757 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3758 gcc_assert (vectype);
3759 mode = TYPE_MODE (vectype);
3760
3761 /* 1. Create the reduction def-use cycle:
3762 Set the arguments of REDUCTION_PHIS, i.e., transform
3763
3764 loop:
3765 vec_def = phi <null, null> # REDUCTION_PHI
3766 VECT_DEF = vector_stmt # vectorized form of STMT
3767 ...
3768
3769 into:
3770
3771 loop:
3772 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3773 VECT_DEF = vector_stmt # vectorized form of STMT
3774 ...
3775
3776 (in case of SLP, do it for all the phis). */
3777
3778 /* Get the loop-entry arguments. */
3779 if (slp_node)
3780 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3781 NULL, slp_node, reduc_index);
3782 else
3783 {
3784 vec_initial_defs.create (1);
3785 /* For the case of reduction, vect_get_vec_def_for_operand returns
3786 the scalar def before the loop, that defines the initial value
3787 of the reduction variable. */
3788 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3789 &adjustment_def);
3790 vec_initial_defs.quick_push (vec_initial_def);
3791 }
3792
3793 /* Set phi nodes arguments. */
3794 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3795 {
3796 tree vec_init_def = vec_initial_defs[i];
3797 tree def = vect_defs[i];
3798 for (j = 0; j < ncopies; j++)
3799 {
3800 /* Set the loop-entry arg of the reduction-phi. */
3801 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3802 UNKNOWN_LOCATION);
3803
3804 /* Set the loop-latch arg for the reduction-phi. */
3805 if (j > 0)
3806 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3807
3808 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3809
3810 if (dump_enabled_p ())
3811 {
3812 dump_printf_loc (MSG_NOTE, vect_location,
3813 "transform reduction: created def-use cycle: ");
3814 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3815 dump_printf (MSG_NOTE, "\n");
3816 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3817 }
3818
3819 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3820 }
3821 }
3822
3823 vec_initial_defs.release ();
3824
3825 /* 2. Create epilog code.
3826 The reduction epilog code operates across the elements of the vector
3827 of partial results computed by the vectorized loop.
3828 The reduction epilog code consists of:
3829
3830 step 1: compute the scalar result in a vector (v_out2)
3831 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3832 step 3: adjust the scalar result (s_out3) if needed.
3833
3834 Step 1 can be accomplished using one of the following three schemes:
3835 (scheme 1) using reduc_code, if available.
3836 (scheme 2) using whole-vector shifts, if available.
3837 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3838 combined.
3839
3840 The overall epilog code looks like this:
3841
3842 s_out0 = phi <s_loop> # original EXIT_PHI
3843 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3844 v_out2 = reduce <v_out1> # step 1
3845 s_out3 = extract_field <v_out2, 0> # step 2
3846 s_out4 = adjust_result <s_out3> # step 3
3847
3848 (step 3 is optional, and steps 1 and 2 may be combined).
3849 Lastly, the uses of s_out0 are replaced by s_out4. */
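/* As a (hypothetical) scalar sketch of scheme 2 for an addition reduction
   over a 4-element integer vector, the shift-and-op sequence corresponds
   to:

     static int
     reduce_by_shifts (int v[4])
     {
       // offset = VS/2 = 2 elements: va' = vec_shift <va, 2>; va = va + va'
       v[0] += v[2];  v[1] += v[3];
       // offset = 1 element: one more shift and add
       v[0] += v[1];
       return v[0];   // step 2 extracts element 0
     }

   Scheme 3 instead extracts every element and accumulates them with
   scalar operations.  */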
3850
3851
3852 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3853 v_out1 = phi <VECT_DEF>
3854 Store them in NEW_PHIS. */
3855
3856 exit_bb = single_exit (loop)->dest;
3857 prev_phi_info = NULL;
3858 new_phis.create (vect_defs.length ());
3859 FOR_EACH_VEC_ELT (vect_defs, i, def)
3860 {
3861 for (j = 0; j < ncopies; j++)
3862 {
3863 tree new_def = copy_ssa_name (def, NULL);
3864 phi = create_phi_node (new_def, exit_bb);
3865 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3866 if (j == 0)
3867 new_phis.quick_push (phi);
3868 else
3869 {
3870 def = vect_get_vec_def_for_stmt_copy (dt, def);
3871 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3872 }
3873
3874 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3875 prev_phi_info = vinfo_for_stmt (phi);
3876 }
3877 }
3878
3879 /* The epilogue is created for the outer-loop, i.e., for the loop being
3880 vectorized. Create exit phis for the outer loop. */
3881 if (double_reduc)
3882 {
3883 loop = outer_loop;
3884 exit_bb = single_exit (loop)->dest;
3885 inner_phis.create (vect_defs.length ());
3886 FOR_EACH_VEC_ELT (new_phis, i, phi)
3887 {
3888 tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3889 gimple outer_phi = create_phi_node (new_result, exit_bb);
3890 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3891 PHI_RESULT (phi));
3892 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3893 loop_vinfo, NULL));
3894 inner_phis.quick_push (phi);
3895 new_phis[i] = outer_phi;
3896 prev_phi_info = vinfo_for_stmt (outer_phi);
3897 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3898 {
3899 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3900 new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3901 outer_phi = create_phi_node (new_result, exit_bb);
3902 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3903 PHI_RESULT (phi));
3904 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3905 loop_vinfo, NULL));
3906 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3907 prev_phi_info = vinfo_for_stmt (outer_phi);
3908 }
3909 }
3910 }
3911
3912 exit_gsi = gsi_after_labels (exit_bb);
3913
3914 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3915 (i.e. when reduc_code is not available) and in the final adjustment
3916 code (if needed). Also get the original scalar reduction variable as
3917 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3918 represents a reduction pattern), the tree-code and scalar-def are
3919 taken from the original stmt that the pattern-stmt (STMT) replaces.
3920 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3921 are taken from STMT. */
3922
3923 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3924 if (!orig_stmt)
3925 {
3926 /* Regular reduction */
3927 orig_stmt = stmt;
3928 }
3929 else
3930 {
3931 /* Reduction pattern */
3932 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3933 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3934 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3935 }
3936
3937 code = gimple_assign_rhs_code (orig_stmt);
3938 /* For MINUS_EXPR the initial vector is [init_val,0,...,0]; therefore,
3939 partial results are added and not subtracted. */
3940 if (code == MINUS_EXPR)
3941 code = PLUS_EXPR;
3942
3943 scalar_dest = gimple_assign_lhs (orig_stmt);
3944 scalar_type = TREE_TYPE (scalar_dest);
3945 scalar_results.create (group_size);
3946 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3947 bitsize = TYPE_SIZE (scalar_type);
3948
3949 /* In case this is a reduction in an inner-loop while vectorizing an outer
3950 loop - we don't need to extract a single scalar result at the end of the
3951 inner-loop (unless it is double reduction, i.e., the use of reduction is
3952 outside the outer-loop). The final vector of partial results will be used
3953 in the vectorized outer-loop, or reduced to a scalar result at the end of
3954 the outer-loop. */
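/* E.g. (hypothetical source code), for

     for (i = 0; i < n; i++)
       {
         s = 0;
         for (j = 0; j < m; j++)
           s += a[i][j];
         b[i] = s;
       }

   when the outer loop is the one being vectorized, each vector lane keeps
   the partial sum of a different outer iteration, and that vector is
   consumed directly by the vectorized store to b[], so no scalar result
   is extracted here.  */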
3955 if (nested_in_vect_loop && !double_reduc)
3956 goto vect_finalize_reduction;
3957
3958 /* SLP reduction without reduction chain, e.g.,
3959 # a1 = phi <a2, a0>
3960 # b1 = phi <b2, b0>
3961 a2 = operation (a1)
3962 b2 = operation (b1) */
3963 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3964
3965 /* In case of reduction chain, e.g.,
3966 # a1 = phi <a3, a0>
3967 a2 = operation (a1)
3968 a3 = operation (a2),
3969
3970 we may end up with more than one vector result. Here we reduce them to
3971 one vector. */
3972 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3973 {
3974 tree first_vect = PHI_RESULT (new_phis[0]);
3975 tree tmp;
3976 gimple new_vec_stmt = NULL;
3977
3978 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3979 for (k = 1; k < new_phis.length (); k++)
3980 {
3981 gimple next_phi = new_phis[k];
3982 tree second_vect = PHI_RESULT (next_phi);
3983
3984 tmp = build2 (code, vectype, first_vect, second_vect);
3985 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3986 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3987 gimple_assign_set_lhs (new_vec_stmt, first_vect);
3988 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3989 }
3990
3991 new_phi_result = first_vect;
3992 if (new_vec_stmt)
3993 {
3994 new_phis.truncate (0);
3995 new_phis.safe_push (new_vec_stmt);
3996 }
3997 }
3998 else
3999 new_phi_result = PHI_RESULT (new_phis[0]);
4000
4001 /* 2.3 Create the reduction code, using one of the three schemes described
4002 above. In SLP we simply need to extract all the elements from the
4003 vector (without reducing them), so we use scalar shifts. */
4004 if (reduc_code != ERROR_MARK && !slp_reduc)
4005 {
4006 tree tmp;
4007
4008 /*** Case 1: Create:
4009 v_out2 = reduc_expr <v_out1> */
4010
4011 if (dump_enabled_p ())
4012 dump_printf_loc (MSG_NOTE, vect_location,
4013 "Reduce using direct vector reduction.");
4014
4015 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4016 tmp = build1 (reduc_code, vectype, new_phi_result);
4017 epilog_stmt = gimple_build_assign (vec_dest, tmp);
4018 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4019 gimple_assign_set_lhs (epilog_stmt, new_temp);
4020 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4021
4022 extract_scalar_result = true;
4023 }
4024 else
4025 {
4026 enum tree_code shift_code = ERROR_MARK;
4027 bool have_whole_vector_shift = true;
4028 int bit_offset;
4029 int element_bitsize = tree_low_cst (bitsize, 1);
4030 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4031 tree vec_temp;
4032
4033 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4034 shift_code = VEC_RSHIFT_EXPR;
4035 else
4036 have_whole_vector_shift = false;
4037
4038 /* Regardless of whether we have a whole vector shift, if we're
4039 emulating the operation via tree-vect-generic, we don't want
4040 to use it. Only the first round of the reduction is likely
4041 to still be profitable via emulation. */
4042 /* ??? It might be better to emit a reduction tree code here, so that
4043 tree-vect-generic can expand the first round via bit tricks. */
4044 if (!VECTOR_MODE_P (mode))
4045 have_whole_vector_shift = false;
4046 else
4047 {
4048 optab optab = optab_for_tree_code (code, vectype, optab_default);
4049 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4050 have_whole_vector_shift = false;
4051 }
4052
4053 if (have_whole_vector_shift && !slp_reduc)
4054 {
4055 /*** Case 2: Create:
4056 for (offset = VS/2; offset >= element_size; offset/=2)
4057 {
4058 Create: va' = vec_shift <va, offset>
4059 Create: va = vop <va, va'>
4060 } */
4061
4062 if (dump_enabled_p ())
4063 dump_printf_loc (MSG_NOTE, vect_location,
4064 "Reduce using vector shifts");
4065
4066 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4067 new_temp = new_phi_result;
4068 for (bit_offset = vec_size_in_bits/2;
4069 bit_offset >= element_bitsize;
4070 bit_offset /= 2)
4071 {
4072 tree bitpos = size_int (bit_offset);
4073
4074 epilog_stmt = gimple_build_assign_with_ops (shift_code,
4075 vec_dest, new_temp, bitpos);
4076 new_name = make_ssa_name (vec_dest, epilog_stmt);
4077 gimple_assign_set_lhs (epilog_stmt, new_name);
4078 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4079
4080 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4081 new_name, new_temp);
4082 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4083 gimple_assign_set_lhs (epilog_stmt, new_temp);
4084 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4085 }
4086
4087 extract_scalar_result = true;
4088 }
4089 else
4090 {
4091 tree rhs;
4092
4093 /*** Case 3: Create:
4094 s = extract_field <v_out2, 0>
4095 for (offset = element_size;
4096 offset < vector_size;
4097 offset += element_size;)
4098 {
4099 Create: s' = extract_field <v_out2, offset>
4100 Create: s = op <s, s'> // For non SLP cases
4101 } */
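/* For instance, for a hypothetical V4SI vector (element_bitsize == 32,
   vec_size_in_bits == 128) and an addition reduction, the loop below
   visits bit offsets 32, 64 and 96, producing the scalar equivalent of:

     s = v[0];
     s += v[1];
     s += v[2];
     s += v[3];

   (the element at offset 0 is extracted just before the loop).  */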
4102
4103 if (dump_enabled_p ())
4104 dump_printf_loc (MSG_NOTE, vect_location,
4105 "Reduce using scalar code. ");
4106
4107 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4108 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4109 {
4110 if (gimple_code (new_phi) == GIMPLE_PHI)
4111 vec_temp = PHI_RESULT (new_phi);
4112 else
4113 vec_temp = gimple_assign_lhs (new_phi);
4114 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4115 bitsize_zero_node);
4116 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4117 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4118 gimple_assign_set_lhs (epilog_stmt, new_temp);
4119 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4120
4121 /* In SLP we don't need to apply reduction operation, so we just
4122 collect s' values in SCALAR_RESULTS. */
4123 if (slp_reduc)
4124 scalar_results.safe_push (new_temp);
4125
4126 for (bit_offset = element_bitsize;
4127 bit_offset < vec_size_in_bits;
4128 bit_offset += element_bitsize)
4129 {
4130 tree bitpos = bitsize_int (bit_offset);
4131 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4132 bitsize, bitpos);
4133
4134 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4135 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4136 gimple_assign_set_lhs (epilog_stmt, new_name);
4137 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4138
4139 if (slp_reduc)
4140 {
4141 /* In SLP we don't need to apply reduction operation, so
4142 we just collect s' values in SCALAR_RESULTS. */
4143 new_temp = new_name;
4144 scalar_results.safe_push (new_name);
4145 }
4146 else
4147 {
4148 epilog_stmt = gimple_build_assign_with_ops (code,
4149 new_scalar_dest, new_name, new_temp);
4150 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4151 gimple_assign_set_lhs (epilog_stmt, new_temp);
4152 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4153 }
4154 }
4155 }
4156
4157 /* The only case where we need to reduce scalar results in SLP is
4158 unrolling. If the size of SCALAR_RESULTS is greater than
4159 GROUP_SIZE, we reduce them by combining elements modulo
4160 GROUP_SIZE. */
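/* E.g. (hypothetical) with GROUP_SIZE == 2 and SCALAR_RESULTS holding
   { s0, s1, s2, s3 } after unrolling, assuming an addition reduction, the
   loop below performs:

     scalar_results[0] = s0 + s2;
     scalar_results[1] = s1 + s3;

   leaving one result per group element in the first GROUP_SIZE slots.  */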
4161 if (slp_reduc)
4162 {
4163 tree res, first_res, new_res;
4164 gimple new_stmt;
4165
4166 /* Reduce multiple scalar results in case of SLP unrolling. */
4167 for (j = group_size; scalar_results.iterate (j, &res);
4168 j++)
4169 {
4170 first_res = scalar_results[j % group_size];
4171 new_stmt = gimple_build_assign_with_ops (code,
4172 new_scalar_dest, first_res, res);
4173 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4174 gimple_assign_set_lhs (new_stmt, new_res);
4175 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4176 scalar_results[j % group_size] = new_res;
4177 }
4178 }
4179 else
4180 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4181 scalar_results.safe_push (new_temp);
4182
4183 extract_scalar_result = false;
4184 }
4185 }
4186
4187 /* 2.4 Extract the final scalar result. Create:
4188 s_out3 = extract_field <v_out2, bitpos> */
4189
4190 if (extract_scalar_result)
4191 {
4192 tree rhs;
4193
4194 if (dump_enabled_p ())
4195 dump_printf_loc (MSG_NOTE, vect_location,
4196 "extract scalar result");
4197
4198 if (BYTES_BIG_ENDIAN)
4199 bitpos = size_binop (MULT_EXPR,
4200 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4201 TYPE_SIZE (scalar_type));
4202 else
4203 bitpos = bitsize_zero_node;
4204
4205 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4206 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4207 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4208 gimple_assign_set_lhs (epilog_stmt, new_temp);
4209 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4210 scalar_results.safe_push (new_temp);
4211 }
4212
4213 vect_finalize_reduction:
4214
4215 if (double_reduc)
4216 loop = loop->inner;
4217
4218 /* 2.5 Adjust the final result by the initial value of the reduction
4219 variable. (When such adjustment is not needed, then
4220 'adjustment_def' is zero). For example, if code is PLUS we create:
4221 new_temp = loop_exit_def + adjustment_def */
4222
4223 if (adjustment_def)
4224 {
4225 gcc_assert (!slp_reduc);
4226 if (nested_in_vect_loop)
4227 {
4228 new_phi = new_phis[0];
4229 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4230 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4231 new_dest = vect_create_destination_var (scalar_dest, vectype);
4232 }
4233 else
4234 {
4235 new_temp = scalar_results[0];
4236 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4237 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4238 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4239 }
4240
4241 epilog_stmt = gimple_build_assign (new_dest, expr);
4242 new_temp = make_ssa_name (new_dest, epilog_stmt);
4243 gimple_assign_set_lhs (epilog_stmt, new_temp);
4244 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
4245 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4246 if (nested_in_vect_loop)
4247 {
4248 set_vinfo_for_stmt (epilog_stmt,
4249 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4250 NULL));
4251 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4252 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4253
4254 if (!double_reduc)
4255 scalar_results.quick_push (new_temp);
4256 else
4257 scalar_results[0] = new_temp;
4258 }
4259 else
4260 scalar_results[0] = new_temp;
4261
4262 new_phis[0] = epilog_stmt;
4263 }
4264
4265 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4266 phis with new adjusted scalar results, i.e., replace use <s_out0>
4267 with use <s_out4>.
4268
4269 Transform:
4270 loop_exit:
4271 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4272 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4273 v_out2 = reduce <v_out1>
4274 s_out3 = extract_field <v_out2, 0>
4275 s_out4 = adjust_result <s_out3>
4276 use <s_out0>
4277 use <s_out0>
4278
4279 into:
4280
4281 loop_exit:
4282 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4283 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4284 v_out2 = reduce <v_out1>
4285 s_out3 = extract_field <v_out2, 0>
4286 s_out4 = adjust_result <s_out3>
4287 use <s_out4>
4288 use <s_out4> */
4289
4290
4291 /* In SLP reduction chain we reduce vector results into one vector if
4292 necessary; hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
4293 the last stmt in the reduction chain, since we are looking for the loop
4294 exit phi node. */
4295 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4296 {
4297 scalar_dest = gimple_assign_lhs (
4298 SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4299 group_size = 1;
4300 }
4301
4302 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4303 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4304 need to match SCALAR_RESULTS with corresponding statements. The first
4305 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4306 the first vector stmt, etc.
4307 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
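/* For example (hypothetical): with GROUP_SIZE == 4 and two vector stmts
   in NEW_PHIS, RATIO == 2 and the loop below pairs them up as:

     k:          0  1  2  3
     k / ratio:  0  0  1  1   ==> epilog_stmt = new_phis[k / ratio]  */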
4308 if (group_size > new_phis.length ())
4309 {
4310 ratio = group_size / new_phis.length ();
4311 gcc_assert (!(group_size % new_phis.length ()));
4312 }
4313 else
4314 ratio = 1;
4315
4316 for (k = 0; k < group_size; k++)
4317 {
4318 if (k % ratio == 0)
4319 {
4320 epilog_stmt = new_phis[k / ratio];
4321 reduction_phi = reduction_phis[k / ratio];
4322 if (double_reduc)
4323 inner_phi = inner_phis[k / ratio];
4324 }
4325
4326 if (slp_reduc)
4327 {
4328 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4329
4330 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4331 /* SLP statements can't participate in patterns. */
4332 gcc_assert (!orig_stmt);
4333 scalar_dest = gimple_assign_lhs (current_stmt);
4334 }
4335
4336 phis.create (3);
4337 /* Find the loop-closed-use at the loop exit of the original scalar
4338 result. (The reduction result is expected to have two immediate uses -
4339 one at the latch block, and one at the loop exit). */
4340 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4341 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4342 phis.safe_push (USE_STMT (use_p));
4343
4344 /* We expect to have found an exit_phi because of loop-closed-ssa
4345 form. */
4346 gcc_assert (!phis.is_empty ());
4347
4348 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4349 {
4350 if (outer_loop)
4351 {
4352 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4353 gimple vect_phi;
4354
4355 /* FORNOW. Currently not supporting the case that an inner-loop
4356 reduction is not used in the outer-loop (but only outside the
4357 outer-loop), unless it is double reduction. */
4358 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4359 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4360 || double_reduc);
4361
4362 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4363 if (!double_reduc
4364 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4365 != vect_double_reduction_def)
4366 continue;
4367
4368 /* Handle double reduction:
4369
4370 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4371 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4372 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4373 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4374
4375 At that point the regular reduction (stmt2 and stmt3) is
4376 already vectorized, as well as the exit phi node, stmt4.
4377 Here we vectorize the phi node of double reduction, stmt1, and
4378 update all relevant statements. */
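/* In source terms, a double reduction arises from e.g. (hypothetical):

     s = s0;
     for (i = 0; i < n; i++)      // outer loop: stmt1 is the phi of s
       for (j = 0; j < m; j++)    // inner loop: stmt2 is the inner phi
         s += a[i][j];            // stmt3; stmt4 closes the outer cycle

   where the accumulated value is used only after the outer loop.  */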
4379
4380 /* Go through all the uses of s2 to find double reduction phi
4381 node, i.e., stmt1 above. */
4382 orig_name = PHI_RESULT (exit_phi);
4383 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4384 {
4385 stmt_vec_info use_stmt_vinfo;
4386 stmt_vec_info new_phi_vinfo;
4387 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4388 basic_block bb = gimple_bb (use_stmt);
4389 gimple use;
4390
4391 /* Check that USE_STMT is really double reduction phi
4392 node. */
4393 if (gimple_code (use_stmt) != GIMPLE_PHI
4394 || gimple_phi_num_args (use_stmt) != 2
4395 || bb->loop_father != outer_loop)
4396 continue;
4397 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4398 if (!use_stmt_vinfo
4399 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4400 != vect_double_reduction_def)
4401 continue;
4402
4403 /* Create vector phi node for double reduction:
4404 vs1 = phi <vs0, vs2>
4405 vs1 was created previously in this function by a call to
4406 vect_get_vec_def_for_operand and is stored in
4407 vec_initial_def;
4408 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4409 vs0 is created here. */
4410
4411 /* Create vector phi node. */
4412 vect_phi = create_phi_node (vec_initial_def, bb);
4413 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4414 loop_vec_info_for_loop (outer_loop), NULL);
4415 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4416
4417 /* Create vs0 - initial def of the double reduction phi. */
4418 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4419 loop_preheader_edge (outer_loop));
4420 init_def = get_initial_def_for_reduction (stmt,
4421 preheader_arg, NULL);
4422 vect_phi_init = vect_init_vector (use_stmt, init_def,
4423 vectype, NULL);
4424
4425 /* Update phi node arguments with vs0 and vs2. */
4426 add_phi_arg (vect_phi, vect_phi_init,
4427 loop_preheader_edge (outer_loop),
4428 UNKNOWN_LOCATION);
4429 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4430 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4431 if (dump_enabled_p ())
4432 {
4433 dump_printf_loc (MSG_NOTE, vect_location,
4434 "created double reduction phi node: ");
4435 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4436 }
4437
4438 vect_phi_res = PHI_RESULT (vect_phi);
4439
4440 /* Replace the use, i.e., set the correct vs1 in the regular
4441 reduction phi node. FORNOW, NCOPIES is always 1, so the
4442 loop is redundant. */
4443 use = reduction_phi;
4444 for (j = 0; j < ncopies; j++)
4445 {
4446 edge pr_edge = loop_preheader_edge (loop);
4447 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4448 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4449 }
4450 }
4451 }
4452 }
4453
4454 phis.release ();
4455 if (nested_in_vect_loop)
4456 {
4457 if (double_reduc)
4458 loop = outer_loop;
4459 else
4460 continue;
4461 }
4462
4463 phis.create (3);
4464 /* Find the loop-closed-use at the loop exit of the original scalar
4465 result. (The reduction result is expected to have two immediate uses,
4466 one at the latch block, and one at the loop exit). For double
4467 reductions we are looking for exit phis of the outer loop. */
4468 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4469 {
4470 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4471 phis.safe_push (USE_STMT (use_p));
4472 else
4473 {
4474 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4475 {
4476 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4477
4478 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4479 {
4480 if (!flow_bb_inside_loop_p (loop,
4481 gimple_bb (USE_STMT (phi_use_p))))
4482 phis.safe_push (USE_STMT (phi_use_p));
4483 }
4484 }
4485 }
4486 }
4487
4488 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4489 {
4490 /* Replace the uses: */
4491 orig_name = PHI_RESULT (exit_phi);
4492 scalar_result = scalar_results[k];
4493 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4494 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4495 SET_USE (use_p, scalar_result);
4496 }
4497
4498 phis.release ();
4499 }
4500
4501 scalar_results.release ();
4502 inner_phis.release ();
4503 new_phis.release ();
4504 }
4505
4506
4507 /* Function vectorizable_reduction.
4508
4509 Check if STMT performs a reduction operation that can be vectorized.
4510 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4511 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4512 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4513
4514 This function also handles reduction idioms (patterns) that have been
4515 recognized in advance during vect_pattern_recog. In this case, STMT may be
4516 of this form:
4517 X = pattern_expr (arg0, arg1, ..., X)
4518 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4519 sequence that had been detected and replaced by the pattern-stmt (STMT).
4520
4521 In some cases of reduction patterns, the type of the reduction variable X is
4522 different than the type of the other arguments of STMT.
4523 In such cases, the vectype that is used when transforming STMT into a vector
4524 stmt is different than the vectype that is used to determine the
4525 vectorization factor, because it consists of a different number of elements
4526 than the actual number of elements that are being operated upon in parallel.
4527
4528 For example, consider an accumulation of shorts into an int accumulator.
4529 On some targets it's possible to vectorize this pattern operating on 8
4530 shorts at a time (hence, the vectype for purposes of determining the
4531 vectorization factor should be V8HI); on the other hand, the vectype that
4532 is used to create the vector form is actually V4SI (the type of the result).
4533
4534 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4535 indicates the actual level of parallelism (V8HI in the example), so that
4536 the right vectorization factor can be derived. This vectype
4537 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4538 be used to create the vectorized stmt. The right vectype for the vectorized
4539 stmt is obtained from the type of the result X:
4540 get_vectype_for_scalar_type (TREE_TYPE (X))
4541
4542 This means that, contrary to "regular" reductions (or "regular" stmts in
4543 general), the following equation:
4544 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4545 does *NOT* necessarily hold for reduction patterns. */
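/* For instance (hypothetical source), the short-into-int accumulation
   described above comes from code like:

     short a[N];
     int acc = 0;
     for (i = 0; i < N; i++)
       acc += a[i];        // recognized pattern: acc = widen_sum <a[i], acc>

   STMT_VINFO_VECTYPE is then V8HI (eight shorts determine the
   vectorization factor), while the vectorized statement itself is created
   with the V4SI type obtained from TREE_TYPE (acc).  */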
4546
4547 bool
4548 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4549 gimple *vec_stmt, slp_tree slp_node)
4550 {
4551 tree vec_dest;
4552 tree scalar_dest;
4553 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4554 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4555 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4556 tree vectype_in = NULL_TREE;
4557 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4558 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4559 enum tree_code code, orig_code, epilog_reduc_code;
4560 enum machine_mode vec_mode;
4561 int op_type;
4562 optab optab, reduc_optab;
4563 tree new_temp = NULL_TREE;
4564 tree def;
4565 gimple def_stmt;
4566 enum vect_def_type dt;
4567 gimple new_phi = NULL;
4568 tree scalar_type;
4569 bool is_simple_use;
4570 gimple orig_stmt;
4571 stmt_vec_info orig_stmt_info;
4572 tree expr = NULL_TREE;
4573 int i;
4574 int ncopies;
4575 int epilog_copies;
4576 stmt_vec_info prev_stmt_info, prev_phi_info;
4577 bool single_defuse_cycle = false;
4578 tree reduc_def = NULL_TREE;
4579 gimple new_stmt = NULL;
4580 int j;
4581 tree ops[3];
4582 bool nested_cycle = false, found_nested_cycle_def = false;
4583 gimple reduc_def_stmt = NULL;
4584 /* The default is that the reduction variable is the last in statement. */
4585 int reduc_index = 2;
4586 bool double_reduc = false, dummy;
4587 basic_block def_bb;
4588 struct loop * def_stmt_loop, *outer_loop = NULL;
4589 tree def_arg;
4590 gimple def_arg_stmt;
4591 vec<tree> vec_oprnds0 = vNULL;
4592 vec<tree> vec_oprnds1 = vNULL;
4593 vec<tree> vect_defs = vNULL;
4594 vec<gimple> phis = vNULL;
4595 int vec_num;
4596 tree def0, def1, tem, op0, op1 = NULL_TREE;
4597
4598 /* In case of reduction chain we switch to the first stmt in the chain, but
4599 we don't update STMT_INFO, since only the last stmt is marked as reduction
4600 and has reduction properties. */
4601 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4602 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4603
4604 if (nested_in_vect_loop_p (loop, stmt))
4605 {
4606 outer_loop = loop;
4607 loop = loop->inner;
4608 nested_cycle = true;
4609 }
4610
4611 /* 1. Is vectorizable reduction? */
4612 /* Not supportable if the reduction variable is used in the loop, unless
4613 it's a reduction chain. */
4614 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4615 && !GROUP_FIRST_ELEMENT (stmt_info))
4616 return false;
4617
4618 /* Reductions that are not used even in an enclosing outer-loop,
4619 are expected to be "live" (used out of the loop). */
4620 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4621 && !STMT_VINFO_LIVE_P (stmt_info))
4622 return false;
4623
4624 /* Make sure it was already recognized as a reduction computation. */
4625 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4626 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4627 return false;
4628
4629 /* 2. Has this been recognized as a reduction pattern?
4630
4631 Check if STMT represents a pattern that has been recognized
4632 in earlier analysis stages. For stmts that represent a pattern,
4633 the STMT_VINFO_RELATED_STMT field records the last stmt in
4634 the original sequence that constitutes the pattern. */
4635
4636 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4637 if (orig_stmt)
4638 {
4639 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4640 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4641 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4642 }
4643
4644 /* 3. Check the operands of the operation. The first operands are defined
4645 inside the loop body. The last operand is the reduction variable,
4646 which is defined by the loop-header-phi. */
4647
4648 gcc_assert (is_gimple_assign (stmt));
4649
4650 /* Flatten RHS. */
4651 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4652 {
4653 case GIMPLE_SINGLE_RHS:
4654 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4655 if (op_type == ternary_op)
4656 {
4657 tree rhs = gimple_assign_rhs1 (stmt);
4658 ops[0] = TREE_OPERAND (rhs, 0);
4659 ops[1] = TREE_OPERAND (rhs, 1);
4660 ops[2] = TREE_OPERAND (rhs, 2);
4661 code = TREE_CODE (rhs);
4662 }
4663 else
4664 return false;
4665 break;
4666
4667 case GIMPLE_BINARY_RHS:
4668 code = gimple_assign_rhs_code (stmt);
4669 op_type = TREE_CODE_LENGTH (code);
4670 gcc_assert (op_type == binary_op);
4671 ops[0] = gimple_assign_rhs1 (stmt);
4672 ops[1] = gimple_assign_rhs2 (stmt);
4673 break;
4674
4675 case GIMPLE_TERNARY_RHS:
4676 code = gimple_assign_rhs_code (stmt);
4677 op_type = TREE_CODE_LENGTH (code);
4678 gcc_assert (op_type == ternary_op);
4679 ops[0] = gimple_assign_rhs1 (stmt);
4680 ops[1] = gimple_assign_rhs2 (stmt);
4681 ops[2] = gimple_assign_rhs3 (stmt);
4682 break;
4683
4684 case GIMPLE_UNARY_RHS:
4685 return false;
4686
4687 default:
4688 gcc_unreachable ();
4689 }
4690
4691 if (code == COND_EXPR && slp_node)
4692 return false;
4693
4694 scalar_dest = gimple_assign_lhs (stmt);
4695 scalar_type = TREE_TYPE (scalar_dest);
4696 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4697 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4698 return false;
4699
4700 /* Do not try to vectorize bit-precision reductions. */
4701 if ((TYPE_PRECISION (scalar_type)
4702 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4703 return false;
4704
4705 /* All uses but the last are expected to be defined in the loop.
4706 The last use is the reduction variable. In case of nested cycle this
4707 assumption is not true: we use reduc_index to record the index of the
4708 reduction variable. */
4709 for (i = 0; i < op_type - 1; i++)
4710 {
4711 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4712 if (i == 0 && code == COND_EXPR)
4713 continue;
4714
4715 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4716 &def_stmt, &def, &dt, &tem);
4717 if (!vectype_in)
4718 vectype_in = tem;
4719 gcc_assert (is_simple_use);
4720
4721 if (dt != vect_internal_def
4722 && dt != vect_external_def
4723 && dt != vect_constant_def
4724 && dt != vect_induction_def
4725 && !(dt == vect_nested_cycle && nested_cycle))
4726 return false;
4727
4728 if (dt == vect_nested_cycle)
4729 {
4730 found_nested_cycle_def = true;
4731 reduc_def_stmt = def_stmt;
4732 reduc_index = i;
4733 }
4734 }
4735
4736 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4737 &def_stmt, &def, &dt, &tem);
4738 if (!vectype_in)
4739 vectype_in = tem;
4740 gcc_assert (is_simple_use);
4741 if (!(dt == vect_reduction_def
4742 || dt == vect_nested_cycle
4743 || ((dt == vect_internal_def || dt == vect_external_def
4744 || dt == vect_constant_def || dt == vect_induction_def)
4745 && nested_cycle && found_nested_cycle_def)))
4746 {
4747 /* For pattern recognized stmts, orig_stmt might be a reduction,
4748 but some helper statements for the pattern might not, or
4749 might be COND_EXPRs with reduction uses in the condition. */
4750 gcc_assert (orig_stmt);
4751 return false;
4752 }
4753 if (!found_nested_cycle_def)
4754 reduc_def_stmt = def_stmt;
4755
4756 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4757 if (orig_stmt)
4758 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4759 reduc_def_stmt,
4760 !nested_cycle,
4761 &dummy));
4762 else
4763 {
4764 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4765 !nested_cycle, &dummy);
4766 /* We changed STMT to be the first stmt in reduction chain, hence we
4767 check that in this case the first element in the chain is STMT. */
4768 gcc_assert (stmt == tmp
4769 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4770 }
4771
4772 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4773 return false;
4774
4775 if (slp_node || PURE_SLP_STMT (stmt_info))
4776 ncopies = 1;
4777 else
4778 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4779 / TYPE_VECTOR_SUBPARTS (vectype_in));
4780
4781 gcc_assert (ncopies >= 1);
4782
4783 vec_mode = TYPE_MODE (vectype_in);
4784
4785 if (code == COND_EXPR)
4786 {
4787 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4788 {
4789 if (dump_enabled_p ())
4790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4791 "unsupported condition in reduction");
4792
4793 return false;
4794 }
4795 }
4796 else
4797 {
4798 /* 4. Supportable by target? */
4799
4800 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4801 || code == LROTATE_EXPR || code == RROTATE_EXPR)
4802 {
4803 /* Shifts and rotates are only supported by vectorizable_shifts,
4804 not vectorizable_reduction. */
4805 if (dump_enabled_p ())
4806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4807 "unsupported shift or rotation.");
4808 return false;
4809 }
4810
4811 /* 4.1. check support for the operation in the loop */
4812 optab = optab_for_tree_code (code, vectype_in, optab_default);
4813 if (!optab)
4814 {
4815 if (dump_enabled_p ())
4816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4817 "no optab.");
4818
4819 return false;
4820 }
4821
4822 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4823 {
4824 if (dump_enabled_p ())
4825 dump_printf (MSG_NOTE, "op not supported by target.");
4826
4827 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4828 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4829 < vect_min_worthwhile_factor (code))
4830 return false;
4831
4832 if (dump_enabled_p ())
4833 dump_printf (MSG_NOTE, "proceeding using word mode.");
4834 }
4835
4836 /* Worthwhile without SIMD support? */
4837 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4838 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4839 < vect_min_worthwhile_factor (code))
4840 {
4841 if (dump_enabled_p ())
4842 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4843 "not worthwhile without SIMD support.");
4844
4845 return false;
4846 }
4847 }
4848
4849 /* 4.2. Check support for the epilog operation.
4850
4851 If STMT represents a reduction pattern, then the type of the
4852 reduction variable may be different than the type of the rest
4853 of the arguments. For example, consider the case of accumulation
4854 of shorts into an int accumulator. The original code:
4855 S1: int_a = (int) short_a;
4856 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4857
4858 was replaced with:
4859 STMT: int_acc = widen_sum <short_a, int_acc>
4860
4861 This means that:
4862 1. The tree-code that is used to create the vector operation in the
4863 epilog code (that reduces the partial results) is not the
4864 tree-code of STMT, but is rather the tree-code of the original
4865 stmt from the pattern that STMT is replacing. I.e, in the example
4866 above we want to use 'widen_sum' in the loop, but 'plus' in the
4867 epilog.
4868 2. The type (mode) we use to check available target support
4869 for the vector operation to be created in the *epilog*, is
4870 determined by the type of the reduction variable (in the example
4871 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4872 However the type (mode) we use to check available target support
4873 for the vector operation to be created *inside the loop*, is
4874 determined by the type of the other arguments to STMT (in the
4875 example we'd check this: optab_handler (widen_sum_optab,
4876 vect_short_mode)).
4877
4878 This is contrary to "regular" reductions, in which the types of all
4879 the arguments are the same as the type of the reduction variable.
4880 For "regular" reductions we can therefore use the same vector type
4881 (and also the same tree-code) when generating the epilog code and
4882 when generating the code inside the loop. */
4883
4884 if (orig_stmt)
4885 {
4886 /* This is a reduction pattern: get the vectype from the type of the
4887 reduction variable, and get the tree-code from orig_stmt. */
4888 orig_code = gimple_assign_rhs_code (orig_stmt);
4889 gcc_assert (vectype_out);
4890 vec_mode = TYPE_MODE (vectype_out);
4891 }
4892 else
4893 {
4894 /* Regular reduction: the same vectype and tree-code that are used for
4895 the vector code inside the loop can also be used for the epilog code. */
4896 orig_code = code;
4897 }
4898
4899 if (nested_cycle)
4900 {
4901 def_bb = gimple_bb (reduc_def_stmt);
4902 def_stmt_loop = def_bb->loop_father;
4903 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4904 loop_preheader_edge (def_stmt_loop));
4905 if (TREE_CODE (def_arg) == SSA_NAME
4906 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4907 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4908 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4909 && vinfo_for_stmt (def_arg_stmt)
4910 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4911 == vect_double_reduction_def)
4912 double_reduc = true;
4913 }
4914
4915 epilog_reduc_code = ERROR_MARK;
4916 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4917 {
4918 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4919 optab_default);
4920 if (!reduc_optab)
4921 {
4922 if (dump_enabled_p ())
4923 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4924 "no optab for reduction.");
4925
4926 epilog_reduc_code = ERROR_MARK;
4927 }
4928
4929 if (reduc_optab
4930 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4931 {
4932 if (dump_enabled_p ())
4933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4934 "reduc op not supported by target.");
4935
4936 epilog_reduc_code = ERROR_MARK;
4937 }
4938 }
4939 else
4940 {
4941 if (!nested_cycle || double_reduc)
4942 {
4943 if (dump_enabled_p ())
4944 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4945 "no reduc code for scalar code.");
4946
4947 return false;
4948 }
4949 }
4950
4951 if (double_reduc && ncopies > 1)
4952 {
4953 if (dump_enabled_p ())
4954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4955 "multiple types in double reduction");
4956
4957 return false;
4958 }
4959
4960 /* In case of widening multiplication by a constant, we update the type
4961 of the constant to be the type of the other operand. We check that the
4962 constant fits the type in the pattern recognition pass. */
4963 if (code == DOT_PROD_EXPR
4964 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4965 {
4966 if (TREE_CODE (ops[0]) == INTEGER_CST)
4967 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
4968 else if (TREE_CODE (ops[1]) == INTEGER_CST)
4969 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
4970 else
4971 {
4972 if (dump_enabled_p ())
4973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4974 "invalid types in dot-prod");
4975
4976 return false;
4977 }
4978 }
4979
4980 if (!vec_stmt) /* transformation not required. */
4981 {
4982 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4983 return false;
4984 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4985 return true;
4986 }
4987
4988 /** Transform. **/
4989
4990 if (dump_enabled_p ())
4991 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.");
4992
4993 /* FORNOW: Multiple types are not supported for condition. */
4994 if (code == COND_EXPR)
4995 gcc_assert (ncopies == 1);
4996
4997 /* Create the destination vector */
4998 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4999
5000 /* In case the vectorization factor (VF) is bigger than the number
5001 of elements that we can fit in a vectype (nunits), we have to generate
5002 more than one vector stmt - i.e - we need to "unroll" the
5003 vector stmt by a factor VF/nunits. For more details see documentation
5004 in vectorizable_operation. */
5005
5006 /* If the reduction is used in an outer loop we need to generate
5007 VF intermediate results, like so (e.g. for ncopies=2):
5008 r0 = phi (init, r0)
5009 r1 = phi (init, r1)
5010 r0 = x0 + r0;
5011 r1 = x1 + r1;
5012 (i.e. we generate VF results in 2 registers).
5013 In this case we have a separate def-use cycle for each copy, and therefore
5014 for each copy we get the vector def for the reduction variable from the
5015 respective phi node created for this copy.
5016
5017 Otherwise (the reduction is unused in the loop nest), we can combine
5018 together intermediate results, like so (e.g. for ncopies=2):
5019 r = phi (init, r)
5020 r = x0 + r;
5021 r = x1 + r;
5022 (i.e. we generate VF/2 results in a single register).
5023 In this case for each copy we get the vector def for the reduction variable
5024 from the vectorized reduction operation generated in the previous iteration.
5025 */
5026
5027 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5028 {
5029 single_defuse_cycle = true;
5030 epilog_copies = 1;
5031 }
5032 else
5033 epilog_copies = ncopies;
5034
5035 prev_stmt_info = NULL;
5036 prev_phi_info = NULL;
5037 if (slp_node)
5038 {
5039 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5040 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5041 == TYPE_VECTOR_SUBPARTS (vectype_in));
5042 }
5043 else
5044 {
5045 vec_num = 1;
5046 vec_oprnds0.create (1);
5047 if (op_type == ternary_op)
5048 vec_oprnds1.create (1);
5049 }
5050
5051 phis.create (vec_num);
5052 vect_defs.create (vec_num);
5053 if (!slp_node)
5054 vect_defs.quick_push (NULL_TREE);
5055
5056 for (j = 0; j < ncopies; j++)
5057 {
5058 if (j == 0 || !single_defuse_cycle)
5059 {
5060 for (i = 0; i < vec_num; i++)
5061 {
5062 /* Create the reduction-phi that defines the reduction
5063 operand. */
5064 new_phi = create_phi_node (vec_dest, loop->header);
5065 set_vinfo_for_stmt (new_phi,
5066 new_stmt_vec_info (new_phi, loop_vinfo,
5067 NULL));
5068 if (j == 0 || slp_node)
5069 phis.quick_push (new_phi);
5070 }
5071 }
5072
5073 if (code == COND_EXPR)
5074 {
5075 gcc_assert (!slp_node);
5076 vectorizable_condition (stmt, gsi, vec_stmt,
5077 PHI_RESULT (phis[0]),
5078 reduc_index, NULL);
5079 /* Multiple types are not supported for condition. */
5080 break;
5081 }
5082
5083 /* Handle uses. */
5084 if (j == 0)
5085 {
5086 op0 = ops[!reduc_index];
5087 if (op_type == ternary_op)
5088 {
5089 if (reduc_index == 0)
5090 op1 = ops[2];
5091 else
5092 op1 = ops[1];
5093 }
5094
5095 if (slp_node)
5096 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5097 slp_node, -1);
5098 else
5099 {
5100 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5101 stmt, NULL);
5102 vec_oprnds0.quick_push (loop_vec_def0);
5103 if (op_type == ternary_op)
5104 {
5105 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5106 NULL);
5107 vec_oprnds1.quick_push (loop_vec_def1);
5108 }
5109 }
5110 }
5111 else
5112 {
5113 if (!slp_node)
5114 {
5115 enum vect_def_type dt;
5116 gimple dummy_stmt;
5117 tree dummy;
5118
5119 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5120 &dummy_stmt, &dummy, &dt);
5121 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5122 loop_vec_def0);
5123 vec_oprnds0[0] = loop_vec_def0;
5124 if (op_type == ternary_op)
5125 {
5126 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5127 &dummy, &dt);
5128 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5129 loop_vec_def1);
5130 vec_oprnds1[0] = loop_vec_def1;
5131 }
5132 }
5133
5134 if (single_defuse_cycle)
5135 reduc_def = gimple_assign_lhs (new_stmt);
5136
5137 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5138 }
5139
5140 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5141 {
5142 if (slp_node)
5143 reduc_def = PHI_RESULT (phis[i]);
5144 else
5145 {
5146 if (!single_defuse_cycle || j == 0)
5147 reduc_def = PHI_RESULT (new_phi);
5148 }
5149
5150 def1 = ((op_type == ternary_op)
5151 ? vec_oprnds1[i] : NULL);
5152 if (op_type == binary_op)
5153 {
5154 if (reduc_index == 0)
5155 expr = build2 (code, vectype_out, reduc_def, def0);
5156 else
5157 expr = build2 (code, vectype_out, def0, reduc_def);
5158 }
5159 else
5160 {
5161 if (reduc_index == 0)
5162 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5163 else
5164 {
5165 if (reduc_index == 1)
5166 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5167 else
5168 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5169 }
5170 }
5171
5172 new_stmt = gimple_build_assign (vec_dest, expr);
5173 new_temp = make_ssa_name (vec_dest, new_stmt);
5174 gimple_assign_set_lhs (new_stmt, new_temp);
5175 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5176
5177 if (slp_node)
5178 {
5179 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5180 vect_defs.quick_push (new_temp);
5181 }
5182 else
5183 vect_defs[0] = new_temp;
5184 }
5185
5186 if (slp_node)
5187 continue;
5188
5189 if (j == 0)
5190 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5191 else
5192 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5193
5194 prev_stmt_info = vinfo_for_stmt (new_stmt);
5195 prev_phi_info = vinfo_for_stmt (new_phi);
5196 }
5197
5198 /* Finalize the reduction-phi (set its arguments) and create the
5199 epilog reduction code. */
5200 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5201 {
5202 new_temp = gimple_assign_lhs (*vec_stmt);
5203 vect_defs[0] = new_temp;
5204 }
5205
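  /* For a plain sum reduction, for example, the epilog code created here
     combines the VF partial sums held in the lanes of the final vector
     def into a single scalar (via a reduction operation or an
     extract/shift sequence) and wires that scalar to the uses of the
     original reduction result outside the loop.  */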
5206 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5207 epilog_reduc_code, phis, reduc_index,
5208 double_reduc, slp_node);
5209
5210 phis.release ();
5211 vect_defs.release ();
5212 vec_oprnds0.release ();
5213 vec_oprnds1.release ();
5214
5215 return true;
5216 }
5217
5218 /* Function vect_min_worthwhile_factor.
5219
5220 For a loop where we could vectorize the operation indicated by CODE,
5221 return the minimum vectorization factor that makes it worthwhile
5222 to use generic vectors. */
5223 int
5224 vect_min_worthwhile_factor (enum tree_code code)
5225 {
5226 switch (code)
5227 {
5228 case PLUS_EXPR:
5229 case MINUS_EXPR:
5230 case NEGATE_EXPR:
5231 return 4;
5232
5233 case BIT_AND_EXPR:
5234 case BIT_IOR_EXPR:
5235 case BIT_XOR_EXPR:
5236 case BIT_NOT_EXPR:
5237 return 2;
5238
5239 default:
5240 return INT_MAX;
5241 }
5242 }
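
/* For example, with the factors above, a caller considering generic
   (emulated) vector lowering might guard the transformation roughly as

     if (!VECTOR_MODE_P (TYPE_MODE (vectype))
         && vf < vect_min_worthwhile_factor (code))
       return false;

   so a two-element addition (vf == 2 < 4) would not be considered
   worthwhile.  This is only an illustrative sketch of how the returned
   threshold is meant to be consumed, not a copy of an actual call site.  */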
5243
5244
5245 /* Function vectorizable_induction
5246
5247 Check if PHI performs an induction computation that can be vectorized.
5248 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5249 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5250 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
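
/* For instance (illustrative only), with VF == 4 a scalar induction

     i_1 = PHI <0 (preheader), i_2 (latch)>
     i_2 = i_1 + 1;

   is vectorized by creating a vector PHI whose initial value is the
   vector { 0, 1, 2, 3 } and whose latch value adds the step vector
   { 4, 4, 4, 4 } on every vector iteration; the details live in
   get_initial_def_for_induction.  */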
5251
5252 bool
5253 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5254 gimple *vec_stmt)
5255 {
5256 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5257 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5258 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5259 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5260 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5261 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5262 tree vec_def;
5263
5264 gcc_assert (ncopies >= 1);
5265 /* FORNOW. These restrictions should be relaxed. */
5266 if (nested_in_vect_loop_p (loop, phi))
5267 {
5268 imm_use_iterator imm_iter;
5269 use_operand_p use_p;
5270 gimple exit_phi;
5271 edge latch_e;
5272 tree loop_arg;
5273
5274 if (ncopies > 1)
5275 {
5276 if (dump_enabled_p ())
5277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5278 "multiple types in nested loop.");
5279 return false;
5280 }
5281
5282 exit_phi = NULL;
5283 latch_e = loop_latch_edge (loop->inner);
5284 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5285 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5286 {
5287 if (!flow_bb_inside_loop_p (loop->inner,
5288 gimple_bb (USE_STMT (use_p))))
5289 {
5290 exit_phi = USE_STMT (use_p);
5291 break;
5292 }
5293 }
5294 if (exit_phi)
5295 {
5296 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5297 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5298 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5299 {
5300 if (dump_enabled_p ())
5301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5302 "inner-loop induction only used outside "
5303 "of the outer vectorized loop.");
5304 return false;
5305 }
5306 }
5307 }
5308
5309 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5310 return false;
5311
5312 /* FORNOW: SLP not supported. */
5313 if (STMT_SLP_TYPE (stmt_info))
5314 return false;
5315
5316 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5317
5318 if (gimple_code (phi) != GIMPLE_PHI)
5319 return false;
5320
5321 if (!vec_stmt) /* transformation not required. */
5322 {
5323 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5324 if (dump_enabled_p ())
5325 dump_printf_loc (MSG_NOTE, vect_location,
5326 "=== vectorizable_induction ===");
5327 vect_model_induction_cost (stmt_info, ncopies);
5328 return true;
5329 }
5330
5331 /** Transform. **/
5332
5333 if (dump_enabled_p ())
5334 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.");
5335
5336 vec_def = get_initial_def_for_induction (phi);
5337 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5338 return true;
5339 }
5340
5341 /* Function vectorizable_live_operation.
5342
5343 STMT computes a value that is used outside the loop. Check if
5344 it can be supported. */
5345
5346 bool
5347 vectorizable_live_operation (gimple stmt,
5348 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5349 gimple *vec_stmt ATTRIBUTE_UNUSED)
5350 {
5351 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5352 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5353 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5354 int i;
5355 int op_type;
5356 tree op;
5357 tree def;
5358 gimple def_stmt;
5359 enum vect_def_type dt;
5360 enum tree_code code;
5361 enum gimple_rhs_class rhs_class;
5362
5363 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5364
5365 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5366 return false;
5367
5368 if (!is_gimple_assign (stmt))
5369 return false;
5370
5371 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5372 return false;
5373
5374 /* FORNOW. CHECKME. */
5375 if (nested_in_vect_loop_p (loop, stmt))
5376 return false;
5377
5378 code = gimple_assign_rhs_code (stmt);
5379 op_type = TREE_CODE_LENGTH (code);
5380 rhs_class = get_gimple_rhs_class (code);
5381 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5382 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5383
5384 /* FORNOW: support only if all uses are invariant. This means
5385 that the scalar operations can remain in place, unvectorized.
5386 The original last scalar value that they compute will be used. */
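
   /* For example (a sketch), a live stmt such as

        lhs_1 = x_2 + y_3;

      where x_2 and y_3 are defined before the loop (vect_external_def)
      or are constants (vect_constant_def) passes the check below: the
      scalar stmt is simply left in place, and its last computed value is
      what the uses outside the loop see.  */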
5387
5388 for (i = 0; i < op_type; i++)
5389 {
5390 if (rhs_class == GIMPLE_SINGLE_RHS)
5391 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5392 else
5393 op = gimple_op (stmt, i + 1);
5394 if (op
5395 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5396 &dt))
5397 {
5398 if (dump_enabled_p ())
5399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5400 "use not simple.");
5401 return false;
5402 }
5403
5404 if (dt != vect_external_def && dt != vect_constant_def)
5405 return false;
5406 }
5407
5408 /* No transformation is required for the cases we currently support. */
5409 return true;
5410 }
5411
5412 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
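
/* The point is that STMT is about to be transformed (or removed) by the
   vectorizer, so a debug bind outside LOOP that still references its
   scalar SSA result would no longer describe a value the generated code
   computes; resetting the bind marks the variable as optimized away
   instead of leaving wrong debug info behind.  */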
5413
5414 static void
5415 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5416 {
5417 ssa_op_iter op_iter;
5418 imm_use_iterator imm_iter;
5419 def_operand_p def_p;
5420 gimple ustmt;
5421
5422 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5423 {
5424 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5425 {
5426 basic_block bb;
5427
5428 if (!is_gimple_debug (ustmt))
5429 continue;
5430
5431 bb = gimple_bb (ustmt);
5432
5433 if (!flow_bb_inside_loop_p (loop, bb))
5434 {
5435 if (gimple_debug_bind_p (ustmt))
5436 {
5437 if (dump_enabled_p ())
5438 dump_printf_loc (MSG_NOTE, vect_location,
5439 "killing debug use");
5440
5441 gimple_debug_bind_reset_value (ustmt);
5442 update_stmt (ustmt);
5443 }
5444 else
5445 gcc_unreachable ();
5446 }
5447 }
5448 }
5449 }
5450
5451 /* Function vect_transform_loop.
5452
5453 The analysis phase has determined that the loop is vectorizable.
5454 Vectorize the loop - create vectorized stmts to replace the scalar
5455 stmts in the loop, and update the loop exit condition. */
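
/* A rough outline of the steps performed below (for orientation only):
   1) decide whether a runtime profitability check is still required;
   2) peel for alignment and/or apply loop versioning if needed;
   3) create an epilog loop when the iteration count is symbolic or not
      a multiple of the vectorization factor;
   4) walk each basic block and vectorize the relevant PHIs and stmts
      (scheduling SLP instances when the first SLP stmt is reached);
   5) update the iteration count, the profile and the niter bounds.  */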
5456
5457 void
5458 vect_transform_loop (loop_vec_info loop_vinfo)
5459 {
5460 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5461 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5462 int nbbs = loop->num_nodes;
5463 gimple_stmt_iterator si;
5464 int i;
5465 tree ratio = NULL;
5466 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5467 bool grouped_store;
5468 bool slp_scheduled = false;
5469 unsigned int nunits;
5470 gimple stmt, pattern_stmt;
5471 gimple_seq pattern_def_seq = NULL;
5472 gimple_stmt_iterator pattern_def_si = gsi_none ();
5473 bool transform_pattern_stmt = false;
5474 bool check_profitability = false;
5475 int th;
5476 /* Record the number of iterations before we start tampering with the profile. */
5477 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5478
5479 if (dump_enabled_p ())
5480 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===");
5481
5482 /* If the profile is imprecise, we have a chance to fix it up. */
5483 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5484 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5485
5486 /* Use the more conservative vectorization threshold. If the number
5487 of iterations is constant, assume the cost check has been performed
5488 by our caller. If the threshold makes all loops profitable that
5489 run at least the vectorization factor number of times, checking
5490 is pointless, too. */
5491 th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5492 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5493 th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5494 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5495 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5496 {
5497 if (dump_enabled_p ())
5498 dump_printf_loc (MSG_NOTE, vect_location,
5499 "Profitability threshold is %d loop iterations.", th);
5500 check_profitability = true;
5501 }
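
  /* A worked example (illustrative): with --param min-vect-loop-bound=2
     and a vectorization factor of 4, th = 2 * 4 - 1 = 7 (unless the cost
     model's minimum is larger); since 7 >= 4 - 1, a runtime
     profitability check guarded by this threshold is emitted whenever
     the iteration count is not a compile-time constant.  */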
5502
5503 /* Peel the loop if there are data refs with unknown alignment.
5504 Only one data ref with unknown alignment is allowed. */
5505
5506 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5507 {
5508 vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
5509 check_profitability = false;
5510 }
5511
5512 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5513 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5514 {
5515 vect_loop_versioning (loop_vinfo, th, check_profitability);
5516 check_profitability = false;
5517 }
5518
5519 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5520 compile-time constant), or it is a constant that is not divisible by the
5521 vectorization factor, then an epilog loop needs to be created.
5522 We therefore duplicate the loop: the original loop will be vectorized,
5523 and will compute the first (n/VF) iterations. The second copy of the loop
5524 will remain scalar and will compute the remaining (n%VF) iterations.
5525 (VF is the vectorization factor). */
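
  /* For example, with n == 103 and VF == 8 the vectorized loop executes
     floor (103/8) == 12 iterations (covering 96 scalar iterations) and
     the scalar epilog loop executes the remaining 103 % 8 == 7
     iterations.  */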
5526
5527 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5528 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5529 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5530 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5531 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5532 th, check_profitability);
5533 else
5534 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5535 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5536
5537 /* 1) Make sure the loop header has exactly two entries
5538 2) Make sure we have a preheader basic block. */
5539
5540 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5541
5542 split_edge (loop_preheader_edge (loop));
5543
5544 /* FORNOW: the vectorizer supports only loops whose body consists
5545 of one basic block (header + empty latch). When the vectorizer
5546 supports more involved loop forms, the order in which the BBs are
5547 traversed needs to be reconsidered. */
5548
5549 for (i = 0; i < nbbs; i++)
5550 {
5551 basic_block bb = bbs[i];
5552 stmt_vec_info stmt_info;
5553 gimple phi;
5554
5555 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5556 {
5557 phi = gsi_stmt (si);
5558 if (dump_enabled_p ())
5559 {
5560 dump_printf_loc (MSG_NOTE, vect_location,
5561 "------>vectorizing phi: ");
5562 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5563 }
5564 stmt_info = vinfo_for_stmt (phi);
5565 if (!stmt_info)
5566 continue;
5567
5568 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5569 vect_loop_kill_debug_uses (loop, phi);
5570
5571 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5572 && !STMT_VINFO_LIVE_P (stmt_info))
5573 continue;
5574
5575 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5576 != (unsigned HOST_WIDE_INT) vectorization_factor)
5577 && dump_enabled_p ())
5578 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.");
5579
5580 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5581 {
5582 if (dump_enabled_p ())
5583 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.");
5584 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5585 }
5586 }
5587
5588 pattern_stmt = NULL;
5589 for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5590 {
5591 bool is_store;
5592
5593 if (transform_pattern_stmt)
5594 stmt = pattern_stmt;
5595 else
5596 stmt = gsi_stmt (si);
5597
5598 if (dump_enabled_p ())
5599 {
5600 dump_printf_loc (MSG_NOTE, vect_location,
5601 "------>vectorizing statement: ");
5602 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5603 }
5604
5605 stmt_info = vinfo_for_stmt (stmt);
5606
5607 /* vector stmts created in the outer-loop during vectorization of
5608 stmts in an inner-loop may not have a stmt_info, and do not
5609 need to be vectorized. */
5610 if (!stmt_info)
5611 {
5612 gsi_next (&si);
5613 continue;
5614 }
5615
5616 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5617 vect_loop_kill_debug_uses (loop, stmt);
5618
5619 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5620 && !STMT_VINFO_LIVE_P (stmt_info))
5621 {
5622 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5623 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5624 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5625 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5626 {
5627 stmt = pattern_stmt;
5628 stmt_info = vinfo_for_stmt (stmt);
5629 }
5630 else
5631 {
5632 gsi_next (&si);
5633 continue;
5634 }
5635 }
5636 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5637 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5638 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5639 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5640 transform_pattern_stmt = true;
5641
5642 /* If pattern statement has def stmts, vectorize them too. */
5643 if (is_pattern_stmt_p (stmt_info))
5644 {
5645 if (pattern_def_seq == NULL)
5646 {
5647 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5648 pattern_def_si = gsi_start (pattern_def_seq);
5649 }
5650 else if (!gsi_end_p (pattern_def_si))
5651 gsi_next (&pattern_def_si);
5652 if (pattern_def_seq != NULL)
5653 {
5654 gimple pattern_def_stmt = NULL;
5655 stmt_vec_info pattern_def_stmt_info = NULL;
5656
5657 while (!gsi_end_p (pattern_def_si))
5658 {
5659 pattern_def_stmt = gsi_stmt (pattern_def_si);
5660 pattern_def_stmt_info
5661 = vinfo_for_stmt (pattern_def_stmt);
5662 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5663 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5664 break;
5665 gsi_next (&pattern_def_si);
5666 }
5667
5668 if (!gsi_end_p (pattern_def_si))
5669 {
5670 if (dump_enabled_p ())
5671 {
5672 dump_printf_loc (MSG_NOTE, vect_location,
5673 "==> vectorizing pattern def "
5674 "stmt: ");
5675 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5676 pattern_def_stmt, 0);
5677 }
5678
5679 stmt = pattern_def_stmt;
5680 stmt_info = pattern_def_stmt_info;
5681 }
5682 else
5683 {
5684 pattern_def_si = gsi_none ();
5685 transform_pattern_stmt = false;
5686 }
5687 }
5688 else
5689 transform_pattern_stmt = false;
5690 }
5691
5692 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5693 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5694 STMT_VINFO_VECTYPE (stmt_info));
5695 if (!STMT_SLP_TYPE (stmt_info)
5696 && nunits != (unsigned int) vectorization_factor
5697 && dump_enabled_p ())
5698 /* For SLP, VF is set according to the unrolling factor, and not to
5699 the vector size, hence for SLP this print is not valid. */
5700 dump_printf_loc (MSG_NOTE, vect_location,
5701 "multiple-types.");
5702
5703 /* SLP. Schedule all the SLP instances when the first SLP stmt is
5704 reached. */
5705 if (STMT_SLP_TYPE (stmt_info))
5706 {
5707 if (!slp_scheduled)
5708 {
5709 slp_scheduled = true;
5710
5711 if (dump_enabled_p ())
5712 dump_printf_loc (MSG_NOTE, vect_location,
5713 "=== scheduling SLP instances ===");
5714
5715 vect_schedule_slp (loop_vinfo, NULL);
5716 }
5717
5718 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
5719 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5720 {
5721 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5722 {
5723 pattern_def_seq = NULL;
5724 gsi_next (&si);
5725 }
5726 continue;
5727 }
5728 }
5729
5730 /* -------- vectorize statement ------------ */
5731 if (dump_enabled_p ())
5732 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.");
5733
5734 grouped_store = false;
5735 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
5736 if (is_store)
5737 {
5738 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5739 {
5740 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5741 interleaving chain was completed - free all the stores in
5742 the chain. */
5743 gsi_next (&si);
5744 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5745 continue;
5746 }
5747 else
5748 {
5749 /* Free the attached stmt_vec_info and remove the stmt. */
5750 gimple store = gsi_stmt (si);
5751 free_stmt_vec_info (store);
5752 unlink_stmt_vdef (store);
5753 gsi_remove (&si, true);
5754 release_defs (store);
5755 continue;
5756 }
5757 }
5758
5759 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5760 {
5761 pattern_def_seq = NULL;
5762 gsi_next (&si);
5763 }
5764 } /* stmts in BB */
5765 } /* BBs in loop */
5766
5767 slpeel_make_loop_iterate_ntimes (loop, ratio);
5768
5769 /* Reduce loop iterations by the vectorization factor. */
5770 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
5771 expected_iterations / vectorization_factor);
5772 loop->nb_iterations_upper_bound
5773 = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
5774 FLOOR_DIV_EXPR);
5775 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5776 && loop->nb_iterations_upper_bound != double_int_zero)
5777 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - double_int_one;
5778 if (loop->any_estimate)
5779 {
5780 loop->nb_iterations_estimate
5781 = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (vectorization_factor),
5782 FLOOR_DIV_EXPR);
5783 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5784 && loop->nb_iterations_estimate != double_int_zero)
5785 loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
5786 }
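
  /* E.g. if the recorded upper bound was 103 scalar iterations and the
     vectorization factor is 8, the vector loop's bound becomes
     floor (103/8) == 12, minus one more when peeling for gaps forces the
     last vector iteration into the scalar epilog.  */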
5787
5788 if (dump_enabled_p ())
5789 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, "LOOP VECTORIZED.");
5790 if (loop->inner && dump_enabled_p ())
5791 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5792 "OUTER LOOP VECTORIZED.");
5793 }