ipa-cp.c (ipcp_cloning_candidate_p): Use opt_for_fn.
[gcc.git] / gcc / tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2014 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "tree.h"
28 #include "stor-layout.h"
29 #include "predict.h"
30 #include "vec.h"
31 #include "hashtab.h"
32 #include "hash-set.h"
33 #include "machmode.h"
34 #include "hard-reg-set.h"
35 #include "input.h"
36 #include "function.h"
37 #include "dominance.h"
38 #include "cfg.h"
39 #include "cfganal.h"
40 #include "basic-block.h"
41 #include "gimple-pretty-print.h"
42 #include "tree-ssa-alias.h"
43 #include "internal-fn.h"
44 #include "gimple-expr.h"
45 #include "is-a.h"
46 #include "gimple.h"
47 #include "gimplify.h"
48 #include "gimple-iterator.h"
49 #include "gimplify-me.h"
50 #include "gimple-ssa.h"
51 #include "tree-phinodes.h"
52 #include "ssa-iterators.h"
53 #include "stringpool.h"
54 #include "tree-ssanames.h"
55 #include "tree-ssa-loop-ivopts.h"
56 #include "tree-ssa-loop-manip.h"
57 #include "tree-ssa-loop-niter.h"
58 #include "tree-pass.h"
59 #include "cfgloop.h"
60 #include "expr.h"
61 #include "recog.h"
62 #include "insn-codes.h"
63 #include "optabs.h"
64 #include "params.h"
65 #include "diagnostic-core.h"
66 #include "tree-chrec.h"
67 #include "tree-scalar-evolution.h"
68 #include "tree-vectorizer.h"
69 #include "target.h"
70
71 /* Loop Vectorization Pass.
72
73 This pass tries to vectorize loops.
74
75 For example, the vectorizer transforms the following simple loop:
76
77 short a[N]; short b[N]; short c[N]; int i;
78
79 for (i=0; i<N; i++){
80 a[i] = b[i] + c[i];
81 }
82
83 as if it was manually vectorized by rewriting the source code into:
84
85 typedef int __attribute__((mode(V8HI))) v8hi;
86 short a[N]; short b[N]; short c[N]; int i;
87 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
88 v8hi va, vb, vc;
89
90 for (i=0; i<N/8; i++){
91 vb = pb[i];
92 vc = pc[i];
93 va = vb + vc;
94 pa[i] = va;
95 }
96
97 The main entry to this pass is vectorize_loops(), in which
98 the vectorizer applies a set of analyses on a given set of loops,
99 followed by the actual vectorization transformation for the loops that
100 had successfully passed the analysis phase.
101 Throughout this pass we make a distinction between two types of
102 data: scalars (which are represented by SSA_NAMES), and memory references
103 ("data-refs"). These two types of data require different handling both
104 during analysis and transformation. The types of data-refs that the
105 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
106 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
107 accesses are required to have a simple (consecutive) access pattern.
108
109 Analysis phase:
110 ===============
111 The driver for the analysis phase is vect_analyze_loop().
112 It applies a set of analyses, some of which rely on the scalar evolution
113 analyzer (scev) developed by Sebastian Pop.
114
115 During the analysis phase the vectorizer records some information
116 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
117 loop, as well as general information about the loop as a whole, which is
118 recorded in a "loop_vec_info" struct attached to each loop.
119
120 Transformation phase:
121 =====================
122 The loop transformation phase scans all the stmts in the loop, and
123 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
124 the loop that needs to be vectorized. It inserts the vector code sequence
125 just before the scalar stmt S, and records a pointer to the vector code
126 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
127 attached to S). This pointer will be used for the vectorization of following
128 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
129 otherwise, we rely on dead code elimination for removing it.
130
131 For example, say stmt S1 was vectorized into stmt VS1:
132
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 S2: a = b;
136
137 To vectorize stmt S2, the vectorizer first finds the stmt that defines
138 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
139 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
140 resulting sequence would be:
141
142 VS1: vb = px[i];
143 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
144 VS2: va = vb;
145 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
146
147 Operands that are not SSA_NAMEs, are data-refs that appear in
148 load/store operations (like 'x[i]' in S1), and are handled differently.
149
150 Target modeling:
151 =================
152 Currently the only target specific information that is used is the
153 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
154 Targets that can support different sizes of vectors, for now will need
155 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
156 flexibility will be added in the future.
157
158 Since we only vectorize operations whose vector form can be
159 expressed using existing tree codes, to verify that an operation is
160 supported, the vectorizer checks the relevant optab at the relevant
161 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
162 the value found is CODE_FOR_nothing, then there's no target support, and
163 we can't vectorize the stmt.
164
165 For additional information on this project see:
166 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
167 */
168
/* Forward declaration.  The definition is not visible in this part of the
   file; the int * parameters are presumably outputs -- confirm at the
   definition.  */
169 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
170
171 /* Function vect_determine_vectorization_factor
172
173 Determine the vectorization factor (VF). VF is the number of data elements
174 that are operated upon in parallel in a single iteration of the vectorized
175 loop. For example, when vectorizing a loop that operates on 4byte elements,
176 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
177 elements can fit in a single vector register.
178
179 This function walks every basic block of the loop. Each relevant phi, and
180 each relevant or live stmt (or the pattern stmt that replaces it), contributes
181 the element count of a vector type; VF is the largest such count. A stmt
182 whose own vector type and VF vector type differ in size makes analysis fail.
183
184 VF is also the factor by which the loop iterations are strip-mined, e.g.:
185 original loop:
186 for (i=0; i<N; i++){
187 a[i] = b[i] + c[i];
188 }
189
190 vectorized loop:
191 for (i=0; i<N; i+=VF){
192 a[i:VF] = b[i:VF] + c[i:VF];
193 }

Returns true and stores VF in LOOP_VINFO_VECT_FACTOR (LOOP_VINFO) on success.
Returns false when a required vector type is unavailable, when a stmt is
irregular (a non-call without lhs) or its expression type already has a
vector mode, when vector sizes differ within a stmt, or when the resulting
VF is <= 1.
194 */
195
196 static bool
197 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
198 {
199 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
200 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
201 int nbbs = loop->num_nodes;
202 gimple_stmt_iterator si;
203 unsigned int vectorization_factor = 0; /* Largest nunits seen so far; 0 = none yet. */
204 tree scalar_type;
205 gimple phi;
206 tree vectype;
207 unsigned int nunits;
208 stmt_vec_info stmt_info;
209 int i;
210 HOST_WIDE_INT dummy; /* Receives outputs of vect_get_smallest_scalar_type that are not read here. */
211 gimple stmt, pattern_stmt = NULL;
212 gimple_seq pattern_def_seq = NULL;
213 gimple_stmt_iterator pattern_def_si = gsi_none ();
214 bool analyze_pattern_stmt = false; /* True while PATTERN_STMT is still to be analyzed at the current SI position. */
215
216 if (dump_enabled_p ())
217 dump_printf_loc (MSG_NOTE, vect_location,
218 "=== vect_determine_vectorization_factor ===\n");
219
220 for (i = 0; i < nbbs; i++)
221 {
222 basic_block bb = bbs[i];
223
/* First the PHI nodes of this block: only relevant ones contribute.  */
224 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
225 {
226 phi = gsi_stmt (si);
227 stmt_info = vinfo_for_stmt (phi);
228 if (dump_enabled_p ())
229 {
230 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
231 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
232 dump_printf (MSG_NOTE, "\n");
233 }
234
235 gcc_assert (stmt_info);
236
237 if (STMT_VINFO_RELEVANT_P (stmt_info))
238 {
/* A relevant phi must not have a vectype yet; derive it from the type
   of the phi result.  */
239 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
240 scalar_type = TREE_TYPE (PHI_RESULT (phi));
241
242 if (dump_enabled_p ())
243 {
244 dump_printf_loc (MSG_NOTE, vect_location,
245 "get vectype for scalar type: ");
246 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
247 dump_printf (MSG_NOTE, "\n");
248 }
249
250 vectype = get_vectype_for_scalar_type (scalar_type);
251 if (!vectype)
252 {
253 if (dump_enabled_p ())
254 {
255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
256 "not vectorized: unsupported "
257 "data-type ");
258 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
259 scalar_type);
260 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
261 }
262 return false;
263 }
264 STMT_VINFO_VECTYPE (stmt_info) = vectype;
265
266 if (dump_enabled_p ())
267 {
268 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
269 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
270 dump_printf (MSG_NOTE, "\n");
271 }
272
273 nunits = TYPE_VECTOR_SUBPARTS (vectype);
/* NOTE(review): nunits (and vectorization_factor below) are unsigned int
   but are printed with %d.  */
274 if (dump_enabled_p ())
275 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
276 nunits);
277
/* Keep the running maximum.  */
278 if (!vectorization_factor
279 || (nunits > vectorization_factor))
280 vectorization_factor = nunits;
281 }
282 }
283
/* Then the non-PHI stmts.  SI is advanced explicitly inside the body (the
   for header has no increment), so one position can be processed several
   times: for the stmt itself, for each relevant or live stmt of a pattern
   def sequence, and for the pattern stmt.  */
284 for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
285 {
286 tree vf_vectype;
287
288 if (analyze_pattern_stmt)
289 stmt = pattern_stmt;
290 else
291 stmt = gsi_stmt (si);
292
293 stmt_info = vinfo_for_stmt (stmt);
294
295 if (dump_enabled_p ())
296 {
297 dump_printf_loc (MSG_NOTE, vect_location,
298 "==> examining statement: ");
299 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
300 dump_printf (MSG_NOTE, "\n");
301 }
302
303 gcc_assert (stmt_info);
304
305 /* Skip stmts which do not need to be vectorized. */
306 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
307 && !STMT_VINFO_LIVE_P (stmt_info))
308 || gimple_clobber_p (stmt))
309 {
/* The stmt itself is skipped, but if it is part of a pattern whose
   related stmt is relevant or live, that pattern stmt is analyzed in
   its place during this same iteration.  */
310 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
311 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
312 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
313 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
314 {
315 stmt = pattern_stmt;
316 stmt_info = vinfo_for_stmt (pattern_stmt);
317 if (dump_enabled_p ())
318 {
319 dump_printf_loc (MSG_NOTE, vect_location,
320 "==> examining pattern statement: ");
321 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
322 dump_printf (MSG_NOTE, "\n");
323 }
324 }
325 else
326 {
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
329 gsi_next (&si);
330 continue;
331 }
332 }
/* The stmt is analyzed now; because its pattern stmt is relevant or live
   too, request another pass over the loop body for that pattern stmt.  */
333 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
334 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
335 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
336 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
337 analyze_pattern_stmt = true;
338
339 /* If a pattern statement has def stmts, analyze them too. */
340 if (is_pattern_stmt_p (stmt_info))
341 {
/* First visit of this pattern stmt: start walking its def sequence.
   Later visits: step past the def stmt handled on the previous visit.  */
342 if (pattern_def_seq == NULL)
343 {
344 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
345 pattern_def_si = gsi_start (pattern_def_seq);
346 }
347 else if (!gsi_end_p (pattern_def_si))
348 gsi_next (&pattern_def_si);
349 if (pattern_def_seq != NULL)
350 {
351 gimple pattern_def_stmt = NULL;
352 stmt_vec_info pattern_def_stmt_info = NULL;
353
/* Find the next def stmt that is relevant or live.  */
354 while (!gsi_end_p (pattern_def_si))
355 {
356 pattern_def_stmt = gsi_stmt (pattern_def_si);
357 pattern_def_stmt_info
358 = vinfo_for_stmt (pattern_def_stmt);
359 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
360 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
361 break;
362 gsi_next (&pattern_def_si);
363 }
364
365 if (!gsi_end_p (pattern_def_si))
366 {
367 if (dump_enabled_p ())
368 {
369 dump_printf_loc (MSG_NOTE, vect_location,
370 "==> examining pattern def stmt: ");
371 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
372 pattern_def_stmt, 0);
373 dump_printf (MSG_NOTE, "\n");
374 }
375
376 stmt = pattern_def_stmt;
377 stmt_info = pattern_def_stmt_info;
378 }
379 else
380 {
/* Def sequence exhausted: fall through and analyze the pattern stmt
   itself in this iteration.  */
381 pattern_def_si = gsi_none ();
382 analyze_pattern_stmt = false;
383 }
384 }
385 else
386 analyze_pattern_stmt = false;
387 }
388
389 if (gimple_get_lhs (stmt) == NULL_TREE
390 /* MASK_STORE has no lhs, but is ok. */
391 && (!is_gimple_call (stmt)
392 || !gimple_call_internal_p (stmt)
393 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
394 {
395 if (is_gimple_call (stmt))
396 {
397 /* Ignore calls with no lhs. These must be calls to
398 #pragma omp simd functions, and what vectorization factor
399 it really needs can't be determined until
400 vectorizable_simd_clone_call. */
/* Advance SI only when no pattern stmt or pattern def stmt is still
   pending for this position.  */
401 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
402 {
403 pattern_def_seq = NULL;
404 gsi_next (&si);
405 }
406 continue;
407 }
408 if (dump_enabled_p ())
409 {
410 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
411 "not vectorized: irregular stmt.");
412 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
413 0);
414 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
415 }
416 return false;
417 }
418
/* A stmt whose expression type already has a vector mode is rejected.  */
419 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
420 {
421 if (dump_enabled_p ())
422 {
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "not vectorized: vector stmt in loop:");
425 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
426 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
427 }
428 return false;
429 }
430
431 if (STMT_VINFO_VECTYPE (stmt_info))
432 {
433 /* The only case when a vectype had been already set is for stmts
434 that contain a dataref, or for "pattern-stmts" (stmts
435 generated by the vectorizer to represent/replace a certain
436 idiom). */
437 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
438 || is_pattern_stmt_p (stmt_info)
439 || !gsi_end_p (pattern_def_si));
440 vectype = STMT_VINFO_VECTYPE (stmt_info);
441 }
442 else
443 {
/* No vectype recorded yet: derive one from the lhs type, or from call
   argument 3 for IFN_MASK_STORE (presumably the stored value -- confirm
   against the internal function's argument layout).  */
444 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
445 if (is_gimple_call (stmt)
446 && gimple_call_internal_p (stmt)
447 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
448 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
449 else
450 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
451 if (dump_enabled_p ())
452 {
453 dump_printf_loc (MSG_NOTE, vect_location,
454 "get vectype for scalar type: ");
455 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
456 dump_printf (MSG_NOTE, "\n");
457 }
458 vectype = get_vectype_for_scalar_type (scalar_type);
459 if (!vectype)
460 {
461 if (dump_enabled_p ())
462 {
463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
464 "not vectorized: unsupported "
465 "data-type ");
466 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
467 scalar_type);
468 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
469 }
470 return false;
471 }
472
473 STMT_VINFO_VECTYPE (stmt_info) = vectype;
474
475 if (dump_enabled_p ())
476 {
477 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
478 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
479 dump_printf (MSG_NOTE, "\n");
480 }
481 }
482
483 /* The vectorization factor is according to the smallest
484 scalar type (or the largest vector size, but we only
485 support one vector size per loop). */
486 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
487 &dummy);
488 if (dump_enabled_p ())
489 {
490 dump_printf_loc (MSG_NOTE, vect_location,
491 "get vectype for scalar type: ");
492 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
493 dump_printf (MSG_NOTE, "\n");
494 }
495 vf_vectype = get_vectype_for_scalar_type (scalar_type);
496 if (!vf_vectype)
497 {
498 if (dump_enabled_p ())
499 {
500 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
501 "not vectorized: unsupported data-type ");
502 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
503 scalar_type);
504 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
505 }
506 return false;
507 }
508
/* The stmt's own vector type and the vector type used for VF must have
   the same mode size.  */
509 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
510 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
511 {
512 if (dump_enabled_p ())
513 {
514 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
515 "not vectorized: different sized vector "
516 "types in statement, ");
517 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
518 vectype);
519 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
520 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
521 vf_vectype);
522 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
523 }
524 return false;
525 }
526
527 if (dump_enabled_p ())
528 {
529 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
530 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
531 dump_printf (MSG_NOTE, "\n");
532 }
533
534 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
535 if (dump_enabled_p ())
536 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
/* Keep the running maximum.  */
537 if (!vectorization_factor
538 || (nunits > vectorization_factor))
539 vectorization_factor = nunits;
540
/* Move to the next stmt only when neither the pattern stmt nor any of
   its def stmts is still pending for this position.  */
541 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
542 {
543 pattern_def_seq = NULL;
544 gsi_next (&si);
545 }
546 }
547 }
548
549 /* TODO: Analyze cost. Decide if worth while to vectorize. */
550 if (dump_enabled_p ())
551 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
552 vectorization_factor);
/* A factor of 0 (nothing contributed) or 1 is rejected.  */
553 if (vectorization_factor <= 1)
554 {
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
557 "not vectorized: unsupported data-type\n");
558 return false;
559 }
560 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
561
562 return true;
563 }
564
565
566 /* Function vect_is_simple_iv_evolution.
567
568 FORNOW: A simple evolution of an induction variables in the loop is
569 considered a polynomial evolution. */
570
571 static bool
572 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
573 tree * step)
574 {
575 tree init_expr;
576 tree step_expr;
577 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
578 basic_block bb;
579
580 /* When there is no evolution in this loop, the evolution function
581 is not "simple". */
582 if (evolution_part == NULL_TREE)
583 return false;
584
585 /* When the evolution is a polynomial of degree >= 2
586 the evolution function is not "simple". */
587 if (tree_is_chrec (evolution_part))
588 return false;
589
590 step_expr = evolution_part;
591 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
592
593 if (dump_enabled_p ())
594 {
595 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
596 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
597 dump_printf (MSG_NOTE, ", init: ");
598 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
599 dump_printf (MSG_NOTE, "\n");
600 }
601
602 *init = init_expr;
603 *step = step_expr;
604
605 if (TREE_CODE (step_expr) != INTEGER_CST
606 && (TREE_CODE (step_expr) != SSA_NAME
607 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
608 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
609 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
610 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
611 || !flag_associative_math)))
612 && (TREE_CODE (step_expr) != REAL_CST
613 || !flag_associative_math))
614 {
615 if (dump_enabled_p ())
616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
617 "step unknown.\n");
618 return false;
619 }
620
621 return true;
622 }
623
624 /* Function vect_analyze_scalar_cycles_1.
625
626 Examine the cross iteration def-use cycles of scalar variables
627 in LOOP. LOOP_VINFO represents the loop that is now being
628 considered for vectorization (can be LOOP, or an outer-loop
629 enclosing LOOP). */
630
631 static void
632 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
633 {
634 basic_block bb = loop->header;
635 tree init, step;
636 auto_vec<gimple, 64> worklist;
637 gimple_stmt_iterator gsi;
638 bool double_reduc;
639
640 if (dump_enabled_p ())
641 dump_printf_loc (MSG_NOTE, vect_location,
642 "=== vect_analyze_scalar_cycles ===\n");
643
644 /* First - identify all inductions. Reduction detection assumes that all the
645 inductions have been identified, therefore, this order must not be
646 changed. */
647 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
648 {
649 gimple phi = gsi_stmt (gsi);
650 tree access_fn = NULL;
651 tree def = PHI_RESULT (phi);
652 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
653
654 if (dump_enabled_p ())
655 {
656 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
657 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
658 dump_printf (MSG_NOTE, "\n");
659 }
660
661 /* Skip virtual phi's. The data dependences that are associated with
662 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
663 if (virtual_operand_p (def))
664 continue;
665
666 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
667
668 /* Analyze the evolution function. */
669 access_fn = analyze_scalar_evolution (loop, def);
670 if (access_fn)
671 {
672 STRIP_NOPS (access_fn);
673 if (dump_enabled_p ())
674 {
675 dump_printf_loc (MSG_NOTE, vect_location,
676 "Access function of PHI: ");
677 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
678 dump_printf (MSG_NOTE, "\n");
679 }
680 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
681 = evolution_part_in_loop_num (access_fn, loop->num);
682 }
683
684 if (!access_fn
685 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
686 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
687 && TREE_CODE (step) != INTEGER_CST))
688 {
689 worklist.safe_push (phi);
690 continue;
691 }
692
693 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
694
695 if (dump_enabled_p ())
696 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
698 }
699
700
701 /* Second - identify all reductions and nested cycles. */
702 while (worklist.length () > 0)
703 {
704 gimple phi = worklist.pop ();
705 tree def = PHI_RESULT (phi);
706 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
707 gimple reduc_stmt;
708 bool nested_cycle;
709
710 if (dump_enabled_p ())
711 {
712 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
713 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
714 dump_printf (MSG_NOTE, "\n");
715 }
716
717 gcc_assert (!virtual_operand_p (def)
718 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
719
720 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
721 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
722 &double_reduc);
723 if (reduc_stmt)
724 {
725 if (double_reduc)
726 {
727 if (dump_enabled_p ())
728 dump_printf_loc (MSG_NOTE, vect_location,
729 "Detected double reduction.\n");
730
731 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
732 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
733 vect_double_reduction_def;
734 }
735 else
736 {
737 if (nested_cycle)
738 {
739 if (dump_enabled_p ())
740 dump_printf_loc (MSG_NOTE, vect_location,
741 "Detected vectorizable nested cycle.\n");
742
743 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
744 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
745 vect_nested_cycle;
746 }
747 else
748 {
749 if (dump_enabled_p ())
750 dump_printf_loc (MSG_NOTE, vect_location,
751 "Detected reduction.\n");
752
753 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
754 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
755 vect_reduction_def;
756 /* Store the reduction cycles for possible vectorization in
757 loop-aware SLP. */
758 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
759 }
760 }
761 }
762 else
763 if (dump_enabled_p ())
764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
765 "Unknown def-use cycle pattern.\n");
766 }
767 }
768
769
770 /* Function vect_analyze_scalar_cycles.
771
772 Examine the cross iteration def-use cycles of scalar variables, by
773 analyzing the loop-header PHIs of scalar variables. Classify each
774 cycle as one of the following: invariant, induction, reduction, unknown.
775 We do that for the loop represented by LOOP_VINFO, and also to its
776 inner-loop, if exists.
777 Examples for scalar cycles:
778
779 Example1: reduction:
780
781 loop1:
782 for (i=0; i<N; i++)
783 sum += a[i];
784
785 Example2: induction:
786
787 loop2:
788 for (i=0; i<N; i++)
789 a[i] = i; */
790
791 static void
792 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
793 {
794 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
795
796 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
797
798 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
799 Reductions in such inner-loop therefore have different properties than
800 the reductions in the nest that gets vectorized:
801 1. When vectorized, they are executed in the same order as in the original
802 scalar loop, so we can't change the order of computation when
803 vectorizing them.
804 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
805 current checks are too strict. */
806
807 if (loop->inner)
808 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
809 }
810
811
812 /* Function vect_get_loop_niters.
813
814 Determine how many iterations the loop is executed and place it
815 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
816 in NUMBER_OF_ITERATIONSM1.
817
818 Return the loop exit condition. */
819
820 static gimple
821 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
822 tree *number_of_iterationsm1)
823 {
824 tree niters;
825
826 if (dump_enabled_p ())
827 dump_printf_loc (MSG_NOTE, vect_location,
828 "=== get_loop_niters ===\n");
829
830 niters = number_of_latch_executions (loop);
831 *number_of_iterationsm1 = niters;
832
833 /* We want the number of loop header executions which is the number
834 of latch executions plus one.
835 ??? For UINT_MAX latch executions this number overflows to zero
836 for loops like do { n++; } while (n != 0); */
837 if (niters && !chrec_contains_undetermined (niters))
838 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
839 build_int_cst (TREE_TYPE (niters), 1));
840 *number_of_iterations = niters;
841
842 return get_loop_exit_condition (loop);
843 }
844
845
846 /* Function bb_in_loop_p
847
848 Used as predicate for dfs order traversal of the loop bbs. */
849
850 static bool
851 bb_in_loop_p (const_basic_block bb, const void *data)
852 {
853 const struct loop *const loop = (const struct loop *)data;
854 if (flow_bb_inside_loop_p (loop, bb))
855 return true;
856 return false;
857 }
858
859
860 /* Function new_loop_vec_info.
861
862 Create and initialize a new loop_vec_info struct for LOOP, as well as
863 stmt_vec_info structs for all the stmts in LOOP. */
864
865 static loop_vec_info
866 new_loop_vec_info (struct loop *loop)
867 {
868 loop_vec_info res;
869 basic_block *bbs;
870 gimple_stmt_iterator si;
871 unsigned int i, nbbs;
872
873 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
874 LOOP_VINFO_LOOP (res) = loop;
875
876 bbs = get_loop_body (loop);
877
878 /* Create/Update stmt_info for all stmts in the loop. */
879 for (i = 0; i < loop->num_nodes; i++)
880 {
881 basic_block bb = bbs[i];
882
883 /* BBs in a nested inner-loop will have been already processed (because
884 we will have called vect_analyze_loop_form for any nested inner-loop).
885 Therefore, for stmts in an inner-loop we just want to update the
886 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
887 loop_info of the outer-loop we are currently considering to vectorize
888 (instead of the loop_info of the inner-loop).
889 For stmts in other BBs we need to create a stmt_info from scratch. */
890 if (bb->loop_father != loop)
891 {
892 /* Inner-loop bb. */
893 gcc_assert (loop->inner && bb->loop_father == loop->inner);
894 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
895 {
896 gimple phi = gsi_stmt (si);
897 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
898 loop_vec_info inner_loop_vinfo =
899 STMT_VINFO_LOOP_VINFO (stmt_info);
900 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
901 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
902 }
903 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
904 {
905 gimple stmt = gsi_stmt (si);
906 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
907 loop_vec_info inner_loop_vinfo =
908 STMT_VINFO_LOOP_VINFO (stmt_info);
909 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
910 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
911 }
912 }
913 else
914 {
915 /* bb in current nest. */
916 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
917 {
918 gimple phi = gsi_stmt (si);
919 gimple_set_uid (phi, 0);
920 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
921 }
922
923 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
924 {
925 gimple stmt = gsi_stmt (si);
926 gimple_set_uid (stmt, 0);
927 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
928 }
929 }
930 }
931
932 /* CHECKME: We want to visit all BBs before their successors (except for
933 latch blocks, for which this assertion wouldn't hold). In the simple
934 case of the loop forms we allow, a dfs order of the BBs would the same
935 as reversed postorder traversal, so we are safe. */
936
937 free (bbs);
938 bbs = XCNEWVEC (basic_block, loop->num_nodes);
939 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
940 bbs, loop->num_nodes, loop);
941 gcc_assert (nbbs == loop->num_nodes);
942
943 LOOP_VINFO_BBS (res) = bbs;
944 LOOP_VINFO_NITERSM1 (res) = NULL;
945 LOOP_VINFO_NITERS (res) = NULL;
946 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
947 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
948 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
949 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
950 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
951 LOOP_VINFO_VECT_FACTOR (res) = 0;
952 LOOP_VINFO_LOOP_NEST (res).create (3);
953 LOOP_VINFO_DATAREFS (res).create (10);
954 LOOP_VINFO_DDRS (res).create (10 * 10);
955 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
956 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
957 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
958 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
959 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
960 LOOP_VINFO_GROUPED_STORES (res).create (10);
961 LOOP_VINFO_REDUCTIONS (res).create (10);
962 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
963 LOOP_VINFO_SLP_INSTANCES (res).create (10);
964 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
965 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
966 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
967 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
968 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
969
970 return res;
971 }
972
973
974 /* Function destroy_loop_vec_info.
975
976 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
977 stmts in the loop. */
978
979 void
980 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
981 {
982 struct loop *loop;
983 basic_block *bbs;
984 int nbbs;
985 gimple_stmt_iterator si;
986 int j;
987 vec<slp_instance> slp_instances;
988 slp_instance instance;
989 bool swapped;
990
991 if (!loop_vinfo)
992 return;
993
994 loop = LOOP_VINFO_LOOP (loop_vinfo);
995
996 bbs = LOOP_VINFO_BBS (loop_vinfo);
997 nbbs = clean_stmts ? loop->num_nodes : 0;
998 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
999
1000 for (j = 0; j < nbbs; j++)
1001 {
1002 basic_block bb = bbs[j];
1003 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1004 free_stmt_vec_info (gsi_stmt (si));
1005
1006 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1007 {
1008 gimple stmt = gsi_stmt (si);
1009
1010 /* We may have broken canonical form by moving a constant
1011 into RHS1 of a commutative op. Fix such occurrences. */
1012 if (swapped && is_gimple_assign (stmt))
1013 {
1014 enum tree_code code = gimple_assign_rhs_code (stmt);
1015
1016 if ((code == PLUS_EXPR
1017 || code == POINTER_PLUS_EXPR
1018 || code == MULT_EXPR)
1019 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1020 swap_ssa_operands (stmt,
1021 gimple_assign_rhs1_ptr (stmt),
1022 gimple_assign_rhs2_ptr (stmt));
1023 }
1024
1025 /* Free stmt_vec_info. */
1026 free_stmt_vec_info (stmt);
1027 gsi_next (&si);
1028 }
1029 }
1030
1031 free (LOOP_VINFO_BBS (loop_vinfo));
1032 vect_destroy_datarefs (loop_vinfo, NULL);
1033 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1034 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1035 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1036 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1037 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1038 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1039 vect_free_slp_instance (instance);
1040
1041 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1042 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1043 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1044 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1045
1046 delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1047 LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1048
1049 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1050
1051 free (loop_vinfo);
1052 loop->aux = NULL;
1053 }
1054
1055
1056 /* Function vect_analyze_loop_1.
1057
1058 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1059 for it. The different analyses will record information in the
1060 loop_vec_info struct. This is a subset of the analyses applied in
1061 vect_analyze_loop, to be applied on an inner-loop nested in the loop
1062 that is now considered for (outer-loop) vectorization. */
1063
1064 static loop_vec_info
1065 vect_analyze_loop_1 (struct loop *loop)
1066 {
1067 loop_vec_info loop_vinfo;
1068
1069 if (dump_enabled_p ())
1070 dump_printf_loc (MSG_NOTE, vect_location,
1071 "===== analyze_loop_nest_1 =====\n");
1072
1073 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc. */
1074
1075 loop_vinfo = vect_analyze_loop_form (loop);
1076 if (!loop_vinfo)
1077 {
1078 if (dump_enabled_p ())
1079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1080 "bad inner-loop form.\n");
1081 return NULL;
1082 }
1083
1084 return loop_vinfo;
1085 }
1086
1087
1088 /* Function vect_analyze_loop_form.
1089
1090 Verify that certain CFG restrictions hold, including:
1091 - the loop has a pre-header
1092 - the loop has a single entry and exit
1093 - the loop exit condition is simple enough, and the number of iterations
1094 can be analyzed (a countable loop). */
1095
1096 loop_vec_info
1097 vect_analyze_loop_form (struct loop *loop)
1098 {
1099 loop_vec_info loop_vinfo;
1100 gimple loop_cond;
1101 tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1102 loop_vec_info inner_loop_vinfo = NULL;
1103
1104 if (dump_enabled_p ())
1105 dump_printf_loc (MSG_NOTE, vect_location,
1106 "=== vect_analyze_loop_form ===\n");
1107
1108 /* Different restrictions apply when we are considering an inner-most loop,
1109 vs. an outer (nested) loop.
1110 (FORNOW. May want to relax some of these restrictions in the future). */
1111
1112 if (!loop->inner)
1113 {
1114 /* Inner-most loop. We currently require that the number of BBs is
1115 exactly 2 (the header and latch). Vectorizable inner-most loops
1116 look like this:
1117
1118 (pre-header)
1119 |
1120 header <--------+
1121 | | |
1122 | +--> latch --+
1123 |
1124 (exit-bb) */
1125
1126 if (loop->num_nodes != 2)
1127 {
1128 if (dump_enabled_p ())
1129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1130 "not vectorized: control flow in loop.\n");
1131 return NULL;
1132 }
1133
1134 if (empty_block_p (loop->header))
1135 {
1136 if (dump_enabled_p ())
1137 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1138 "not vectorized: empty loop.\n");
1139 return NULL;
1140 }
1141 }
1142 else
1143 {
1144 struct loop *innerloop = loop->inner;
1145 edge entryedge;
1146
1147 /* Nested loop. We currently require that the loop is doubly-nested,
1148 contains a single inner loop, and the number of BBs is exactly 5.
1149 Vectorizable outer-loops look like this:
1150
1151 (pre-header)
1152 |
1153 header <---+
1154 | |
1155 inner-loop |
1156 | |
1157 tail ------+
1158 |
1159 (exit-bb)
1160
1161 The inner-loop has the properties expected of inner-most loops
1162 as described above. */
1163
1164 if ((loop->inner)->inner || (loop->inner)->next)
1165 {
1166 if (dump_enabled_p ())
1167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1168 "not vectorized: multiple nested loops.\n");
1169 return NULL;
1170 }
1171
1172 /* Analyze the inner-loop. */
1173 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1174 if (!inner_loop_vinfo)
1175 {
1176 if (dump_enabled_p ())
1177 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1178 "not vectorized: Bad inner loop.\n");
1179 return NULL;
1180 }
1181
1182 if (!expr_invariant_in_loop_p (loop,
1183 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1184 {
1185 if (dump_enabled_p ())
1186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1187 "not vectorized: inner-loop count not"
1188 " invariant.\n");
1189 destroy_loop_vec_info (inner_loop_vinfo, true);
1190 return NULL;
1191 }
1192
1193 if (loop->num_nodes != 5)
1194 {
1195 if (dump_enabled_p ())
1196 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1197 "not vectorized: control flow in loop.\n");
1198 destroy_loop_vec_info (inner_loop_vinfo, true);
1199 return NULL;
1200 }
1201
1202 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1203 entryedge = EDGE_PRED (innerloop->header, 0);
1204 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1205 entryedge = EDGE_PRED (innerloop->header, 1);
1206
1207 if (entryedge->src != loop->header
1208 || !single_exit (innerloop)
1209 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1210 {
1211 if (dump_enabled_p ())
1212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1213 "not vectorized: unsupported outerloop form.\n");
1214 destroy_loop_vec_info (inner_loop_vinfo, true);
1215 return NULL;
1216 }
1217
1218 if (dump_enabled_p ())
1219 dump_printf_loc (MSG_NOTE, vect_location,
1220 "Considering outer-loop vectorization.\n");
1221 }
1222
1223 if (!single_exit (loop)
1224 || EDGE_COUNT (loop->header->preds) != 2)
1225 {
1226 if (dump_enabled_p ())
1227 {
1228 if (!single_exit (loop))
1229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1230 "not vectorized: multiple exits.\n");
1231 else if (EDGE_COUNT (loop->header->preds) != 2)
1232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1233 "not vectorized: too many incoming edges.\n");
1234 }
1235 if (inner_loop_vinfo)
1236 destroy_loop_vec_info (inner_loop_vinfo, true);
1237 return NULL;
1238 }
1239
1240 /* We assume that the loop exit condition is at the end of the loop. i.e,
1241 that the loop is represented as a do-while (with a proper if-guard
1242 before the loop if needed), where the loop header contains all the
1243 executable statements, and the latch is empty. */
1244 if (!empty_block_p (loop->latch)
1245 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1246 {
1247 if (dump_enabled_p ())
1248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249 "not vectorized: latch block not empty.\n");
1250 if (inner_loop_vinfo)
1251 destroy_loop_vec_info (inner_loop_vinfo, true);
1252 return NULL;
1253 }
1254
1255 /* Make sure there exists a single-predecessor exit bb: */
1256 if (!single_pred_p (single_exit (loop)->dest))
1257 {
1258 edge e = single_exit (loop);
1259 if (!(e->flags & EDGE_ABNORMAL))
1260 {
1261 split_loop_exit_edge (e);
1262 if (dump_enabled_p ())
1263 dump_printf (MSG_NOTE, "split exit edge.\n");
1264 }
1265 else
1266 {
1267 if (dump_enabled_p ())
1268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1269 "not vectorized: abnormal loop exit edge.\n");
1270 if (inner_loop_vinfo)
1271 destroy_loop_vec_info (inner_loop_vinfo, true);
1272 return NULL;
1273 }
1274 }
1275
1276 loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1277 &number_of_iterationsm1);
1278 if (!loop_cond)
1279 {
1280 if (dump_enabled_p ())
1281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1282 "not vectorized: complicated exit condition.\n");
1283 if (inner_loop_vinfo)
1284 destroy_loop_vec_info (inner_loop_vinfo, true);
1285 return NULL;
1286 }
1287
1288 if (!number_of_iterations
1289 || chrec_contains_undetermined (number_of_iterations))
1290 {
1291 if (dump_enabled_p ())
1292 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1293 "not vectorized: number of iterations cannot be "
1294 "computed.\n");
1295 if (inner_loop_vinfo)
1296 destroy_loop_vec_info (inner_loop_vinfo, true);
1297 return NULL;
1298 }
1299
1300 if (integer_zerop (number_of_iterations))
1301 {
1302 if (dump_enabled_p ())
1303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1304 "not vectorized: number of iterations = 0.\n");
1305 if (inner_loop_vinfo)
1306 destroy_loop_vec_info (inner_loop_vinfo, true);
1307 return NULL;
1308 }
1309
1310 loop_vinfo = new_loop_vec_info (loop);
1311 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1312 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1313 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1314
1315 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1316 {
1317 if (dump_enabled_p ())
1318 {
1319 dump_printf_loc (MSG_NOTE, vect_location,
1320 "Symbolic number of iterations is ");
1321 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1322 dump_printf (MSG_NOTE, "\n");
1323 }
1324 }
1325
1326 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1327
1328 /* CHECKME: May want to keep it around it in the future. */
1329 if (inner_loop_vinfo)
1330 destroy_loop_vec_info (inner_loop_vinfo, false);
1331
1332 gcc_assert (!loop->aux);
1333 loop->aux = loop_vinfo;
1334 return loop_vinfo;
1335 }
1336
1337
1338 /* Function vect_analyze_loop_operations.
1339
1340 Scan the loop stmts and make sure they are all vectorizable.

   LOOP_VINFO must already carry a non-zero vectorization factor.  When SLP
   is true, that factor is first recomputed from the SLP unrolling factor and
   stored back into LOOP_VINFO.  Every PHI and every non-clobber statement
   of the loop is then checked, and finally the iteration count is compared
   against the vectorization factor and the cost-model thresholds.  The
   minimum profitable iteration count and the resulting threshold are
   recorded in LOOP_VINFO.

   Return true if the loop passed all of these checks, false otherwise.  */
1341
1342 static bool
1343 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1344 {
1345 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1346 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1347 int nbbs = loop->num_nodes;
1348 gimple_stmt_iterator si;
1349 unsigned int vectorization_factor = 0;
1350 int i;
1351 gimple phi;
1352 stmt_vec_info stmt_info;
1353 bool need_to_vectorize = false;
1354 int min_profitable_iters;
1355 int min_scalar_loop_bound;
1356 unsigned int th;
1357 bool only_slp_in_loop = true, ok;
1358 HOST_WIDE_INT max_niter;
1359 HOST_WIDE_INT estimated_niter;
1360 int min_profitable_estimate;
1361
1362 if (dump_enabled_p ())
1363 dump_printf_loc (MSG_NOTE, vect_location,
1364 "=== vect_analyze_loop_operations ===\n");
1365
1366 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1367 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1368 if (slp)
1369 {
1370 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1371 vectorization factor of the loop is the unrolling factor required by
1372 the SLP instances. If that unrolling factor is 1, we say, that we
1373 perform pure SLP on loop - cross iteration parallelism is not
1374 exploited. */
1375 for (i = 0; i < nbbs; i++)
1376 {
1377 basic_block bb = bbs[i];
1378 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1379 {
1380 gimple stmt = gsi_stmt (si);
1381 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1382 gcc_assert (stmt_info);
1383 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1384 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1385 && !PURE_SLP_STMT (stmt_info))
1386 /* STMT needs both SLP and loop-based vectorization. */
1387 only_slp_in_loop = false;
1388 }
1389 }
1390
/* Pure SLP uses the SLP unrolling factor alone; otherwise the factor is
   the least common multiple of the loop factor and the SLP unrolling
   factor.  */
1391 if (only_slp_in_loop)
1392 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1393 else
1394 vectorization_factor = least_common_multiple (vectorization_factor,
1395 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1396
1397 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1398 if (dump_enabled_p ())
1399 dump_printf_loc (MSG_NOTE, vect_location,
1400 "Updating vectorization factor to %d\n",
1401 vectorization_factor);
1402 }
1403
/* Examine the PHIs and then the statements of every basic block.  */
1404 for (i = 0; i < nbbs; i++)
1405 {
1406 basic_block bb = bbs[i];
1407
1408 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1409 {
1410 phi = gsi_stmt (si);
1411 ok = true;
1412
1413 stmt_info = vinfo_for_stmt (phi);
1414 if (dump_enabled_p ())
1415 {
1416 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1417 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1418 dump_printf (MSG_NOTE, "\n");
1419 }
1420
1421 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1422 (i.e., a phi in the tail of the outer-loop). */
1423 if (! is_loop_header_bb_p (bb))
1424 {
1425 /* FORNOW: we currently don't support the case that these phis
1426 are not used in the outerloop (unless it is double reduction,
1427 i.e., this phi is vect_reduction_def), cause this case
1428 requires to actually do something here. */
/* NOTE(review): STMT_INFO is used here before the gcc_assert further
   down, which is only reached for header PHIs.  */
1429 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1430 || STMT_VINFO_LIVE_P (stmt_info))
1431 && STMT_VINFO_DEF_TYPE (stmt_info)
1432 != vect_double_reduction_def)
1433 {
1434 if (dump_enabled_p ())
1435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1436 "Unsupported loop-closed phi in "
1437 "outer-loop.\n");
1438 return false;
1439 }
1440
1441 /* If PHI is used in the outer loop, we check that its operand
1442 is defined in the inner loop. */
1443 if (STMT_VINFO_RELEVANT_P (stmt_info))
1444 {
1445 tree phi_op;
1446 gimple op_def_stmt;
1447
1448 if (gimple_phi_num_args (phi) != 1)
1449 return false;
1450
1451 phi_op = PHI_ARG_DEF (phi, 0);
1452 if (TREE_CODE (phi_op) != SSA_NAME)
1453 return false;
1454
1455 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1456 if (gimple_nop_p (op_def_stmt)
1457 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1458 || !vinfo_for_stmt (op_def_stmt))
1459 return false;
1460
/* The defining statement must be marked as used in the outer loop,
   either directly or by a reduction.  */
1461 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1462 != vect_used_in_outer
1463 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1464 != vect_used_in_outer_by_reduction)
1465 return false;
1466 }
1467
/* Nothing more to check for a non-header PHI.  */
1468 continue;
1469 }
1470
1471 gcc_assert (stmt_info);
1472
1473 if (STMT_VINFO_LIVE_P (stmt_info))
1474 {
1475 /* FORNOW: not yet supported. */
1476 if (dump_enabled_p ())
1477 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1478 "not vectorized: value used after loop.\n");
1479 return false;
1480 }
1481
1482 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1483 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1484 {
1485 /* A scalar-dependence cycle that we don't support. */
1486 if (dump_enabled_p ())
1487 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1488 "not vectorized: scalar dependence cycle.\n");
1489 return false;
1490 }
1491
/* A relevant header PHI means there is work to vectorize; only
   induction PHIs are additionally checked here.  */
1492 if (STMT_VINFO_RELEVANT_P (stmt_info))
1493 {
1494 need_to_vectorize = true;
1495 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1496 ok = vectorizable_induction (phi, NULL, NULL);
1497 }
1498
1499 if (!ok)
1500 {
1501 if (dump_enabled_p ())
1502 {
1503 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1504 "not vectorized: relevant phi not "
1505 "supported: ");
1506 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1507 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1508 }
1509 return false;
1510 }
1511 }
1512
/* Clobber statements are skipped; every other statement must pass
   vect_analyze_stmt.  */
1513 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1514 {
1515 gimple stmt = gsi_stmt (si);
1516 if (!gimple_clobber_p (stmt)
1517 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1518 return false;
1519 }
1520 } /* bbs */
1521
1522 /* All operations in the loop are either irrelevant (deal with loop
1523 control, or dead), or only used outside the loop and can be moved
1524 out of the loop (e.g. invariants, inductions). The loop can be
1525 optimized away by scalar optimizations. We're better off not
1526 touching this loop. */
1527 if (!need_to_vectorize)
1528 {
1529 if (dump_enabled_p ())
1530 dump_printf_loc (MSG_NOTE, vect_location,
1531 "All the computation can be taken out of the loop.\n");
1532 if (dump_enabled_p ())
1533 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1534 "not vectorized: redundant loop. no profit to "
1535 "vectorize.\n");
1536 return false;
1537 }
1538
1539 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1540 dump_printf_loc (MSG_NOTE, vect_location,
1541 "vectorization_factor = %d, niters = "
1542 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1543 LOOP_VINFO_INT_NITERS (loop_vinfo));
1544
/* Reject the loop when the known iteration count, or the value returned
   by max_stmt_executions_int (if it is not -1), is smaller than the
   vectorization factor.  */
1545 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1546 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1547 || ((max_niter = max_stmt_executions_int (loop)) != -1
1548 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1549 {
1550 if (dump_enabled_p ())
1551 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1552 "not vectorized: iteration count too small.\n");
1553 if (dump_enabled_p ())
1554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1555 "not vectorized: iteration count smaller than "
1556 "vectorization factor.\n");
1557 return false;
1558 }
1559
1560 /* Analyze cost. Decide if worth while to vectorize. */
1561
1562 /* Once VF is set, SLP costs should be updated since the number of created
1563 vector stmts depends on VF. */
1564 vect_update_slp_costs_according_to_vf (loop_vinfo);
1565
1566 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1567 &min_profitable_estimate);
1568 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1569
/* A negative result is treated as "never profitable".  */
1570 if (min_profitable_iters < 0)
1571 {
1572 if (dump_enabled_p ())
1573 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1574 "not vectorized: vectorization not profitable.\n");
1575 if (dump_enabled_p ())
1576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1577 "not vectorized: vector version will never be "
1578 "profitable.\n");
1579 return false;
1580 }
1581
/* User-specified bound: the min-vect-loop-bound parameter scaled by the
   vectorization factor, minus one.  */
1582 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1583 * vectorization_factor) - 1);
1584
1585
1586 /* Use the cost model only if it is more conservative than user specified
1587 threshold. */
1588
1589 th = (unsigned) min_scalar_loop_bound;
1590 if (min_profitable_iters
1591 && (!min_scalar_loop_bound
1592 || min_profitable_iters > min_scalar_loop_bound))
1593 th = (unsigned) min_profitable_iters;
1594
1595 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1596
/* With a compile-time iteration count, the loop must iterate more than
   TH times.  */
1597 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1598 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1599 {
1600 if (dump_enabled_p ())
1601 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1602 "not vectorized: vectorization not profitable.\n");
1603 if (dump_enabled_p ())
1604 dump_printf_loc (MSG_NOTE, vect_location,
1605 "not vectorized: iteration count smaller than user "
1606 "specified loop bound parameter or minimum profitable "
1607 "iterations (whichever is more conservative).\n");
1608 return false;
1609 }
1610
/* The same test against the value of estimated_stmt_executions_int, when
   it is not -1, using the larger of TH and the profitability estimate.  */
1611 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1612 && ((unsigned HOST_WIDE_INT) estimated_niter
1613 <= MAX (th, (unsigned)min_profitable_estimate)))
1614 {
1615 if (dump_enabled_p ())
1616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1617 "not vectorized: estimated iteration count too "
1618 "small.\n");
1619 if (dump_enabled_p ())
1620 dump_printf_loc (MSG_NOTE, vect_location,
1621 "not vectorized: estimated iteration count smaller "
1622 "than specified loop bound parameter or minimum "
1623 "profitable iterations (whichever is more "
1624 "conservative).\n");
1625 return false;
1626 }
1627
1628 return true;
1629 }
1630
1631
1632 /* Function vect_analyze_loop_2.
1633
1634 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1635 for it. The different analyses will record information in the
1636 loop_vec_info struct. */
1637 static bool
1638 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1639 {
1640 bool ok, slp = false;
1641 int max_vf = MAX_VECTORIZATION_FACTOR;
1642 int min_vf = 2;
1643 unsigned int th;
1644 unsigned int n_stmts = 0;
1645
1646 /* Find all data references in the loop (which correspond to vdefs/vuses)
1647 and analyze their evolution in the loop. Also adjust the minimal
1648 vectorization factor according to the loads and stores.
1649
1650 FORNOW: Handle only simple, array references, which
1651 alignment can be forced, and aligned pointer-references. */
1652
1653 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1654 if (!ok)
1655 {
1656 if (dump_enabled_p ())
1657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1658 "bad data references.\n");
1659 return false;
1660 }
1661
1662 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1663 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1664
1665 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1666 if (!ok)
1667 {
1668 if (dump_enabled_p ())
1669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1670 "bad data access.\n");
1671 return false;
1672 }
1673
1674 /* Classify all cross-iteration scalar data-flow cycles.
1675 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1676
1677 vect_analyze_scalar_cycles (loop_vinfo);
1678
1679 vect_pattern_recog (loop_vinfo, NULL);
1680
1681 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1682
1683 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1684 if (!ok)
1685 {
1686 if (dump_enabled_p ())
1687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1688 "unexpected pattern.\n");
1689 return false;
1690 }
1691
1692 /* Analyze data dependences between the data-refs in the loop
1693 and adjust the maximum vectorization factor according to
1694 the dependences.
1695 FORNOW: fail at the first data dependence that we encounter. */
1696
1697 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1698 if (!ok
1699 || max_vf < min_vf)
1700 {
1701 if (dump_enabled_p ())
1702 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1703 "bad data dependence.\n");
1704 return false;
1705 }
1706
1707 ok = vect_determine_vectorization_factor (loop_vinfo);
1708 if (!ok)
1709 {
1710 if (dump_enabled_p ())
1711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1712 "can't determine vectorization factor.\n");
1713 return false;
1714 }
1715 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1716 {
1717 if (dump_enabled_p ())
1718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1719 "bad data dependence.\n");
1720 return false;
1721 }
1722
1723 /* Analyze the alignment of the data-refs in the loop.
1724 Fail if a data reference is found that cannot be vectorized. */
1725
1726 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1727 if (!ok)
1728 {
1729 if (dump_enabled_p ())
1730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1731 "bad data alignment.\n");
1732 return false;
1733 }
1734
1735 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1736 It is important to call pruning after vect_analyze_data_ref_accesses,
1737 since we use grouping information gathered by interleaving analysis. */
1738 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1739 if (!ok)
1740 {
1741 if (dump_enabled_p ())
1742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1743 "number of versioning for alias "
1744 "run-time tests exceeds %d "
1745 "(--param vect-max-version-for-alias-checks)\n",
1746 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1747 return false;
1748 }
1749
1750 /* This pass will decide on using loop versioning and/or loop peeling in
1751 order to enhance the alignment of data references in the loop. */
1752
1753 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1754 if (!ok)
1755 {
1756 if (dump_enabled_p ())
1757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1758 "bad data alignment.\n");
1759 return false;
1760 }
1761
1762 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1763 ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1764 if (ok)
1765 {
1766 /* Decide which possible SLP instances to SLP. */
1767 slp = vect_make_slp_decision (loop_vinfo);
1768
1769 /* Find stmts that need to be both vectorized and SLPed. */
1770 vect_detect_hybrid_slp (loop_vinfo);
1771 }
1772 else
1773 return false;
1774
1775 /* Scan all the operations in the loop and make sure they are
1776 vectorizable. */
1777
1778 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1779 if (!ok)
1780 {
1781 if (dump_enabled_p ())
1782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1783 "bad operation or unsupported loop bound.\n");
1784 return false;
1785 }
1786
1787 /* Decide whether we need to create an epilogue loop to handle
1788 remaining scalar iterations. */
1789 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1790 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1791 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1792
1793 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1794 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1795 {
1796 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1797 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1798 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1799 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1800 }
1801 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1802 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1803 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1804 /* In case of versioning, check if the maximum number of
1805 iterations is greater than th. If they are identical,
1806 the epilogue is unnecessary. */
1807 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1808 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1809 || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1810 (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1811 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1812
1813 /* If an epilogue loop is required make sure we can create one. */
1814 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1815 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1816 {
1817 if (dump_enabled_p ())
1818 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1819 if (!vect_can_advance_ivs_p (loop_vinfo)
1820 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1821 single_exit (LOOP_VINFO_LOOP
1822 (loop_vinfo))))
1823 {
1824 if (dump_enabled_p ())
1825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1826 "not vectorized: can't create required "
1827 "epilog loop\n");
1828 return false;
1829 }
1830 }
1831
1832 return true;
1833 }
1834
1835 /* Function vect_analyze_loop.
1836
1837 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1838 for it. The different analyses will record information in the
1839 loop_vec_info struct. */
1840 loop_vec_info
1841 vect_analyze_loop (struct loop *loop)
1842 {
1843 loop_vec_info loop_vinfo;
1844 unsigned int vector_sizes;
1845
1846 /* Autodetect first vector size we try. */
1847 current_vector_size = 0;
1848 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1849
1850 if (dump_enabled_p ())
1851 dump_printf_loc (MSG_NOTE, vect_location,
1852 "===== analyze_loop_nest =====\n");
1853
1854 if (loop_outer (loop)
1855 && loop_vec_info_for_loop (loop_outer (loop))
1856 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1857 {
1858 if (dump_enabled_p ())
1859 dump_printf_loc (MSG_NOTE, vect_location,
1860 "outer-loop already vectorized.\n");
1861 return NULL;
1862 }
1863
1864 while (1)
1865 {
1866 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1867 loop_vinfo = vect_analyze_loop_form (loop);
1868 if (!loop_vinfo)
1869 {
1870 if (dump_enabled_p ())
1871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1872 "bad loop form.\n");
1873 return NULL;
1874 }
1875
1876 if (vect_analyze_loop_2 (loop_vinfo))
1877 {
1878 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1879
1880 return loop_vinfo;
1881 }
1882
1883 destroy_loop_vec_info (loop_vinfo, true);
1884
1885 vector_sizes &= ~current_vector_size;
1886 if (vector_sizes == 0
1887 || current_vector_size == 0)
1888 return NULL;
1889
1890 /* Try the next biggest vector size. */
1891 current_vector_size = 1 << floor_log2 (vector_sizes);
1892 if (dump_enabled_p ())
1893 dump_printf_loc (MSG_NOTE, vect_location,
1894 "***** Re-trying analysis with "
1895 "vector size %d\n", current_vector_size);
1896 }
1897 }
1898
1899
1900 /* Function reduction_code_for_scalar_code
1901
1902 Input:
1903 CODE - tree_code of a reduction operations.
1904
1905 Output:
1906 REDUC_CODE - the corresponding tree-code to be used to reduce the
1907 vector of partial results into a single scalar result, or ERROR_MARK
1908 if the operation is a supported reduction operation, but does not have
1909 such a tree-code.
1910
1911 Return FALSE if CODE currently cannot be vectorized as reduction. */
1912
1913 static bool
1914 reduction_code_for_scalar_code (enum tree_code code,
1915 enum tree_code *reduc_code)
1916 {
1917 switch (code)
1918 {
1919 case MAX_EXPR:
1920 *reduc_code = REDUC_MAX_EXPR;
1921 return true;
1922
1923 case MIN_EXPR:
1924 *reduc_code = REDUC_MIN_EXPR;
1925 return true;
1926
1927 case PLUS_EXPR:
1928 *reduc_code = REDUC_PLUS_EXPR;
1929 return true;
1930
1931 case MULT_EXPR:
1932 case MINUS_EXPR:
1933 case BIT_IOR_EXPR:
1934 case BIT_XOR_EXPR:
1935 case BIT_AND_EXPR:
1936 *reduc_code = ERROR_MARK;
1937 return true;
1938
1939 default:
1940 return false;
1941 }
1942 }
1943
1944
1945 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1946 STMT is printed with a message MSG. */
1947
1948 static void
1949 report_vect_op (int msg_type, gimple stmt, const char *msg)
1950 {
1951 dump_printf_loc (msg_type, vect_location, "%s", msg);
1952 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1953 dump_printf (msg_type, "\n");
1954 }
1955
1956
1957 /* Detect SLP reduction of the form:
1958
1959 #a1 = phi <a5, a0>
1960 a2 = operation (a1)
1961 a3 = operation (a2)
1962 a4 = operation (a3)
1963 a5 = operation (a4)
1964
1965 #a = phi <a5>
1966
1967 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1968 FIRST_STMT is the first reduction stmt in the chain
1969 (a2 = operation (a1)).
1970
1971 Return TRUE if a reduction chain was detected. */
1972
1973 static bool
1974 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1975 {
1976 struct loop *loop = (gimple_bb (phi))->loop_father;
1977 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1978 enum tree_code code;
1979 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1980 stmt_vec_info use_stmt_info, current_stmt_info;
1981 tree lhs;
1982 imm_use_iterator imm_iter;
1983 use_operand_p use_p;
1984 int nloop_uses, size = 0, n_out_of_loop_uses;
1985 bool found = false;
1986
1987 if (loop != vect_loop)
1988 return false;
1989
1990 lhs = PHI_RESULT (phi);
1991 code = gimple_assign_rhs_code (first_stmt);
1992 while (1)
1993 {
1994 nloop_uses = 0;
1995 n_out_of_loop_uses = 0;
1996 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1997 {
1998 gimple use_stmt = USE_STMT (use_p);
1999 if (is_gimple_debug (use_stmt))
2000 continue;
2001
2002 /* Check if we got back to the reduction phi. */
2003 if (use_stmt == phi)
2004 {
2005 loop_use_stmt = use_stmt;
2006 found = true;
2007 break;
2008 }
2009
2010 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2011 {
2012 if (vinfo_for_stmt (use_stmt)
2013 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
2014 {
2015 loop_use_stmt = use_stmt;
2016 nloop_uses++;
2017 }
2018 }
2019 else
2020 n_out_of_loop_uses++;
2021
2022 /* There are can be either a single use in the loop or two uses in
2023 phi nodes. */
2024 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2025 return false;
2026 }
2027
2028 if (found)
2029 break;
2030
2031 /* We reached a statement with no loop uses. */
2032 if (nloop_uses == 0)
2033 return false;
2034
2035 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2036 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2037 return false;
2038
2039 if (!is_gimple_assign (loop_use_stmt)
2040 || code != gimple_assign_rhs_code (loop_use_stmt)
2041 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2042 return false;
2043
2044 /* Insert USE_STMT into reduction chain. */
2045 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2046 if (current_stmt)
2047 {
2048 current_stmt_info = vinfo_for_stmt (current_stmt);
2049 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2050 GROUP_FIRST_ELEMENT (use_stmt_info)
2051 = GROUP_FIRST_ELEMENT (current_stmt_info);
2052 }
2053 else
2054 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2055
2056 lhs = gimple_assign_lhs (loop_use_stmt);
2057 current_stmt = loop_use_stmt;
2058 size++;
2059 }
2060
2061 if (!found || loop_use_stmt != phi || size < 2)
2062 return false;
2063
2064 /* Swap the operands, if needed, to make the reduction operand be the second
2065 operand. */
2066 lhs = PHI_RESULT (phi);
2067 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2068 while (next_stmt)
2069 {
2070 if (gimple_assign_rhs2 (next_stmt) == lhs)
2071 {
2072 tree op = gimple_assign_rhs1 (next_stmt);
2073 gimple def_stmt = NULL;
2074
2075 if (TREE_CODE (op) == SSA_NAME)
2076 def_stmt = SSA_NAME_DEF_STMT (op);
2077
2078 /* Check that the other def is either defined in the loop
2079 ("vect_internal_def"), or it's an induction (defined by a
2080 loop-header phi-node). */
2081 if (def_stmt
2082 && gimple_bb (def_stmt)
2083 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2084 && (is_gimple_assign (def_stmt)
2085 || is_gimple_call (def_stmt)
2086 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2087 == vect_induction_def
2088 || (gimple_code (def_stmt) == GIMPLE_PHI
2089 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2090 == vect_internal_def
2091 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2092 {
2093 lhs = gimple_assign_lhs (next_stmt);
2094 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2095 continue;
2096 }
2097
2098 return false;
2099 }
2100 else
2101 {
2102 tree op = gimple_assign_rhs2 (next_stmt);
2103 gimple def_stmt = NULL;
2104
2105 if (TREE_CODE (op) == SSA_NAME)
2106 def_stmt = SSA_NAME_DEF_STMT (op);
2107
2108 /* Check that the other def is either defined in the loop
2109 ("vect_internal_def"), or it's an induction (defined by a
2110 loop-header phi-node). */
2111 if (def_stmt
2112 && gimple_bb (def_stmt)
2113 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2114 && (is_gimple_assign (def_stmt)
2115 || is_gimple_call (def_stmt)
2116 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2117 == vect_induction_def
2118 || (gimple_code (def_stmt) == GIMPLE_PHI
2119 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2120 == vect_internal_def
2121 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2122 {
2123 if (dump_enabled_p ())
2124 {
2125 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2126 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2127 dump_printf (MSG_NOTE, "\n");
2128 }
2129
2130 swap_ssa_operands (next_stmt,
2131 gimple_assign_rhs1_ptr (next_stmt),
2132 gimple_assign_rhs2_ptr (next_stmt));
2133 update_stmt (next_stmt);
2134
2135 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2136 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2137 }
2138 else
2139 return false;
2140 }
2141
2142 lhs = gimple_assign_lhs (next_stmt);
2143 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2144 }
2145
2146 /* Save the chain for further analysis in SLP detection. */
2147 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2148 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2149 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2150
2151 return true;
2152 }
2153
2154
2155 /* Function vect_is_simple_reduction_1
2156
2157 (1) Detect a cross-iteration def-use cycle that represents a simple
2158 reduction computation. We look for the following pattern:
2159
2160 loop_header:
2161 a1 = phi < a0, a2 >
2162 a3 = ...
2163 a2 = operation (a3, a1)
2164
2165 or
2166
2167 a3 = ...
2168 loop_header:
2169 a1 = phi < a0, a2 >
2170 a2 = operation (a3, a1)
2171
2172 such that:
2173 1. operation is commutative and associative and it is safe to
2174 change the order of the computation (if CHECK_REDUCTION is true)
2175 2. no uses for a2 in the loop (a2 is used out of the loop)
2176 3. no uses of a1 in the loop besides the reduction operation
2177 4. no uses of a1 outside the loop.
2178
2179 Conditions 1,4 are tested here.
2180 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2181
2182 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2183 nested cycles, if CHECK_REDUCTION is false.
2184
2185 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2186 reductions:
2187
2188 a1 = phi < a0, a2 >
2189 inner loop (def of a3)
2190 a2 = phi < a3 >
2191
2192 If MODIFY is true it tries also to rework the code in-place to enable
2193 detection of more reduction patterns. For the time being we rewrite
2194 "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2195 */
2196
2197 static gimple
2198 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2199 bool check_reduction, bool *double_reduc,
2200 bool modify)
2201 {
2202 struct loop *loop = (gimple_bb (phi))->loop_father;
2203 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2204 edge latch_e = loop_latch_edge (loop);
2205 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2206 gimple def_stmt, def1 = NULL, def2 = NULL;
2207 enum tree_code orig_code, code;
2208 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2209 tree type;
2210 int nloop_uses;
2211 tree name;
2212 imm_use_iterator imm_iter;
2213 use_operand_p use_p;
2214 bool phi_def;
2215
2216 *double_reduc = false;
2217
2218 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2219 otherwise, we assume outer loop vectorization. */
2220 gcc_assert ((check_reduction && loop == vect_loop)
2221 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2222
2223 name = PHI_RESULT (phi);
2224 /* ??? If there are no uses of the PHI result the inner loop reduction
2225 won't be detected as possibly double-reduction by vectorizable_reduction
2226 because that tries to walk the PHI arg from the preheader edge which
2227 can be constant. See PR60382. */
2228 if (has_zero_uses (name))
2229 return NULL;
2230 nloop_uses = 0;
2231 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2232 {
2233 gimple use_stmt = USE_STMT (use_p);
2234 if (is_gimple_debug (use_stmt))
2235 continue;
2236
2237 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2238 {
2239 if (dump_enabled_p ())
2240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2241 "intermediate value used outside loop.\n");
2242
2243 return NULL;
2244 }
2245
2246 if (vinfo_for_stmt (use_stmt)
2247 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2248 nloop_uses++;
2249 if (nloop_uses > 1)
2250 {
2251 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2253 "reduction used in loop.\n");
2254 return NULL;
2255 }
2256 }
2257
2258 if (TREE_CODE (loop_arg) != SSA_NAME)
2259 {
2260 if (dump_enabled_p ())
2261 {
2262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2263 "reduction: not ssa_name: ");
2264 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2265 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2266 }
2267 return NULL;
2268 }
2269
2270 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2271 if (!def_stmt)
2272 {
2273 if (dump_enabled_p ())
2274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2275 "reduction: no def_stmt.\n");
2276 return NULL;
2277 }
2278
2279 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2280 {
2281 if (dump_enabled_p ())
2282 {
2283 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2284 dump_printf (MSG_NOTE, "\n");
2285 }
2286 return NULL;
2287 }
2288
2289 if (is_gimple_assign (def_stmt))
2290 {
2291 name = gimple_assign_lhs (def_stmt);
2292 phi_def = false;
2293 }
2294 else
2295 {
2296 name = PHI_RESULT (def_stmt);
2297 phi_def = true;
2298 }
2299
2300 nloop_uses = 0;
2301 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2302 {
2303 gimple use_stmt = USE_STMT (use_p);
2304 if (is_gimple_debug (use_stmt))
2305 continue;
2306 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2307 && vinfo_for_stmt (use_stmt)
2308 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2309 nloop_uses++;
2310 if (nloop_uses > 1)
2311 {
2312 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2314 "reduction used in loop.\n");
2315 return NULL;
2316 }
2317 }
2318
2319 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2320 defined in the inner loop. */
2321 if (phi_def)
2322 {
2323 op1 = PHI_ARG_DEF (def_stmt, 0);
2324
2325 if (gimple_phi_num_args (def_stmt) != 1
2326 || TREE_CODE (op1) != SSA_NAME)
2327 {
2328 if (dump_enabled_p ())
2329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2330 "unsupported phi node definition.\n");
2331
2332 return NULL;
2333 }
2334
2335 def1 = SSA_NAME_DEF_STMT (op1);
2336 if (gimple_bb (def1)
2337 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2338 && loop->inner
2339 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2340 && is_gimple_assign (def1))
2341 {
2342 if (dump_enabled_p ())
2343 report_vect_op (MSG_NOTE, def_stmt,
2344 "detected double reduction: ");
2345
2346 *double_reduc = true;
2347 return def_stmt;
2348 }
2349
2350 return NULL;
2351 }
2352
2353 code = orig_code = gimple_assign_rhs_code (def_stmt);
2354
2355 /* We can handle "res -= x[i]", which is non-associative by
2356 simply rewriting this into "res += -x[i]". Avoid changing
2357 gimple instruction for the first simple tests and only do this
2358 if we're allowed to change code at all. */
2359 if (code == MINUS_EXPR
2360 && modify
2361 && (op1 = gimple_assign_rhs1 (def_stmt))
2362 && TREE_CODE (op1) == SSA_NAME
2363 && SSA_NAME_DEF_STMT (op1) == phi)
2364 code = PLUS_EXPR;
2365
2366 if (check_reduction
2367 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2368 {
2369 if (dump_enabled_p ())
2370 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2371 "reduction: not commutative/associative: ");
2372 return NULL;
2373 }
2374
2375 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2376 {
2377 if (code != COND_EXPR)
2378 {
2379 if (dump_enabled_p ())
2380 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2381 "reduction: not binary operation: ");
2382
2383 return NULL;
2384 }
2385
2386 op3 = gimple_assign_rhs1 (def_stmt);
2387 if (COMPARISON_CLASS_P (op3))
2388 {
2389 op4 = TREE_OPERAND (op3, 1);
2390 op3 = TREE_OPERAND (op3, 0);
2391 }
2392
2393 op1 = gimple_assign_rhs2 (def_stmt);
2394 op2 = gimple_assign_rhs3 (def_stmt);
2395
2396 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2397 {
2398 if (dump_enabled_p ())
2399 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2400 "reduction: uses not ssa_names: ");
2401
2402 return NULL;
2403 }
2404 }
2405 else
2406 {
2407 op1 = gimple_assign_rhs1 (def_stmt);
2408 op2 = gimple_assign_rhs2 (def_stmt);
2409
2410 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2411 {
2412 if (dump_enabled_p ())
2413 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2414 "reduction: uses not ssa_names: ");
2415
2416 return NULL;
2417 }
2418 }
2419
2420 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2421 if ((TREE_CODE (op1) == SSA_NAME
2422 && !types_compatible_p (type,TREE_TYPE (op1)))
2423 || (TREE_CODE (op2) == SSA_NAME
2424 && !types_compatible_p (type, TREE_TYPE (op2)))
2425 || (op3 && TREE_CODE (op3) == SSA_NAME
2426 && !types_compatible_p (type, TREE_TYPE (op3)))
2427 || (op4 && TREE_CODE (op4) == SSA_NAME
2428 && !types_compatible_p (type, TREE_TYPE (op4))))
2429 {
2430 if (dump_enabled_p ())
2431 {
2432 dump_printf_loc (MSG_NOTE, vect_location,
2433 "reduction: multiple types: operation type: ");
2434 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2435 dump_printf (MSG_NOTE, ", operands types: ");
2436 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2437 TREE_TYPE (op1));
2438 dump_printf (MSG_NOTE, ",");
2439 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2440 TREE_TYPE (op2));
2441 if (op3)
2442 {
2443 dump_printf (MSG_NOTE, ",");
2444 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2445 TREE_TYPE (op3));
2446 }
2447
2448 if (op4)
2449 {
2450 dump_printf (MSG_NOTE, ",");
2451 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2452 TREE_TYPE (op4));
2453 }
2454 dump_printf (MSG_NOTE, "\n");
2455 }
2456
2457 return NULL;
2458 }
2459
2460 /* Check that it's ok to change the order of the computation.
2461 Generally, when vectorizing a reduction we change the order of the
2462 computation. This may change the behavior of the program in some
2463 cases, so we need to check that this is ok. One exception is when
2464 vectorizing an outer-loop: the inner-loop is executed sequentially,
2465 and therefore vectorizing reductions in the inner-loop during
2466 outer-loop vectorization is safe. */
2467
2468 /* CHECKME: check for !flag_finite_math_only too? */
2469 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2470 && check_reduction)
2471 {
2472 /* Changing the order of operations changes the semantics. */
2473 if (dump_enabled_p ())
2474 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2475 "reduction: unsafe fp math optimization: ");
2476 return NULL;
2477 }
2478 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2479 && check_reduction)
2480 {
2481 /* Changing the order of operations changes the semantics. */
2482 if (dump_enabled_p ())
2483 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2484 "reduction: unsafe int math optimization: ");
2485 return NULL;
2486 }
2487 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2488 {
2489 /* Changing the order of operations changes the semantics. */
2490 if (dump_enabled_p ())
2491 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2492 "reduction: unsafe fixed-point math optimization: ");
2493 return NULL;
2494 }
2495
2496 /* If we detected "res -= x[i]" earlier, rewrite it into
2497 "res += -x[i]" now. If this turns out to be useless reassoc
2498 will clean it up again. */
2499 if (orig_code == MINUS_EXPR)
2500 {
2501 tree rhs = gimple_assign_rhs2 (def_stmt);
2502 tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2503 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2504 rhs, NULL);
2505 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2506 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2507 loop_info, NULL));
2508 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2509 gimple_assign_set_rhs2 (def_stmt, negrhs);
2510 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2511 update_stmt (def_stmt);
2512 }
2513
2514 /* Reduction is safe. We're dealing with one of the following:
2515 1) integer arithmetic and no trapv
2516 2) floating point arithmetic, and special flags permit this optimization
2517 3) nested cycle (i.e., outer loop vectorization). */
2518 if (TREE_CODE (op1) == SSA_NAME)
2519 def1 = SSA_NAME_DEF_STMT (op1);
2520
2521 if (TREE_CODE (op2) == SSA_NAME)
2522 def2 = SSA_NAME_DEF_STMT (op2);
2523
2524 if (code != COND_EXPR
2525 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2526 {
2527 if (dump_enabled_p ())
2528 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2529 return NULL;
2530 }
2531
2532 /* Check that one def is the reduction def, defined by PHI,
2533 the other def is either defined in the loop ("vect_internal_def"),
2534 or it's an induction (defined by a loop-header phi-node). */
2535
2536 if (def2 && def2 == phi
2537 && (code == COND_EXPR
2538 || !def1 || gimple_nop_p (def1)
2539 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2540 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2541 && (is_gimple_assign (def1)
2542 || is_gimple_call (def1)
2543 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2544 == vect_induction_def
2545 || (gimple_code (def1) == GIMPLE_PHI
2546 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2547 == vect_internal_def
2548 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2549 {
2550 if (dump_enabled_p ())
2551 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2552 return def_stmt;
2553 }
2554
2555 if (def1 && def1 == phi
2556 && (code == COND_EXPR
2557 || !def2 || gimple_nop_p (def2)
2558 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2559 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2560 && (is_gimple_assign (def2)
2561 || is_gimple_call (def2)
2562 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2563 == vect_induction_def
2564 || (gimple_code (def2) == GIMPLE_PHI
2565 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2566 == vect_internal_def
2567 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2568 {
2569 if (check_reduction)
2570 {
2571 /* Swap operands (just for simplicity - so that the rest of the code
2572 can assume that the reduction variable is always the last (second)
2573 argument). */
2574 if (dump_enabled_p ())
2575 report_vect_op (MSG_NOTE, def_stmt,
2576 "detected reduction: need to swap operands: ");
2577
2578 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2579 gimple_assign_rhs2_ptr (def_stmt));
2580
2581 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2582 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2583 }
2584 else
2585 {
2586 if (dump_enabled_p ())
2587 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2588 }
2589
2590 return def_stmt;
2591 }
2592
2593 /* Try to find SLP reduction chain. */
2594 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2595 {
2596 if (dump_enabled_p ())
2597 report_vect_op (MSG_NOTE, def_stmt,
2598 "reduction: detected reduction chain: ");
2599
2600 return def_stmt;
2601 }
2602
2603 if (dump_enabled_p ())
2604 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2605 "reduction: unknown pattern: ");
2606
2607 return NULL;
2608 }
2609
2610 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2611 in-place. Arguments as there. */
2612
2613 static gimple
2614 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2615 bool check_reduction, bool *double_reduc)
2616 {
2617 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2618 double_reduc, false);
2619 }
2620
2621 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2622 in-place if it enables detection of more reductions. Arguments
2623 as there. */
2624
2625 gimple
2626 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2627 bool check_reduction, bool *double_reduc)
2628 {
2629 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2630 double_reduc, true);
2631 }
2632
2633 /* Calculate the cost of one scalar iteration of the loop. */
2634 int
2635 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2636 {
2637 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2638 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2639 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2640 int innerloop_iters, i, stmt_cost;
2641
2642 /* Count statements in scalar loop. Using this as scalar cost for a single
2643 iteration for now.
2644
2645 TODO: Add outer loop support.
2646
2647 TODO: Consider assigning different costs to different scalar
2648 statements. */
2649
2650 /* FORNOW. */
2651 innerloop_iters = 1;
2652 if (loop->inner)
2653 innerloop_iters = 50; /* FIXME */
2654
2655 for (i = 0; i < nbbs; i++)
2656 {
2657 gimple_stmt_iterator si;
2658 basic_block bb = bbs[i];
2659
2660 if (bb->loop_father == loop->inner)
2661 factor = innerloop_iters;
2662 else
2663 factor = 1;
2664
2665 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2666 {
2667 gimple stmt = gsi_stmt (si);
2668 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2669
2670 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2671 continue;
2672
2673 /* Skip stmts that are not vectorized inside the loop. */
2674 if (stmt_info
2675 && !STMT_VINFO_RELEVANT_P (stmt_info)
2676 && (!STMT_VINFO_LIVE_P (stmt_info)
2677 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2678 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2679 continue;
2680
2681 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2682 {
2683 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2684 stmt_cost = vect_get_stmt_cost (scalar_load);
2685 else
2686 stmt_cost = vect_get_stmt_cost (scalar_store);
2687 }
2688 else
2689 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2690
2691 scalar_single_iter_cost += stmt_cost * factor;
2692 }
2693 }
2694 return scalar_single_iter_cost;
2695 }
2696
2697 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2698 int
2699 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2700 int *peel_iters_epilogue,
2701 int scalar_single_iter_cost,
2702 stmt_vector_for_cost *prologue_cost_vec,
2703 stmt_vector_for_cost *epilogue_cost_vec)
2704 {
2705 int retval = 0;
2706 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2707
2708 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2709 {
2710 *peel_iters_epilogue = vf/2;
2711 if (dump_enabled_p ())
2712 dump_printf_loc (MSG_NOTE, vect_location,
2713 "cost model: epilogue peel iters set to vf/2 "
2714 "because loop iterations are unknown .\n");
2715
2716 /* If peeled iterations are known but number of scalar loop
2717 iterations are unknown, count a taken branch per peeled loop. */
2718 retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2719 NULL, 0, vect_prologue);
2720 }
2721 else
2722 {
2723 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2724 peel_iters_prologue = niters < peel_iters_prologue ?
2725 niters : peel_iters_prologue;
2726 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2727 /* If we need to peel for gaps, but no peeling is required, we have to
2728 peel VF iterations. */
2729 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2730 *peel_iters_epilogue = vf;
2731 }
2732
2733 if (peel_iters_prologue)
2734 retval += record_stmt_cost (prologue_cost_vec,
2735 peel_iters_prologue * scalar_single_iter_cost,
2736 scalar_stmt, NULL, 0, vect_prologue);
2737 if (*peel_iters_epilogue)
2738 retval += record_stmt_cost (epilogue_cost_vec,
2739 *peel_iters_epilogue * scalar_single_iter_cost,
2740 scalar_stmt, NULL, 0, vect_epilogue);
2741 return retval;
2742 }
2743
2744 /* Function vect_estimate_min_profitable_iters
2745
2746 Return the number of iterations required for the vector version of the
2747 loop to be profitable relative to the cost of the scalar version of the
2748 loop. */
2749
2750 static void
2751 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2752 int *ret_min_profitable_niters,
2753 int *ret_min_profitable_estimate)
2754 {
2755 int min_profitable_iters;
2756 int min_profitable_estimate;
2757 int peel_iters_prologue;
2758 int peel_iters_epilogue;
2759 unsigned vec_inside_cost = 0;
2760 int vec_outside_cost = 0;
2761 unsigned vec_prologue_cost = 0;
2762 unsigned vec_epilogue_cost = 0;
2763 int scalar_single_iter_cost = 0;
2764 int scalar_outside_cost = 0;
2765 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2766 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2767 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2768
2769 /* Cost model disabled. */
2770 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2771 {
2772 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2773 *ret_min_profitable_niters = 0;
2774 *ret_min_profitable_estimate = 0;
2775 return;
2776 }
2777
2778 /* Requires loop versioning tests to handle misalignment. */
2779 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2780 {
2781 /* FIXME: Make cost depend on complexity of individual check. */
2782 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2783 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2784 vect_prologue);
2785 dump_printf (MSG_NOTE,
2786 "cost model: Adding cost of checks for loop "
2787 "versioning to treat misalignment.\n");
2788 }
2789
2790 /* Requires loop versioning with alias checks. */
2791 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2792 {
2793 /* FIXME: Make cost depend on complexity of individual check. */
2794 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2795 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2796 vect_prologue);
2797 dump_printf (MSG_NOTE,
2798 "cost model: Adding cost of checks for loop "
2799 "versioning aliasing.\n");
2800 }
2801
2802 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2803 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2804 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2805 vect_prologue);
2806
2807 /* Count statements in scalar loop. Using this as scalar cost for a single
2808 iteration for now.
2809
2810 TODO: Add outer loop support.
2811
2812 TODO: Consider assigning different costs to different scalar
2813 statements. */
2814
2815 scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2816
2817 /* Add additional cost for the peeled instructions in prologue and epilogue
2818 loop.
2819
2820 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2821 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2822
2823 TODO: Build an expression that represents peel_iters for prologue and
2824 epilogue to be used in a run-time test. */
2825
2826 if (npeel < 0)
2827 {
2828 peel_iters_prologue = vf/2;
2829 dump_printf (MSG_NOTE, "cost model: "
2830 "prologue peel iters set to vf/2.\n");
2831
2832 /* If peeling for alignment is unknown, loop bound of main loop becomes
2833 unknown. */
2834 peel_iters_epilogue = vf/2;
2835 dump_printf (MSG_NOTE, "cost model: "
2836 "epilogue peel iters set to vf/2 because "
2837 "peeling for alignment is unknown.\n");
2838
2839 /* If peeled iterations are unknown, count a taken branch and a not taken
2840 branch per peeled loop. Even if scalar loop iterations are known,
2841 vector iterations are not known since peeled prologue iterations are
2842 not known. Hence guards remain the same. */
2843 (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2844 NULL, 0, vect_prologue);
2845 (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2846 NULL, 0, vect_prologue);
2847 /* FORNOW: Don't attempt to pass individual scalar instructions to
2848 the model; just assume linear cost for scalar iterations. */
2849 (void) add_stmt_cost (target_cost_data,
2850 peel_iters_prologue * scalar_single_iter_cost,
2851 scalar_stmt, NULL, 0, vect_prologue);
2852 (void) add_stmt_cost (target_cost_data,
2853 peel_iters_epilogue * scalar_single_iter_cost,
2854 scalar_stmt, NULL, 0, vect_epilogue);
2855 }
2856 else
2857 {
2858 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2859 stmt_info_for_cost *si;
2860 int j;
2861 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2862
2863 prologue_cost_vec.create (2);
2864 epilogue_cost_vec.create (2);
2865 peel_iters_prologue = npeel;
2866
2867 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2868 &peel_iters_epilogue,
2869 scalar_single_iter_cost,
2870 &prologue_cost_vec,
2871 &epilogue_cost_vec);
2872
2873 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2874 {
2875 struct _stmt_vec_info *stmt_info
2876 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2877 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2878 si->misalign, vect_prologue);
2879 }
2880
2881 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2882 {
2883 struct _stmt_vec_info *stmt_info
2884 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2885 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2886 si->misalign, vect_epilogue);
2887 }
2888
2889 prologue_cost_vec.release ();
2890 epilogue_cost_vec.release ();
2891 }
2892
2893 /* FORNOW: The scalar outside cost is incremented in one of the
2894 following ways:
2895
2896 1. The vectorizer checks for alignment and aliasing and generates
2897 a condition that allows dynamic vectorization. A cost model
2898 check is ANDED with the versioning condition. Hence scalar code
2899 path now has the added cost of the versioning check.
2900
2901 if (cost > th & versioning_check)
2902 jmp to vector code
2903
2904 Hence run-time scalar is incremented by not-taken branch cost.
2905
2906 2. The vectorizer then checks if a prologue is required. If the
2907 cost model check was not done before during versioning, it has to
2908 be done before the prologue check.
2909
2910 if (cost <= th)
2911 prologue = scalar_iters
2912 if (prologue == 0)
2913 jmp to vector code
2914 else
2915 execute prologue
2916 if (prologue == num_iters)
2917 go to exit
2918
2919 Hence the run-time scalar cost is incremented by a taken branch,
2920 plus a not-taken branch, plus a taken branch cost.
2921
2922 3. The vectorizer then checks if an epilogue is required. If the
2923 cost model check was not done before during prologue check, it
2924 has to be done with the epilogue check.
2925
2926 if (prologue == 0)
2927 jmp to vector code
2928 else
2929 execute prologue
2930 if (prologue == num_iters)
2931 go to exit
2932 vector code:
2933 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2934 jmp to epilogue
2935
2936 Hence the run-time scalar cost should be incremented by 2 taken
2937 branches.
2938
2939 TODO: The back end may reorder the BBS's differently and reverse
2940 conditions/branch directions. Change the estimates below to
2941 something more reasonable. */
2942
2943 /* If the number of iterations is known and we do not do versioning, we can
2944 decide whether to vectorize at compile time. Hence the scalar version
2945 do not carry cost model guard costs. */
2946 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2947 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2948 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2949 {
2950 /* Cost model check occurs at versioning. */
2951 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2952 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2953 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2954 else
2955 {
2956 /* Cost model check occurs at prologue generation. */
2957 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2958 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2959 + vect_get_stmt_cost (cond_branch_not_taken);
2960 /* Cost model check occurs at epilogue generation. */
2961 else
2962 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2963 }
2964 }
2965
2966 /* Complete the target-specific cost calculations. */
2967 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2968 &vec_inside_cost, &vec_epilogue_cost);
2969
2970 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2971
2972 /* Calculate number of iterations required to make the vector version
2973 profitable, relative to the loop bodies only. The following condition
2974 must hold true:
2975 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2976 where
2977 SIC = scalar iteration cost, VIC = vector iteration cost,
2978 VOC = vector outside cost, VF = vectorization factor,
2979 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2980 SOC = scalar outside cost for run time cost model check. */
2981
2982 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2983 {
2984 if (vec_outside_cost <= 0)
2985 min_profitable_iters = 1;
2986 else
2987 {
2988 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2989 - vec_inside_cost * peel_iters_prologue
2990 - vec_inside_cost * peel_iters_epilogue)
2991 / ((scalar_single_iter_cost * vf)
2992 - vec_inside_cost);
2993
2994 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2995 <= (((int) vec_inside_cost * min_profitable_iters)
2996 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2997 min_profitable_iters++;
2998 }
2999 }
3000 /* vector version will never be profitable. */
3001 else
3002 {
3003 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3004 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3005 "did not happen for a simd loop");
3006
3007 if (dump_enabled_p ())
3008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3009 "cost model: the vector iteration cost = %d "
3010 "divided by the scalar iteration cost = %d "
3011 "is greater or equal to the vectorization factor = %d"
3012 ".\n",
3013 vec_inside_cost, scalar_single_iter_cost, vf);
3014 *ret_min_profitable_niters = -1;
3015 *ret_min_profitable_estimate = -1;
3016 return;
3017 }
3018
3019 if (dump_enabled_p ())
3020 {
3021 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3022 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3023 vec_inside_cost);
3024 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3025 vec_prologue_cost);
3026 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3027 vec_epilogue_cost);
3028 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3029 scalar_single_iter_cost);
3030 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3031 scalar_outside_cost);
3032 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3033 vec_outside_cost);
3034 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3035 peel_iters_prologue);
3036 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3037 peel_iters_epilogue);
3038 dump_printf (MSG_NOTE,
3039 " Calculated minimum iters for profitability: %d\n",
3040 min_profitable_iters);
3041 dump_printf (MSG_NOTE, "\n");
3042 }
3043
3044 min_profitable_iters =
3045 min_profitable_iters < vf ? vf : min_profitable_iters;
3046
3047 /* Because the condition we create is:
3048 if (niters <= min_profitable_iters)
3049 then skip the vectorized loop. */
3050 min_profitable_iters--;
3051
3052 if (dump_enabled_p ())
3053 dump_printf_loc (MSG_NOTE, vect_location,
3054 " Runtime profitability threshold = %d\n",
3055 min_profitable_iters);
3056
3057 *ret_min_profitable_niters = min_profitable_iters;
3058
3059 /* Calculate number of iterations required to make the vector version
3060 profitable, relative to the loop bodies only.
3061
3062 Non-vectorized variant is SIC * niters and it must win over vector
3063 variant on the expected loop trip count. The following condition must hold true:
3064 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3065
3066 if (vec_outside_cost <= 0)
3067 min_profitable_estimate = 1;
3068 else
3069 {
3070 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3071 - vec_inside_cost * peel_iters_prologue
3072 - vec_inside_cost * peel_iters_epilogue)
3073 / ((scalar_single_iter_cost * vf)
3074 - vec_inside_cost);
3075 }
3076 min_profitable_estimate --;
3077 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3078 if (dump_enabled_p ())
3079 dump_printf_loc (MSG_NOTE, vect_location,
3080 " Static estimate profitability threshold = %d\n",
3081 min_profitable_iters);
3082
3083 *ret_min_profitable_estimate = min_profitable_estimate;
3084 }
3085
3086 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3087 vector elements (not bits) for a vector of mode MODE. */
3088 static void
3089 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3090 unsigned char *sel)
3091 {
3092 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3093
3094 for (i = 0; i < nelt; i++)
3095 sel[i] = (i + offset) & (2*nelt - 1);
3096 }
3097
3098 /* Checks whether the target supports whole-vector shifts for vectors of mode
3099 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3100 it supports vec_perm_const with masks for all necessary shift amounts. */
3101 static bool
3102 have_whole_vector_shift (enum machine_mode mode)
3103 {
3104 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3105 return true;
3106
3107 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3108 return false;
3109
3110 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3111 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3112
3113 for (i = nelt/2; i >= 1; i/=2)
3114 {
3115 calc_vec_perm_mask_for_shift (mode, i, sel);
3116 if (!can_vec_perm_p (mode, false, sel))
3117 return false;
3118 }
3119 return true;
3120 }
3121
3122 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3123 functions. Design better to avoid maintenance issues. */
3124
3125 /* Function vect_model_reduction_cost.
3126
3127 Models cost for a reduction operation, including the vector ops
3128 generated within the strip-mine loop, the initial definition before
3129 the loop, and the epilogue code that must be generated. */
3130
3131 static bool
3132 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3133 int ncopies)
3134 {
3135 int prologue_cost = 0, epilogue_cost = 0;
3136 enum tree_code code;
3137 optab optab;
3138 tree vectype;
3139 gimple stmt, orig_stmt;
3140 tree reduction_op;
3141 machine_mode mode;
3142 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3143 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3144 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3145
3146 /* Cost of reduction op inside loop. */
3147 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3148 stmt_info, 0, vect_body);
3149 stmt = STMT_VINFO_STMT (stmt_info);
3150
3151 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3152 {
3153 case GIMPLE_SINGLE_RHS:
3154 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
3155 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
3156 break;
3157 case GIMPLE_UNARY_RHS:
3158 reduction_op = gimple_assign_rhs1 (stmt);
3159 break;
3160 case GIMPLE_BINARY_RHS:
3161 reduction_op = gimple_assign_rhs2 (stmt);
3162 break;
3163 case GIMPLE_TERNARY_RHS:
3164 reduction_op = gimple_assign_rhs3 (stmt);
3165 break;
3166 default:
3167 gcc_unreachable ();
3168 }
3169
3170 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3171 if (!vectype)
3172 {
3173 if (dump_enabled_p ())
3174 {
3175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3176 "unsupported data-type ");
3177 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3178 TREE_TYPE (reduction_op));
3179 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3180 }
3181 return false;
3182 }
3183
3184 mode = TYPE_MODE (vectype);
3185 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3186
3187 if (!orig_stmt)
3188 orig_stmt = STMT_VINFO_STMT (stmt_info);
3189
3190 code = gimple_assign_rhs_code (orig_stmt);
3191
3192 /* Add in cost for initial definition. */
3193 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3194 stmt_info, 0, vect_prologue);
3195
3196 /* Determine cost of epilogue code.
3197
3198 We have a reduction operator that will reduce the vector in one statement.
3199 Also requires scalar extract. */
3200
3201 if (!nested_in_vect_loop_p (loop, orig_stmt))
3202 {
3203 if (reduc_code != ERROR_MARK)
3204 {
3205 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3206 stmt_info, 0, vect_epilogue);
3207 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3208 stmt_info, 0, vect_epilogue);
3209 }
3210 else
3211 {
3212 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3213 tree bitsize =
3214 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3215 int element_bitsize = tree_to_uhwi (bitsize);
3216 int nelements = vec_size_in_bits / element_bitsize;
3217
3218 optab = optab_for_tree_code (code, vectype, optab_default);
3219
3220 /* We have a whole vector shift available. */
3221 if (VECTOR_MODE_P (mode)
3222 && optab_handler (optab, mode) != CODE_FOR_nothing
3223 && have_whole_vector_shift (mode))
3224 {
3225 /* Final reduction via vector shifts and the reduction operator.
3226 Also requires scalar extract. */
3227 epilogue_cost += add_stmt_cost (target_cost_data,
3228 exact_log2 (nelements) * 2,
3229 vector_stmt, stmt_info, 0,
3230 vect_epilogue);
3231 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3232 vec_to_scalar, stmt_info, 0,
3233 vect_epilogue);
3234 }
3235 else
3236 /* Use extracts and reduction op for final reduction. For N
3237 elements, we have N extracts and N-1 reduction ops. */
3238 epilogue_cost += add_stmt_cost (target_cost_data,
3239 nelements + nelements - 1,
3240 vector_stmt, stmt_info, 0,
3241 vect_epilogue);
3242 }
3243 }
3244
3245 if (dump_enabled_p ())
3246 dump_printf (MSG_NOTE,
3247 "vect_model_reduction_cost: inside_cost = %d, "
3248 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3249 prologue_cost, epilogue_cost);
3250
3251 return true;
3252 }
3253
3254
3255 /* Function vect_model_induction_cost.
3256
3257 Models cost for induction operations. */
3258
3259 static void
3260 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3261 {
3262 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3263 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3264 unsigned inside_cost, prologue_cost;
3265
3266 /* loop cost for vec_loop. */
3267 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3268 stmt_info, 0, vect_body);
3269
3270 /* prologue cost for vec_init and vec_step. */
3271 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3272 stmt_info, 0, vect_prologue);
3273
3274 if (dump_enabled_p ())
3275 dump_printf_loc (MSG_NOTE, vect_location,
3276 "vect_model_induction_cost: inside_cost = %d, "
3277 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3278 }
3279
3280
3281 /* Function get_initial_def_for_induction
3282
3283 Input:
3284 STMT - a stmt that performs an induction operation in the loop.
3285 IV_PHI - the initial value of the induction variable
3286
3287 Output:
3288 Return a vector variable, initialized with the first VF values of
3289 the induction variable. E.g., for an iv with IV_PHI='X' and
3290 evolution S, for a vector of 4 units, we want to return:
3291 [X, X + S, X + 2*S, X + 3*S]. */
3292
3293 static tree
3294 get_initial_def_for_induction (gimple iv_phi)
3295 {
3296 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3297 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3298 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3299 tree vectype;
3300 int nunits;
3301 edge pe = loop_preheader_edge (loop);
3302 struct loop *iv_loop;
3303 basic_block new_bb;
3304 tree new_vec, vec_init, vec_step, t;
3305 tree new_var;
3306 tree new_name;
3307 gimple init_stmt, induction_phi, new_stmt;
3308 tree induc_def, vec_def, vec_dest;
3309 tree init_expr, step_expr;
3310 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3311 int i;
3312 int ncopies;
3313 tree expr;
3314 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3315 bool nested_in_vect_loop = false;
3316 gimple_seq stmts = NULL;
3317 imm_use_iterator imm_iter;
3318 use_operand_p use_p;
3319 gimple exit_phi;
3320 edge latch_e;
3321 tree loop_arg;
3322 gimple_stmt_iterator si;
3323 basic_block bb = gimple_bb (iv_phi);
3324 tree stepvectype;
3325 tree resvectype;
3326
3327 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3328 if (nested_in_vect_loop_p (loop, iv_phi))
3329 {
3330 nested_in_vect_loop = true;
3331 iv_loop = loop->inner;
3332 }
3333 else
3334 iv_loop = loop;
3335 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3336
3337 latch_e = loop_latch_edge (iv_loop);
3338 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3339
3340 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3341 gcc_assert (step_expr != NULL_TREE);
3342
3343 pe = loop_preheader_edge (iv_loop);
3344 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3345 loop_preheader_edge (iv_loop));
3346
3347 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3348 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3349 gcc_assert (vectype);
3350 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3351 ncopies = vf / nunits;
3352
3353 gcc_assert (phi_info);
3354 gcc_assert (ncopies >= 1);
3355
3356 /* Convert the step to the desired type. */
3357 step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3358 step_expr),
3359 &stmts, true, NULL_TREE);
3360 if (stmts)
3361 {
3362 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3363 gcc_assert (!new_bb);
3364 }
3365
3366 /* Find the first insertion point in the BB. */
3367 si = gsi_after_labels (bb);
3368
3369 /* Create the vector that holds the initial_value of the induction. */
3370 if (nested_in_vect_loop)
3371 {
3372 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3373 been created during vectorization of previous stmts. We obtain it
3374 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3375 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3376 /* If the initial value is not of proper type, convert it. */
3377 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3378 {
3379 new_stmt = gimple_build_assign_with_ops
3380 (VIEW_CONVERT_EXPR,
3381 vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3382 build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3383 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3384 gimple_assign_set_lhs (new_stmt, vec_init);
3385 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3386 new_stmt);
3387 gcc_assert (!new_bb);
3388 set_vinfo_for_stmt (new_stmt,
3389 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3390 }
3391 }
3392 else
3393 {
3394 vec<constructor_elt, va_gc> *v;
3395
3396 /* iv_loop is the loop to be vectorized. Create:
3397 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3398 new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3399 vect_scalar_var, "var_");
3400 new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3401 init_expr),
3402 &stmts, false, new_var);
3403 if (stmts)
3404 {
3405 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3406 gcc_assert (!new_bb);
3407 }
3408
3409 vec_alloc (v, nunits);
3410 bool constant_p = is_gimple_min_invariant (new_name);
3411 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3412 for (i = 1; i < nunits; i++)
3413 {
3414 /* Create: new_name_i = new_name + step_expr */
3415 new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3416 new_name, step_expr);
3417 if (!is_gimple_min_invariant (new_name))
3418 {
3419 init_stmt = gimple_build_assign (new_var, new_name);
3420 new_name = make_ssa_name (new_var, init_stmt);
3421 gimple_assign_set_lhs (init_stmt, new_name);
3422 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3423 gcc_assert (!new_bb);
3424 if (dump_enabled_p ())
3425 {
3426 dump_printf_loc (MSG_NOTE, vect_location,
3427 "created new init_stmt: ");
3428 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3429 dump_printf (MSG_NOTE, "\n");
3430 }
3431 constant_p = false;
3432 }
3433 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3434 }
3435 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3436 if (constant_p)
3437 new_vec = build_vector_from_ctor (vectype, v);
3438 else
3439 new_vec = build_constructor (vectype, v);
3440 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3441 }
3442
3443
3444 /* Create the vector that holds the step of the induction. */
3445 if (nested_in_vect_loop)
3446 /* iv_loop is nested in the loop to be vectorized. Generate:
3447 vec_step = [S, S, S, S] */
3448 new_name = step_expr;
3449 else
3450 {
3451 /* iv_loop is the loop to be vectorized. Generate:
3452 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3453 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3454 {
3455 expr = build_int_cst (integer_type_node, vf);
3456 expr = fold_convert (TREE_TYPE (step_expr), expr);
3457 }
3458 else
3459 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3460 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3461 expr, step_expr);
3462 if (TREE_CODE (step_expr) == SSA_NAME)
3463 new_name = vect_init_vector (iv_phi, new_name,
3464 TREE_TYPE (step_expr), NULL);
3465 }
3466
3467 t = unshare_expr (new_name);
3468 gcc_assert (CONSTANT_CLASS_P (new_name)
3469 || TREE_CODE (new_name) == SSA_NAME);
3470 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3471 gcc_assert (stepvectype);
3472 new_vec = build_vector_from_val (stepvectype, t);
3473 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3474
3475
3476 /* Create the following def-use cycle:
3477 loop prolog:
3478 vec_init = ...
3479 vec_step = ...
3480 loop:
3481 vec_iv = PHI <vec_init, vec_loop>
3482 ...
3483 STMT
3484 ...
3485 vec_loop = vec_iv + vec_step; */
3486
3487 /* Create the induction-phi that defines the induction-operand. */
3488 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3489 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3490 set_vinfo_for_stmt (induction_phi,
3491 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3492 induc_def = PHI_RESULT (induction_phi);
3493
3494 /* Create the iv update inside the loop */
3495 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3496 induc_def, vec_step);
3497 vec_def = make_ssa_name (vec_dest, new_stmt);
3498 gimple_assign_set_lhs (new_stmt, vec_def);
3499 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3500 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3501 NULL));
3502
3503 /* Set the arguments of the phi node: */
3504 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3505 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3506 UNKNOWN_LOCATION);
3507
3508
3509 /* In case that vectorization factor (VF) is bigger than the number
3510 of elements that we can fit in a vectype (nunits), we have to generate
3511 more than one vector stmt - i.e - we need to "unroll" the
3512 vector stmt by a factor VF/nunits. For more details see documentation
3513 in vectorizable_operation. */
3514
3515 if (ncopies > 1)
3516 {
3517 stmt_vec_info prev_stmt_vinfo;
3518 /* FORNOW. This restriction should be relaxed. */
3519 gcc_assert (!nested_in_vect_loop);
3520
3521 /* Create the vector that holds the step of the induction. */
3522 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3523 {
3524 expr = build_int_cst (integer_type_node, nunits);
3525 expr = fold_convert (TREE_TYPE (step_expr), expr);
3526 }
3527 else
3528 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3529 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3530 expr, step_expr);
3531 if (TREE_CODE (step_expr) == SSA_NAME)
3532 new_name = vect_init_vector (iv_phi, new_name,
3533 TREE_TYPE (step_expr), NULL);
3534 t = unshare_expr (new_name);
3535 gcc_assert (CONSTANT_CLASS_P (new_name)
3536 || TREE_CODE (new_name) == SSA_NAME);
3537 new_vec = build_vector_from_val (stepvectype, t);
3538 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3539
3540 vec_def = induc_def;
3541 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3542 for (i = 1; i < ncopies; i++)
3543 {
3544 /* vec_i = vec_prev + vec_step */
3545 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3546 vec_def, vec_step);
3547 vec_def = make_ssa_name (vec_dest, new_stmt);
3548 gimple_assign_set_lhs (new_stmt, vec_def);
3549
3550 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3551 if (!useless_type_conversion_p (resvectype, vectype))
3552 {
3553 new_stmt = gimple_build_assign_with_ops
3554 (VIEW_CONVERT_EXPR,
3555 vect_get_new_vect_var (resvectype, vect_simple_var,
3556 "vec_iv_"),
3557 build1 (VIEW_CONVERT_EXPR, resvectype,
3558 gimple_assign_lhs (new_stmt)), NULL_TREE);
3559 gimple_assign_set_lhs (new_stmt,
3560 make_ssa_name
3561 (gimple_assign_lhs (new_stmt), new_stmt));
3562 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3563 }
3564 set_vinfo_for_stmt (new_stmt,
3565 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3566 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3567 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3568 }
3569 }
3570
3571 if (nested_in_vect_loop)
3572 {
3573 /* Find the loop-closed exit-phi of the induction, and record
3574 the final vector of induction results: */
3575 exit_phi = NULL;
3576 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3577 {
3578 gimple use_stmt = USE_STMT (use_p);
3579 if (is_gimple_debug (use_stmt))
3580 continue;
3581
3582 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3583 {
3584 exit_phi = use_stmt;
3585 break;
3586 }
3587 }
3588 if (exit_phi)
3589 {
3590 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3591 /* FORNOW. Currently not supporting the case that an inner-loop induction
3592 is not used in the outer-loop (i.e. only outside the outer-loop). */
3593 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3594 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3595
3596 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3597 if (dump_enabled_p ())
3598 {
3599 dump_printf_loc (MSG_NOTE, vect_location,
3600 "vector of inductions after inner-loop:");
3601 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3602 dump_printf (MSG_NOTE, "\n");
3603 }
3604 }
3605 }
3606
3607
3608 if (dump_enabled_p ())
3609 {
3610 dump_printf_loc (MSG_NOTE, vect_location,
3611 "transform induction: created def-use cycle: ");
3612 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3613 dump_printf (MSG_NOTE, "\n");
3614 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3615 SSA_NAME_DEF_STMT (vec_def), 0);
3616 dump_printf (MSG_NOTE, "\n");
3617 }
3618
3619 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3620 if (!useless_type_conversion_p (resvectype, vectype))
3621 {
3622 new_stmt = gimple_build_assign_with_ops
3623 (VIEW_CONVERT_EXPR,
3624 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3625 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3626 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3627 gimple_assign_set_lhs (new_stmt, induc_def);
3628 si = gsi_after_labels (bb);
3629 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3630 set_vinfo_for_stmt (new_stmt,
3631 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3632 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3633 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3634 }
3635
3636 return induc_def;
3637 }
3638
3639
3640 /* Function get_initial_def_for_reduction
3641
3642 Input:
3643 STMT - a stmt that performs a reduction operation in the loop.
3644 INIT_VAL - the initial value of the reduction variable
3645
3646 Output:
3647 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3648 of the reduction (used for adjusting the epilog - see below).
3649 Return a vector variable, initialized according to the operation that STMT
3650 performs. This vector will be used as the initial value of the
3651 vector of partial results.
3652
3653 Option1 (adjust in epilog): Initialize the vector as follows:
3654 add/bit or/xor: [0,0,...,0,0]
3655 mult/bit and: [1,1,...,1,1]
3656 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3657 and when necessary (e.g. add/mult case) let the caller know
3658 that it needs to adjust the result by init_val.
3659
3660 Option2: Initialize the vector as follows:
3661 add/bit or/xor: [init_val,0,0,...,0]
3662 mult/bit and: [init_val,1,1,...,1]
3663 min/max/cond_expr: [init_val,init_val,...,init_val]
3664 and no adjustments are needed.
3665
3666 For example, for the following code:
3667
3668 s = init_val;
3669 for (i=0;i<n;i++)
3670 s = s + a[i];
3671
3672 STMT is 's = s + a[i]', and the reduction variable is 's'.
3673 For a vector of 4 units, we want to return either [0,0,0,init_val],
3674 or [0,0,0,0] and let the caller know that it needs to adjust
3675 the result at the end by 'init_val'.
3676
3677 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3678 initialization vector is simpler (same element in all entries), if
3679 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3680
3681 A cost model should help decide between these two schemes. */
3682
3683 tree
3684 get_initial_def_for_reduction (gimple stmt, tree init_val,
3685 tree *adjustment_def)
3686 {
3687 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3688 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3689 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3690 tree scalar_type = TREE_TYPE (init_val);
3691 tree vectype = get_vectype_for_scalar_type (scalar_type);
3692 int nunits;
3693 enum tree_code code = gimple_assign_rhs_code (stmt);
3694 tree def_for_init;
3695 tree init_def;
3696 tree *elts;
3697 int i;
3698 bool nested_in_vect_loop = false;
3699 tree init_value;
3700 REAL_VALUE_TYPE real_init_val = dconst0;
3701 int int_init_val = 0;
3702 gimple def_stmt = NULL;
3703
3704 gcc_assert (vectype);
3705 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3706
3707 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3708 || SCALAR_FLOAT_TYPE_P (scalar_type));
3709
3710 if (nested_in_vect_loop_p (loop, stmt))
3711 nested_in_vect_loop = true;
3712 else
3713 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3714
3715 /* In case of double reduction we only create a vector variable to be put
3716 in the reduction phi node. The actual statement creation is done in
3717 vect_create_epilog_for_reduction. */
3718 if (adjustment_def && nested_in_vect_loop
3719 && TREE_CODE (init_val) == SSA_NAME
3720 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3721 && gimple_code (def_stmt) == GIMPLE_PHI
3722 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3723 && vinfo_for_stmt (def_stmt)
3724 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3725 == vect_double_reduction_def)
3726 {
3727 *adjustment_def = NULL;
3728 return vect_create_destination_var (init_val, vectype);
3729 }
3730
3731 if (TREE_CONSTANT (init_val))
3732 {
3733 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3734 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3735 else
3736 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3737 }
3738 else
3739 init_value = init_val;
3740
3741 switch (code)
3742 {
3743 case WIDEN_SUM_EXPR:
3744 case DOT_PROD_EXPR:
3745 case SAD_EXPR:
3746 case PLUS_EXPR:
3747 case MINUS_EXPR:
3748 case BIT_IOR_EXPR:
3749 case BIT_XOR_EXPR:
3750 case MULT_EXPR:
3751 case BIT_AND_EXPR:
3752 /* ADJUSMENT_DEF is NULL when called from
3753 vect_create_epilog_for_reduction to vectorize double reduction. */
3754 if (adjustment_def)
3755 {
3756 if (nested_in_vect_loop)
3757 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3758 NULL);
3759 else
3760 *adjustment_def = init_val;
3761 }
3762
3763 if (code == MULT_EXPR)
3764 {
3765 real_init_val = dconst1;
3766 int_init_val = 1;
3767 }
3768
3769 if (code == BIT_AND_EXPR)
3770 int_init_val = -1;
3771
3772 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3773 def_for_init = build_real (scalar_type, real_init_val);
3774 else
3775 def_for_init = build_int_cst (scalar_type, int_init_val);
3776
3777 /* Create a vector of '0' or '1' except the first element. */
3778 elts = XALLOCAVEC (tree, nunits);
3779 for (i = nunits - 2; i >= 0; --i)
3780 elts[i + 1] = def_for_init;
3781
3782 /* Option1: the first element is '0' or '1' as well. */
3783 if (adjustment_def)
3784 {
3785 elts[0] = def_for_init;
3786 init_def = build_vector (vectype, elts);
3787 break;
3788 }
3789
3790 /* Option2: the first element is INIT_VAL. */
3791 elts[0] = init_val;
3792 if (TREE_CONSTANT (init_val))
3793 init_def = build_vector (vectype, elts);
3794 else
3795 {
3796 vec<constructor_elt, va_gc> *v;
3797 vec_alloc (v, nunits);
3798 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3799 for (i = 1; i < nunits; ++i)
3800 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3801 init_def = build_constructor (vectype, v);
3802 }
3803
3804 break;
3805
3806 case MIN_EXPR:
3807 case MAX_EXPR:
3808 case COND_EXPR:
3809 if (adjustment_def)
3810 {
3811 *adjustment_def = NULL_TREE;
3812 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3813 break;
3814 }
3815
3816 init_def = build_vector_from_val (vectype, init_value);
3817 break;
3818
3819 default:
3820 gcc_unreachable ();
3821 }
3822
3823 return init_def;
3824 }
3825
3826 /* Function vect_create_epilog_for_reduction
3827
3828 Create code at the loop-epilog to finalize the result of a reduction
3829 computation.
3830
3831 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3832 reduction statements.
3833 STMT is the scalar reduction stmt that is being vectorized.
3834 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3835 number of elements that we can fit in a vectype (nunits). In this case
3836 we have to generate more than one vector stmt - i.e - we need to "unroll"
3837 the vector stmt by a factor VF/nunits. For more details see documentation
3838 in vectorizable_operation.
3839 REDUC_CODE is the tree-code for the epilog reduction.
3840 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3841 computation.
3842 REDUC_INDEX is the index of the operand in the right hand side of the
3843 statement that is defined by REDUCTION_PHI.
3844 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3845 SLP_NODE is an SLP node containing a group of reduction statements. The
3846 first one in this group is STMT.
3847
3848 This function:
3849 1. Creates the reduction def-use cycles: sets the arguments for
3850 REDUCTION_PHIS:
3851 The loop-entry argument is the vectorized initial-value of the reduction.
3852 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3853 sums.
3854 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3855 by applying the operation specified by REDUC_CODE if available, or by
3856 other means (whole-vector shifts or a scalar loop).
3857 The function also creates a new phi node at the loop exit to preserve
3858 loop-closed form, as illustrated below.
3859
3860 The flow at the entry to this function:
3861
3862 loop:
3863 vec_def = phi <null, null> # REDUCTION_PHI
3864 VECT_DEF = vector_stmt # vectorized form of STMT
3865 s_loop = scalar_stmt # (scalar) STMT
3866 loop_exit:
3867 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3868 use <s_out0>
3869 use <s_out0>
3870
3871 The above is transformed by this function into:
3872
3873 loop:
3874 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3875 VECT_DEF = vector_stmt # vectorized form of STMT
3876 s_loop = scalar_stmt # (scalar) STMT
3877 loop_exit:
3878 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3879 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3880 v_out2 = reduce <v_out1>
3881 s_out3 = extract_field <v_out2, 0>
3882 s_out4 = adjust_result <s_out3>
3883 use <s_out4>
3884 use <s_out4>
3885 */
3886
3887 static void
3888 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3889 int ncopies, enum tree_code reduc_code,
3890 vec<gimple> reduction_phis,
3891 int reduc_index, bool double_reduc,
3892 slp_tree slp_node)
3893 {
3894 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3895 stmt_vec_info prev_phi_info;
3896 tree vectype;
3897 machine_mode mode;
3898 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3899 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3900 basic_block exit_bb;
3901 tree scalar_dest;
3902 tree scalar_type;
3903 gimple new_phi = NULL, phi;
3904 gimple_stmt_iterator exit_gsi;
3905 tree vec_dest;
3906 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3907 gimple epilog_stmt = NULL;
3908 enum tree_code code = gimple_assign_rhs_code (stmt);
3909 gimple exit_phi;
3910 tree bitsize;
3911 tree adjustment_def = NULL;
3912 tree vec_initial_def = NULL;
3913 tree reduction_op, expr, def;
3914 tree orig_name, scalar_result;
3915 imm_use_iterator imm_iter, phi_imm_iter;
3916 use_operand_p use_p, phi_use_p;
3917 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3918 bool nested_in_vect_loop = false;
3919 auto_vec<gimple> new_phis;
3920 auto_vec<gimple> inner_phis;
3921 enum vect_def_type dt = vect_unknown_def_type;
3922 int j, i;
3923 auto_vec<tree> scalar_results;
3924 unsigned int group_size = 1, k, ratio;
3925 auto_vec<tree> vec_initial_defs;
3926 auto_vec<gimple> phis;
3927 bool slp_reduc = false;
3928 tree new_phi_result;
3929 gimple inner_phi = NULL;
3930
3931 if (slp_node)
3932 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3933
3934 if (nested_in_vect_loop_p (loop, stmt))
3935 {
3936 outer_loop = loop;
3937 loop = loop->inner;
3938 nested_in_vect_loop = true;
3939 gcc_assert (!slp_node);
3940 }
3941
3942 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3943 {
3944 case GIMPLE_SINGLE_RHS:
3945 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3946 == ternary_op);
3947 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3948 break;
3949 case GIMPLE_UNARY_RHS:
3950 reduction_op = gimple_assign_rhs1 (stmt);
3951 break;
3952 case GIMPLE_BINARY_RHS:
3953 reduction_op = reduc_index ?
3954 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3955 break;
3956 case GIMPLE_TERNARY_RHS:
3957 reduction_op = gimple_op (stmt, reduc_index + 1);
3958 break;
3959 default:
3960 gcc_unreachable ();
3961 }
3962
3963 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3964 gcc_assert (vectype);
3965 mode = TYPE_MODE (vectype);
3966
3967 /* 1. Create the reduction def-use cycle:
3968 Set the arguments of REDUCTION_PHIS, i.e., transform
3969
3970 loop:
3971 vec_def = phi <null, null> # REDUCTION_PHI
3972 VECT_DEF = vector_stmt # vectorized form of STMT
3973 ...
3974
3975 into:
3976
3977 loop:
3978 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3979 VECT_DEF = vector_stmt # vectorized form of STMT
3980 ...
3981
3982 (in case of SLP, do it for all the phis). */
3983
3984 /* Get the loop-entry arguments. */
3985 if (slp_node)
3986 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3987 NULL, slp_node, reduc_index);
3988 else
3989 {
3990 vec_initial_defs.create (1);
3991 /* For the case of reduction, vect_get_vec_def_for_operand returns
3992 the scalar def before the loop, that defines the initial value
3993 of the reduction variable. */
3994 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3995 &adjustment_def);
3996 vec_initial_defs.quick_push (vec_initial_def);
3997 }
3998
3999 /* Set phi nodes arguments. */
4000 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4001 {
4002 tree vec_init_def, def;
4003 gimple_seq stmts;
4004 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4005 true, NULL_TREE);
4006 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4007 def = vect_defs[i];
4008 for (j = 0; j < ncopies; j++)
4009 {
4010 /* Set the loop-entry arg of the reduction-phi. */
4011 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4012 UNKNOWN_LOCATION);
4013
4014 /* Set the loop-latch arg for the reduction-phi. */
4015 if (j > 0)
4016 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4017
4018 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4019
4020 if (dump_enabled_p ())
4021 {
4022 dump_printf_loc (MSG_NOTE, vect_location,
4023 "transform reduction: created def-use cycle: ");
4024 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4025 dump_printf (MSG_NOTE, "\n");
4026 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4027 dump_printf (MSG_NOTE, "\n");
4028 }
4029
4030 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4031 }
4032 }
4033
4034 /* 2. Create epilog code.
4035 The reduction epilog code operates across the elements of the vector
4036 of partial results computed by the vectorized loop.
4037 The reduction epilog code consists of:
4038
4039 step 1: compute the scalar result in a vector (v_out2)
4040 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4041 step 3: adjust the scalar result (s_out3) if needed.
4042
4043 Step 1 can be accomplished using one the following three schemes:
4044 (scheme 1) using reduc_code, if available.
4045 (scheme 2) using whole-vector shifts, if available.
4046 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4047 combined.
4048
4049 The overall epilog code looks like this:
4050
4051 s_out0 = phi <s_loop> # original EXIT_PHI
4052 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4053 v_out2 = reduce <v_out1> # step 1
4054 s_out3 = extract_field <v_out2, 0> # step 2
4055 s_out4 = adjust_result <s_out3> # step 3
4056
4057 (step 3 is optional, and steps 1 and 2 may be combined).
4058 Lastly, the uses of s_out0 are replaced by s_out4. */
4059
4060
4061 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4062 v_out1 = phi <VECT_DEF>
4063 Store them in NEW_PHIS. */
4064
4065 exit_bb = single_exit (loop)->dest;
4066 prev_phi_info = NULL;
4067 new_phis.create (vect_defs.length ());
4068 FOR_EACH_VEC_ELT (vect_defs, i, def)
4069 {
4070 for (j = 0; j < ncopies; j++)
4071 {
4072 tree new_def = copy_ssa_name (def, NULL);
4073 phi = create_phi_node (new_def, exit_bb);
4074 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4075 if (j == 0)
4076 new_phis.quick_push (phi);
4077 else
4078 {
4079 def = vect_get_vec_def_for_stmt_copy (dt, def);
4080 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4081 }
4082
4083 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4084 prev_phi_info = vinfo_for_stmt (phi);
4085 }
4086 }
4087
4088 /* The epilogue is created for the outer-loop, i.e., for the loop being
4089 vectorized. Create exit phis for the outer loop. */
4090 if (double_reduc)
4091 {
4092 loop = outer_loop;
4093 exit_bb = single_exit (loop)->dest;
4094 inner_phis.create (vect_defs.length ());
4095 FOR_EACH_VEC_ELT (new_phis, i, phi)
4096 {
4097 tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4098 gimple outer_phi = create_phi_node (new_result, exit_bb);
4099 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4100 PHI_RESULT (phi));
4101 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4102 loop_vinfo, NULL));
4103 inner_phis.quick_push (phi);
4104 new_phis[i] = outer_phi;
4105 prev_phi_info = vinfo_for_stmt (outer_phi);
4106 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4107 {
4108 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4109 new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4110 outer_phi = create_phi_node (new_result, exit_bb);
4111 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4112 PHI_RESULT (phi));
4113 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4114 loop_vinfo, NULL));
4115 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4116 prev_phi_info = vinfo_for_stmt (outer_phi);
4117 }
4118 }
4119 }
4120
4121 exit_gsi = gsi_after_labels (exit_bb);
4122
4123 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4124 (i.e. when reduc_code is not available) and in the final adjustment
4125 code (if needed). Also get the original scalar reduction variable as
4126 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4127 represents a reduction pattern), the tree-code and scalar-def are
4128 taken from the original stmt that the pattern-stmt (STMT) replaces.
4129 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4130 are taken from STMT. */
4131
4132 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4133 if (!orig_stmt)
4134 {
4135 /* Regular reduction */
4136 orig_stmt = stmt;
4137 }
4138 else
4139 {
4140 /* Reduction pattern */
4141 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4142 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4143 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4144 }
4145
4146 code = gimple_assign_rhs_code (orig_stmt);
4147 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4148 partial results are added and not subtracted. */
4149 if (code == MINUS_EXPR)
4150 code = PLUS_EXPR;
4151
4152 scalar_dest = gimple_assign_lhs (orig_stmt);
4153 scalar_type = TREE_TYPE (scalar_dest);
4154 scalar_results.create (group_size);
4155 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4156 bitsize = TYPE_SIZE (scalar_type);
4157
4158 /* In case this is a reduction in an inner-loop while vectorizing an outer
4159 loop - we don't need to extract a single scalar result at the end of the
4160 inner-loop (unless it is double reduction, i.e., the use of reduction is
4161 outside the outer-loop). The final vector of partial results will be used
4162 in the vectorized outer-loop, or reduced to a scalar result at the end of
4163 the outer-loop. */
4164 if (nested_in_vect_loop && !double_reduc)
4165 goto vect_finalize_reduction;
4166
4167 /* SLP reduction without reduction chain, e.g.,
4168 # a1 = phi <a2, a0>
4169 # b1 = phi <b2, b0>
4170 a2 = operation (a1)
4171 b2 = operation (b1) */
4172 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4173
4174 /* In case of reduction chain, e.g.,
4175 # a1 = phi <a3, a0>
4176 a2 = operation (a1)
4177 a3 = operation (a2),
4178
4179 we may end up with more than one vector result. Here we reduce them to
4180 one vector. */
4181 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4182 {
4183 tree first_vect = PHI_RESULT (new_phis[0]);
4184 tree tmp;
4185 gimple new_vec_stmt = NULL;
4186
4187 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4188 for (k = 1; k < new_phis.length (); k++)
4189 {
4190 gimple next_phi = new_phis[k];
4191 tree second_vect = PHI_RESULT (next_phi);
4192
4193 tmp = build2 (code, vectype, first_vect, second_vect);
4194 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4195 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4196 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4197 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4198 }
4199
4200 new_phi_result = first_vect;
4201 if (new_vec_stmt)
4202 {
4203 new_phis.truncate (0);
4204 new_phis.safe_push (new_vec_stmt);
4205 }
4206 }
4207 else
4208 new_phi_result = PHI_RESULT (new_phis[0]);
4209
4210 /* 2.3 Create the reduction code, using one of the three schemes described
4211 above. In SLP we simply need to extract all the elements from the
4212 vector (without reducing them), so we use scalar shifts. */
4213 if (reduc_code != ERROR_MARK && !slp_reduc)
4214 {
4215 tree tmp;
4216 tree vec_elem_type;
4217
4218 /*** Case 1: Create:
4219 v_out2 = reduc_expr <v_out1> */
4220
4221 if (dump_enabled_p ())
4222 dump_printf_loc (MSG_NOTE, vect_location,
4223 "Reduce using direct vector reduction.\n");
4224
4225 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4226 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4227 {
4228 tree tmp_dest =
4229 vect_create_destination_var (scalar_dest, vec_elem_type);
4230 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4231 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4232 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4233 gimple_assign_set_lhs (epilog_stmt, new_temp);
4234 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4235
4236 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4237 }
4238 else
4239 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4240 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4241 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4242 gimple_assign_set_lhs (epilog_stmt, new_temp);
4243 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4244 scalar_results.safe_push (new_temp);
4245 }
4246 else
4247 {
4248 bool reduce_with_shift = have_whole_vector_shift (mode);
4249 int element_bitsize = tree_to_uhwi (bitsize);
4250 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4251 tree vec_temp;
4252
4253 /* Regardless of whether we have a whole vector shift, if we're
4254 emulating the operation via tree-vect-generic, we don't want
4255 to use it. Only the first round of the reduction is likely
4256 to still be profitable via emulation. */
4257 /* ??? It might be better to emit a reduction tree code here, so that
4258 tree-vect-generic can expand the first round via bit tricks. */
4259 if (!VECTOR_MODE_P (mode))
4260 reduce_with_shift = false;
4261 else
4262 {
4263 optab optab = optab_for_tree_code (code, vectype, optab_default);
4264 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4265 reduce_with_shift = false;
4266 }
4267
4268 if (reduce_with_shift && !slp_reduc)
4269 {
4270 int nelements = vec_size_in_bits / element_bitsize;
4271 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4272
4273 int elt_offset;
4274
4275 tree zero_vec = build_zero_cst (vectype);
4276 /*** Case 2: Create:
4277 for (offset = nelements/2; offset >= 1; offset/=2)
4278 {
4279 Create: va' = vec_shift <va, offset>
4280 Create: va = vop <va, va'>
4281 } */
4282
4283 tree rhs;
4284
4285 if (dump_enabled_p ())
4286 dump_printf_loc (MSG_NOTE, vect_location,
4287 "Reduce using vector shifts\n");
4288
4289 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4290 new_temp = new_phi_result;
4291 for (elt_offset = nelements / 2;
4292 elt_offset >= 1;
4293 elt_offset /= 2)
4294 {
4295 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4296 tree mask = vect_gen_perm_mask_any (vectype, sel);
4297 epilog_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR,
4298 vec_dest, new_temp,
4299 zero_vec, mask);
4300 new_name = make_ssa_name (vec_dest, epilog_stmt);
4301 gimple_assign_set_lhs (epilog_stmt, new_name);
4302 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4303
4304 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4305 new_name, new_temp);
4306 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4307 gimple_assign_set_lhs (epilog_stmt, new_temp);
4308 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4309 }
4310
4311 /* 2.4 Extract the final scalar result. Create:
4312 s_out3 = extract_field <v_out2, bitpos> */
4313
4314 if (dump_enabled_p ())
4315 dump_printf_loc (MSG_NOTE, vect_location,
4316 "extract scalar result\n");
4317
4318 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4319 bitsize, bitsize_zero_node);
4320 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4321 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4322 gimple_assign_set_lhs (epilog_stmt, new_temp);
4323 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4324 scalar_results.safe_push (new_temp);
4325 }
4326 else
4327 {
4328 /*** Case 3: Create:
4329 s = extract_field <v_out2, 0>
4330 for (offset = element_size;
4331 offset < vector_size;
4332 offset += element_size;)
4333 {
4334 Create: s' = extract_field <v_out2, offset>
4335 Create: s = op <s, s'> // For non SLP cases
4336 } */
4337
4338 if (dump_enabled_p ())
4339 dump_printf_loc (MSG_NOTE, vect_location,
4340 "Reduce using scalar code.\n");
4341
4342 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4343 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4344 {
4345 int bit_offset;
4346 if (gimple_code (new_phi) == GIMPLE_PHI)
4347 vec_temp = PHI_RESULT (new_phi);
4348 else
4349 vec_temp = gimple_assign_lhs (new_phi);
4350 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4351 bitsize_zero_node);
4352 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4353 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4354 gimple_assign_set_lhs (epilog_stmt, new_temp);
4355 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4356
4357 /* In SLP we don't need to apply reduction operation, so we just
4358 collect s' values in SCALAR_RESULTS. */
4359 if (slp_reduc)
4360 scalar_results.safe_push (new_temp);
4361
4362 for (bit_offset = element_bitsize;
4363 bit_offset < vec_size_in_bits;
4364 bit_offset += element_bitsize)
4365 {
4366 tree bitpos = bitsize_int (bit_offset);
4367 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4368 bitsize, bitpos);
4369
4370 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4371 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4372 gimple_assign_set_lhs (epilog_stmt, new_name);
4373 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4374
4375 if (slp_reduc)
4376 {
4377 /* In SLP we don't need to apply reduction operation, so
4378 we just collect s' values in SCALAR_RESULTS. */
4379 new_temp = new_name;
4380 scalar_results.safe_push (new_name);
4381 }
4382 else
4383 {
4384 epilog_stmt = gimple_build_assign_with_ops (code,
4385 new_scalar_dest, new_name, new_temp);
4386 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4387 gimple_assign_set_lhs (epilog_stmt, new_temp);
4388 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4389 }
4390 }
4391 }
4392
4393 /* The only case where we need to reduce scalar results in SLP, is
4394 unrolling. If the size of SCALAR_RESULTS is greater than
4395 GROUP_SIZE, we reduce them combining elements modulo
4396 GROUP_SIZE. */
4397 if (slp_reduc)
4398 {
4399 tree res, first_res, new_res;
4400 gimple new_stmt;
4401
4402 /* Reduce multiple scalar results in case of SLP unrolling. */
4403 for (j = group_size; scalar_results.iterate (j, &res);
4404 j++)
4405 {
4406 first_res = scalar_results[j % group_size];
4407 new_stmt = gimple_build_assign_with_ops (code,
4408 new_scalar_dest, first_res, res);
4409 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4410 gimple_assign_set_lhs (new_stmt, new_res);
4411 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4412 scalar_results[j % group_size] = new_res;
4413 }
4414 }
4415 else
4416 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4417 scalar_results.safe_push (new_temp);
4418 }
4419 }
4420
4421 vect_finalize_reduction:
4422
4423 if (double_reduc)
4424 loop = loop->inner;
4425
4426 /* 2.5 Adjust the final result by the initial value of the reduction
4427 variable. (When such adjustment is not needed, then
4428 'adjustment_def' is zero). For example, if code is PLUS we create:
4429 new_temp = loop_exit_def + adjustment_def */
4430
4431 if (adjustment_def)
4432 {
4433 gcc_assert (!slp_reduc);
4434 if (nested_in_vect_loop)
4435 {
4436 new_phi = new_phis[0];
4437 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4438 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4439 new_dest = vect_create_destination_var (scalar_dest, vectype);
4440 }
4441 else
4442 {
4443 new_temp = scalar_results[0];
4444 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4445 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4446 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4447 }
4448
4449 epilog_stmt = gimple_build_assign (new_dest, expr);
4450 new_temp = make_ssa_name (new_dest, epilog_stmt);
4451 gimple_assign_set_lhs (epilog_stmt, new_temp);
4452 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4453 if (nested_in_vect_loop)
4454 {
4455 set_vinfo_for_stmt (epilog_stmt,
4456 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4457 NULL));
4458 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4459 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4460
4461 if (!double_reduc)
4462 scalar_results.quick_push (new_temp);
4463 else
4464 scalar_results[0] = new_temp;
4465 }
4466 else
4467 scalar_results[0] = new_temp;
4468
4469 new_phis[0] = epilog_stmt;
4470 }
4471
4472 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4473 phis with new adjusted scalar results, i.e., replace use <s_out0>
4474 with use <s_out4>.
4475
4476 Transform:
4477 loop_exit:
4478 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4479 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4480 v_out2 = reduce <v_out1>
4481 s_out3 = extract_field <v_out2, 0>
4482 s_out4 = adjust_result <s_out3>
4483 use <s_out0>
4484 use <s_out0>
4485
4486 into:
4487
4488 loop_exit:
4489 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4490 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4491 v_out2 = reduce <v_out1>
4492 s_out3 = extract_field <v_out2, 0>
4493 s_out4 = adjust_result <s_out3>
4494 use <s_out4>
4495 use <s_out4> */
4496
4497
4498 /* In SLP reduction chain we reduce vector results into one vector if
4499 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
4500 the last stmt in the reduction chain, since we are looking for the loop
4501 exit phi node. */
4502 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4503 {
4504 scalar_dest = gimple_assign_lhs (
4505 SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4506 group_size = 1;
4507 }
4508
4509 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4510 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
4511 need to match SCALAR_RESULTS with corresponding statements. The first
4512 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4513 the first vector stmt, etc.
4514 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
4515 if (group_size > new_phis.length ())
4516 {
4517 ratio = group_size / new_phis.length ();
4518 gcc_assert (!(group_size % new_phis.length ()));
4519 }
4520 else
4521 ratio = 1;
4522
4523 for (k = 0; k < group_size; k++)
4524 {
4525 if (k % ratio == 0)
4526 {
4527 epilog_stmt = new_phis[k / ratio];
4528 reduction_phi = reduction_phis[k / ratio];
4529 if (double_reduc)
4530 inner_phi = inner_phis[k / ratio];
4531 }
4532
4533 if (slp_reduc)
4534 {
4535 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4536
4537 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4538 /* SLP statements can't participate in patterns. */
4539 gcc_assert (!orig_stmt);
4540 scalar_dest = gimple_assign_lhs (current_stmt);
4541 }
4542
4543 phis.create (3);
4544 /* Find the loop-closed-use at the loop exit of the original scalar
4545 result. (The reduction result is expected to have two immediate uses -
4546 one at the latch block, and one at the loop exit). */
4547 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4548 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4549 && !is_gimple_debug (USE_STMT (use_p)))
4550 phis.safe_push (USE_STMT (use_p));
4551
4552 /* While we expect to have found an exit_phi because of loop-closed-ssa
4553 form we can end up without one if the scalar cycle is dead. */
4554
4555 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4556 {
4557 if (outer_loop)
4558 {
4559 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4560 gimple vect_phi;
4561
4562 /* FORNOW. Currently not supporting the case that an inner-loop
4563 reduction is not used in the outer-loop (but only outside the
4564 outer-loop), unless it is double reduction. */
4565 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4566 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4567 || double_reduc);
4568
4569 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4570 if (!double_reduc
4571 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4572 != vect_double_reduction_def)
4573 continue;
4574
4575 /* Handle double reduction:
4576
4577 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4578 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4579 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4580 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4581
4582 At that point the regular reduction (stmt2 and stmt3) is
4583 already vectorized, as well as the exit phi node, stmt4.
4584 Here we vectorize the phi node of double reduction, stmt1, and
4585 update all relevant statements. */
4586
4587 /* Go through all the uses of s2 to find double reduction phi
4588 node, i.e., stmt1 above. */
4589 orig_name = PHI_RESULT (exit_phi);
4590 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4591 {
4592 stmt_vec_info use_stmt_vinfo;
4593 stmt_vec_info new_phi_vinfo;
4594 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4595 basic_block bb = gimple_bb (use_stmt);
4596 gimple use;
4597
4598 /* Check that USE_STMT is really double reduction phi
4599 node. */
4600 if (gimple_code (use_stmt) != GIMPLE_PHI
4601 || gimple_phi_num_args (use_stmt) != 2
4602 || bb->loop_father != outer_loop)
4603 continue;
4604 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4605 if (!use_stmt_vinfo
4606 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4607 != vect_double_reduction_def)
4608 continue;
4609
4610 /* Create vector phi node for double reduction:
4611 vs1 = phi <vs0, vs2>
4612 vs1 was created previously in this function by a call to
4613 vect_get_vec_def_for_operand and is stored in
4614 vec_initial_def;
4615 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4616 vs0 is created here. */
4617
4618 /* Create vector phi node. */
4619 vect_phi = create_phi_node (vec_initial_def, bb);
4620 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4621 loop_vec_info_for_loop (outer_loop), NULL);
4622 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4623
4624 /* Create vs0 - initial def of the double reduction phi. */
4625 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4626 loop_preheader_edge (outer_loop));
4627 init_def = get_initial_def_for_reduction (stmt,
4628 preheader_arg, NULL);
4629 vect_phi_init = vect_init_vector (use_stmt, init_def,
4630 vectype, NULL);
4631
4632 /* Update phi node arguments with vs0 and vs2. */
4633 add_phi_arg (vect_phi, vect_phi_init,
4634 loop_preheader_edge (outer_loop),
4635 UNKNOWN_LOCATION);
4636 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4637 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4638 if (dump_enabled_p ())
4639 {
4640 dump_printf_loc (MSG_NOTE, vect_location,
4641 "created double reduction phi node: ");
4642 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4643 dump_printf (MSG_NOTE, "\n");
4644 }
4645
4646 vect_phi_res = PHI_RESULT (vect_phi);
4647
4648 /* Replace the use, i.e., set the correct vs1 in the regular
4649 reduction phi node. FORNOW, NCOPIES is always 1, so the
4650 loop is redundant. */
4651 use = reduction_phi;
4652 for (j = 0; j < ncopies; j++)
4653 {
4654 edge pr_edge = loop_preheader_edge (loop);
4655 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4656 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4657 }
4658 }
4659 }
4660 }
4661
4662 phis.release ();
4663 if (nested_in_vect_loop)
4664 {
4665 if (double_reduc)
4666 loop = outer_loop;
4667 else
4668 continue;
4669 }
4670
4671 phis.create (3);
4672 /* Find the loop-closed-use at the loop exit of the original scalar
4673 result. (The reduction result is expected to have two immediate uses,
4674 one at the latch block, and one at the loop exit). For double
4675 reductions we are looking for exit phis of the outer loop. */
4676 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4677 {
4678 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4679 {
4680 if (!is_gimple_debug (USE_STMT (use_p)))
4681 phis.safe_push (USE_STMT (use_p));
4682 }
4683 else
4684 {
4685 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4686 {
4687 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4688
4689 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4690 {
4691 if (!flow_bb_inside_loop_p (loop,
4692 gimple_bb (USE_STMT (phi_use_p)))
4693 && !is_gimple_debug (USE_STMT (phi_use_p)))
4694 phis.safe_push (USE_STMT (phi_use_p));
4695 }
4696 }
4697 }
4698 }
4699
4700 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4701 {
4702 /* Replace the uses: */
4703 orig_name = PHI_RESULT (exit_phi);
4704 scalar_result = scalar_results[k];
4705 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4706 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4707 SET_USE (use_p, scalar_result);
4708 }
4709
4710 phis.release ();
4711 }
4712 }
4713
4714
4715 /* Function vectorizable_reduction.
4716
4717 Check if STMT performs a reduction operation that can be vectorized.
4718 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4719 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4720 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4721
4722 This function also handles reduction idioms (patterns) that have been
4723 recognized in advance during vect_pattern_recog. In this case, STMT may be
4724 of this form:
4725 X = pattern_expr (arg0, arg1, ..., X)
4726 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4727 sequence that had been detected and replaced by the pattern-stmt (STMT).
4728
4729 In some cases of reduction patterns, the type of the reduction variable X is
4730 different than the type of the other arguments of STMT.
4731 In such cases, the vectype that is used when transforming STMT into a vector
4732 stmt is different than the vectype that is used to determine the
4733 vectorization factor, because it consists of a different number of elements
4734 than the actual number of elements that are being operated upon in parallel.
4735
4736 For example, consider an accumulation of shorts into an int accumulator.
4737 On some targets it's possible to vectorize this pattern operating on 8
4738 shorts at a time (hence, the vectype for purposes of determining the
4739 vectorization factor should be V8HI); on the other hand, the vectype that
4740 is used to create the vector form is actually V4SI (the type of the result).
4741
4742 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4743 indicates what is the actual level of parallelism (V8HI in the example), so
4744 that the right vectorization factor would be derived. This vectype
4745 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4746 be used to create the vectorized stmt. The right vectype for the vectorized
4747 stmt is obtained from the type of the result X:
4748 get_vectype_for_scalar_type (TREE_TYPE (X))
4749
4750 This means that, contrary to "regular" reductions (or "regular" stmts in
4751 general), the following equation:
4752 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4753 does *NOT* necessarily hold for reduction patterns. */
4754
4755 bool
4756 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4757 gimple *vec_stmt, slp_tree slp_node)
4758 {
/* Summary of the code below: STMT is checked against the conditions under
   which a reduction can be vectorized, and false is returned for the
   cases that are rejected.  When VEC_STMT is NULL only the analysis is
   done: the cost is modelled and STMT_VINFO_TYPE is set.  Otherwise the
   vector PHIs and statements are generated for each copy and the epilog
   is created by vect_create_epilog_for_reduction.  */
4759 tree vec_dest;
4760 tree scalar_dest;
4761 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4762 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4763 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4764 tree vectype_in = NULL_TREE;
4765 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4766 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4767 enum tree_code code, orig_code, epilog_reduc_code;
4768 machine_mode vec_mode;
4769 int op_type;
4770 optab optab, reduc_optab;
4771 tree new_temp = NULL_TREE;
4772 tree def;
4773 gimple def_stmt;
4774 enum vect_def_type dt;
4775 gimple new_phi = NULL;
4776 tree scalar_type;
4777 bool is_simple_use;
4778 gimple orig_stmt;
4779 stmt_vec_info orig_stmt_info;
4780 tree expr = NULL_TREE;
4781 int i;
4782 int ncopies;
4783 int epilog_copies;
4784 stmt_vec_info prev_stmt_info, prev_phi_info;
4785 bool single_defuse_cycle = false;
4786 tree reduc_def = NULL_TREE;
4787 gimple new_stmt = NULL;
4788 int j;
4789 tree ops[3];
4790 bool nested_cycle = false, found_nested_cycle_def = false;
4791 gimple reduc_def_stmt = NULL;
4792 /* By default the reduction variable is the last operand of the statement. */
4793 int reduc_index = 2;
4794 bool double_reduc = false, dummy;
4795 basic_block def_bb;
4796 struct loop * def_stmt_loop, *outer_loop = NULL;
4797 tree def_arg;
4798 gimple def_arg_stmt;
4799 auto_vec<tree> vec_oprnds0;
4800 auto_vec<tree> vec_oprnds1;
4801 auto_vec<tree> vect_defs;
4802 auto_vec<gimple> phis;
4803 int vec_num;
4804 tree def0, def1, tem, op0, op1 = NULL_TREE;
4805
4806 /* In case of reduction chain we switch to the first stmt in the chain, but
4807 we don't update STMT_INFO, since only the last stmt is marked as reduction
4808 and has reduction properties. */
4809 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4810 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4811
/* For a stmt nested in the loop being vectorized, work on the inner loop
   and remember the outer one.  */
4812 if (nested_in_vect_loop_p (loop, stmt))
4813 {
4814 outer_loop = loop;
4815 loop = loop->inner;
4816 nested_cycle = true;
4817 }
4818
4819 /* 1. Is vectorizable reduction? */
4820 /* Not supportable if the reduction variable is used in the loop, unless
4821 it's a reduction chain. */
4822 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4823 && !GROUP_FIRST_ELEMENT (stmt_info))
4824 return false;
4825
4826 /* Reductions that are not used even in an enclosing outer-loop,
4827 are expected to be "live" (used out of the loop). */
4828 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4829 && !STMT_VINFO_LIVE_P (stmt_info))
4830 return false;
4831
4832 /* Make sure it was already recognized as a reduction computation. */
4833 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4834 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4835 return false;
4836
4837 /* 2. Has this been recognized as a reduction pattern?
4838
4839 Check if STMT represents a pattern that has been recognized
4840 in earlier analysis stages. For stmts that represent a pattern,
4841 the STMT_VINFO_RELATED_STMT field records the last stmt in
4842 the original sequence that constitutes the pattern. */
4843
4844 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4845 if (orig_stmt)
4846 {
4847 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4848 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4849 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4850 }
4851
4852 /* 3. Check the operands of the operation. The first operands are defined
4853 inside the loop body. The last operand is the reduction variable,
4854 which is defined by the loop-header-phi. */
4855
4856 gcc_assert (is_gimple_assign (stmt));
4857
4858 /* Flatten RHS. */
4859 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4860 {
4861 case GIMPLE_SINGLE_RHS:
/* A single RHS is only handled when it is a tree with three operands;
   the operands and the code are then taken from that tree.  */
4862 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4863 if (op_type == ternary_op)
4864 {
4865 tree rhs = gimple_assign_rhs1 (stmt);
4866 ops[0] = TREE_OPERAND (rhs, 0);
4867 ops[1] = TREE_OPERAND (rhs, 1);
4868 ops[2] = TREE_OPERAND (rhs, 2);
4869 code = TREE_CODE (rhs);
4870 }
4871 else
4872 return false;
4873 break;
4874
4875 case GIMPLE_BINARY_RHS:
4876 code = gimple_assign_rhs_code (stmt);
4877 op_type = TREE_CODE_LENGTH (code);
4878 gcc_assert (op_type == binary_op);
4879 ops[0] = gimple_assign_rhs1 (stmt);
4880 ops[1] = gimple_assign_rhs2 (stmt);
4881 break;
4882
4883 case GIMPLE_TERNARY_RHS:
4884 code = gimple_assign_rhs_code (stmt);
4885 op_type = TREE_CODE_LENGTH (code);
4886 gcc_assert (op_type == ternary_op);
4887 ops[0] = gimple_assign_rhs1 (stmt);
4888 ops[1] = gimple_assign_rhs2 (stmt);
4889 ops[2] = gimple_assign_rhs3 (stmt);
4890 break;
4891
4892 case GIMPLE_UNARY_RHS:
4893 return false;
4894
4895 default:
4896 gcc_unreachable ();
4897 }
4898
/* A COND_EXPR reduction is rejected when SLP_NODE is given.  */
4899 if (code == COND_EXPR && slp_node)
4900 return false;
4901
4902 scalar_dest = gimple_assign_lhs (stmt);
4903 scalar_type = TREE_TYPE (scalar_dest);
4904 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4905 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4906 return false;
4907
4908 /* Do not try to vectorize bit-precision reductions. */
4909 if ((TYPE_PRECISION (scalar_type)
4910 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4911 return false;
4912
4913 /* All uses but the last are expected to be defined in the loop.
4914 The last use is the reduction variable. In case of nested cycle this
4915 assumption is not true: we use reduc_index to record the index of the
4916 reduction variable. */
4917 for (i = 0; i < op_type - 1; i++)
4918 {
4919 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4920 if (i == 0 && code == COND_EXPR)
4921 continue;
4922
4923 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4924 &def_stmt, &def, &dt, &tem);
/* VECTYPE_IN is taken from the first operand that reports a vectype.  */
4925 if (!vectype_in)
4926 vectype_in = tem;
4927 gcc_assert (is_simple_use);
4928
4929 if (dt != vect_internal_def
4930 && dt != vect_external_def
4931 && dt != vect_constant_def
4932 && dt != vect_induction_def
4933 && !(dt == vect_nested_cycle && nested_cycle))
4934 return false;
4935
4936 if (dt == vect_nested_cycle)
4937 {
4938 found_nested_cycle_def = true;
4939 reduc_def_stmt = def_stmt;
4940 reduc_index = i;
4941 }
4942 }
4943
/* After the loop I equals op_type - 1, i.e. it indexes the last operand.  */
4944 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4945 &def_stmt, &def, &dt, &tem);
4946 if (!vectype_in)
4947 vectype_in = tem;
4948 gcc_assert (is_simple_use);
4949 if (!(dt == vect_reduction_def
4950 || dt == vect_nested_cycle
4951 || ((dt == vect_internal_def || dt == vect_external_def
4952 || dt == vect_constant_def || dt == vect_induction_def)
4953 && nested_cycle && found_nested_cycle_def)))
4954 {
4955 /* For pattern recognized stmts, orig_stmt might be a reduction,
4956 but some helper statements for the pattern might not, or
4957 might be COND_EXPRs with reduction uses in the condition. */
4958 gcc_assert (orig_stmt);
4959 return false;
4960 }
4961 if (!found_nested_cycle_def)
4962 reduc_def_stmt = def_stmt;
4963
4964 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4965 if (orig_stmt)
4966 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4967 reduc_def_stmt,
4968 !nested_cycle,
4969 &dummy));
4970 else
4971 {
4972 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4973 !nested_cycle, &dummy);
4974 /* We changed STMT to be the first stmt in reduction chain, hence we
4975 check that in this case the first element in the chain is STMT. */
4976 gcc_assert (stmt == tmp
4977 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4978 }
4979
4980 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4981 return false;
4982
/* Number of vector copies: one for SLP, otherwise the vectorization
   factor divided by the number of elements of VECTYPE_IN.  */
4983 if (slp_node || PURE_SLP_STMT (stmt_info))
4984 ncopies = 1;
4985 else
4986 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4987 / TYPE_VECTOR_SUBPARTS (vectype_in));
4988
4989 gcc_assert (ncopies >= 1);
4990
4991 vec_mode = TYPE_MODE (vectype_in);
4992
4993 if (code == COND_EXPR)
4994 {
4995 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4996 {
4997 if (dump_enabled_p ())
4998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4999 "unsupported condition in reduction\n");
5000
5001 return false;
5002 }
5003 }
5004 else
5005 {
5006 /* 4. Supportable by target? */
5007
5008 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5009 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5010 {
5011 /* Shifts and rotates are only supported by vectorizable_shifts,
5012 not vectorizable_reduction. */
5013 if (dump_enabled_p ())
5014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5015 "unsupported shift or rotation.\n");
5016 return false;
5017 }
5018
5019 /* 4.1. check support for the operation in the loop */
5020 optab = optab_for_tree_code (code, vectype_in, optab_default);
5021 if (!optab)
5022 {
5023 if (dump_enabled_p ())
5024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5025 "no optab.\n");
5026
5027 return false;
5028 }
5029
5030 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5031 {
5032 if (dump_enabled_p ())
5033 dump_printf (MSG_NOTE, "op not supported by target.\n");
5034
/* Without a handler, continue only if the vector mode is exactly one word
   wide and the vectorization factor reaches the worthwhile minimum.  */
5035 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5036 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5037 < vect_min_worthwhile_factor (code))
5038 return false;
5039
5040 if (dump_enabled_p ())
5041 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5042 }
5043
5044 /* Worthwhile without SIMD support? */
5045 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5046 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5047 < vect_min_worthwhile_factor (code))
5048 {
5049 if (dump_enabled_p ())
5050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5051 "not worthwhile without SIMD support.\n");
5052
5053 return false;
5054 }
5055 }
5056
5057 /* 4.2. Check support for the epilog operation.
5058
5059 If STMT represents a reduction pattern, then the type of the
5060 reduction variable may be different than the type of the rest
5061 of the arguments. For example, consider the case of accumulation
5062 of shorts into an int accumulator; The original code:
5063 S1: int_a = (int) short_a;
5064 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5065
5066 was replaced with:
5067 STMT: int_acc = widen_sum <short_a, int_acc>
5068
5069 This means that:
5070 1. The tree-code that is used to create the vector operation in the
5071 epilog code (that reduces the partial results) is not the
5072 tree-code of STMT, but is rather the tree-code of the original
5073 stmt from the pattern that STMT is replacing. I.e, in the example
5074 above we want to use 'widen_sum' in the loop, but 'plus' in the
5075 epilog.
5076 2. The type (mode) we use to check available target support
5077 for the vector operation to be created in the *epilog*, is
5078 determined by the type of the reduction variable (in the example
5079 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
5080 However the type (mode) we use to check available target support
5081 for the vector operation to be created *inside the loop*, is
5082 determined by the type of the other arguments to STMT (in the
5083 example we'd check this: optab_handler (widen_sum_optab,
5084 vect_short_mode)).
5085
5086 This is contrary to "regular" reductions, in which the types of all
5087 the arguments are the same as the type of the reduction variable.
5088 For "regular" reductions we can therefore use the same vector type
5089 (and also the same tree-code) when generating the epilog code and
5090 when generating the code inside the loop. */
5091
5092 if (orig_stmt)
5093 {
5094 /* This is a reduction pattern: get the vectype from the type of the
5095 reduction variable, and get the tree-code from orig_stmt. */
5096 orig_code = gimple_assign_rhs_code (orig_stmt);
5097 gcc_assert (vectype_out);
5098 vec_mode = TYPE_MODE (vectype_out);
5099 }
5100 else
5101 {
5102 /* Regular reduction: the same vectype and tree-code as used for
5103 the vector code inside the loop can be used for the epilog code. */
5104 orig_code = code;
5105 }
5106
/* In a nested cycle, look at the preheader argument of the reduction PHI:
   if it is defined by a PHI inside the outer loop that is marked as
   vect_double_reduction_def, this is a double reduction.  */
5107 if (nested_cycle)
5108 {
5109 def_bb = gimple_bb (reduc_def_stmt);
5110 def_stmt_loop = def_bb->loop_father;
5111 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5112 loop_preheader_edge (def_stmt_loop));
5113 if (TREE_CODE (def_arg) == SSA_NAME
5114 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5115 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5116 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5117 && vinfo_for_stmt (def_arg_stmt)
5118 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5119 == vect_double_reduction_def)
5120 double_reduc = true;
5121 }
5122
/* EPILOG_REDUC_CODE ends up as ERROR_MARK whenever no reduction code with
   a usable optab handler is found; it is later handed to
   vect_model_reduction_cost and vect_create_epilog_for_reduction.  */
5123 epilog_reduc_code = ERROR_MARK;
5124 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5125 {
5126 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5127 optab_default);
5128 if (!reduc_optab)
5129 {
5130 if (dump_enabled_p ())
5131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5132 "no optab for reduction.\n");
5133
5134 epilog_reduc_code = ERROR_MARK;
5135 }
5136 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5137 {
/* Second chance: try the optab returned by scalar_reduc_to_vector.  */
5138 optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5139 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5140 {
5141 if (dump_enabled_p ())
5142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5143 "reduc op not supported by target.\n");
5144
5145 epilog_reduc_code = ERROR_MARK;
5146 }
5147 }
5148 }
5149 else
5150 {
/* A missing reduction code is only tolerated for a nested cycle that is
   not a double reduction.  */
5151 if (!nested_cycle || double_reduc)
5152 {
5153 if (dump_enabled_p ())
5154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5155 "no reduc code for scalar code.\n");
5156
5157 return false;
5158 }
5159 }
5160
5161 if (double_reduc && ncopies > 1)
5162 {
5163 if (dump_enabled_p ())
5164 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5165 "multiple types in double reduction\n");
5166
5167 return false;
5168 }
5169
5170 /* In case of widening multiplication by a constant, we update the type
5171 of the constant to be the type of the other operand. We check that the
5172 constant fits the type in the pattern recognition pass. */
5173 if (code == DOT_PROD_EXPR
5174 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5175 {
5176 if (TREE_CODE (ops[0]) == INTEGER_CST)
5177 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5178 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5179 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5180 else
5181 {
5182 if (dump_enabled_p ())
5183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5184 "invalid types in dot-prod\n");
5185
5186 return false;
5187 }
5188 }
5189
5190 if (!vec_stmt) /* transformation not required. */
5191 {
5192 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5193 return false;
5194 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5195 return true;
5196 }
5197
5198 /** Transform. **/
5199
5200 if (dump_enabled_p ())
5201 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5202
5203 /* FORNOW: Multiple types are not supported for condition. */
5204 if (code == COND_EXPR)
5205 gcc_assert (ncopies == 1);
5206
5207 /* Create the destination vector */
5208 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5209
5210 /* In case the vectorization factor (VF) is bigger than the number
5211 of elements that we can fit in a vectype (nunits), we have to generate
5212 more than one vector stmt - i.e - we need to "unroll" the
5213 vector stmt by a factor VF/nunits. For more details see documentation
5214 in vectorizable_operation. */
5215
5216 /* If the reduction is used in an outer loop we need to generate
5217 VF intermediate results, like so (e.g. for ncopies=2):
5218 r0 = phi (init, r0)
5219 r1 = phi (init, r1)
5220 r0 = x0 + r0;
5221 r1 = x1 + r1;
5222 (i.e. we generate VF results in 2 registers).
5223 In this case we have a separate def-use cycle for each copy, and therefore
5224 for each copy we get the vector def for the reduction variable from the
5225 respective phi node created for this copy.
5226
5227 Otherwise (the reduction is unused in the loop nest), we can combine
5228 together intermediate results, like so (e.g. for ncopies=2):
5229 r = phi (init, r)
5230 r = x0 + r;
5231 r = x1 + r;
5232 (i.e. we generate VF/2 results in a single register).
5233 In this case for each copy we get the vector def for the reduction variable
5234 from the vectorized reduction operation generated in the previous iteration.
5235 */
5236
5237 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5238 {
5239 single_defuse_cycle = true;
5240 epilog_copies = 1;
5241 }
5242 else
5243 epilog_copies = ncopies;
5244
5245 prev_stmt_info = NULL;
5246 prev_phi_info = NULL;
5247 if (slp_node)
5248 {
5249 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5250 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5251 == TYPE_VECTOR_SUBPARTS (vectype_in));
5252 }
5253 else
5254 {
5255 vec_num = 1;
5256 vec_oprnds0.create (1);
5257 if (op_type == ternary_op)
5258 vec_oprnds1.create (1);
5259 }
5260
5261 phis.create (vec_num);
5262 vect_defs.create (vec_num);
/* Without SLP, VECT_DEFS holds a single slot that is overwritten below.  */
5263 if (!slp_node)
5264 vect_defs.quick_push (NULL_TREE);
5265
/* One iteration per vector copy.  */
5266 for (j = 0; j < ncopies; j++)
5267 {
/* New PHIs are created for every copy, except that with a single def-use
   cycle only the first copy gets them.  */
5268 if (j == 0 || !single_defuse_cycle)
5269 {
5270 for (i = 0; i < vec_num; i++)
5271 {
5272 /* Create the reduction-phi that defines the reduction
5273 operand. */
5274 new_phi = create_phi_node (vec_dest, loop->header);
5275 set_vinfo_for_stmt (new_phi,
5276 new_stmt_vec_info (new_phi, loop_vinfo,
5277 NULL));
5278 if (j == 0 || slp_node)
5279 phis.quick_push (new_phi);
5280 }
5281 }
5282
5283 if (code == COND_EXPR)
5284 {
5285 gcc_assert (!slp_node);
5286 vectorizable_condition (stmt, gsi, vec_stmt,
5287 PHI_RESULT (phis[0]),
5288 reduc_index, NULL);
5289 /* Multiple types are not supported for condition. */
5290 break;
5291 }
5292
5293 /* Handle uses. */
5294 if (j == 0)
5295 {
/* Pick the operand(s) to vectorize depending on where the reduction
   operand sits (REDUC_INDEX).  */
5296 op0 = ops[!reduc_index];
5297 if (op_type == ternary_op)
5298 {
5299 if (reduc_index == 0)
5300 op1 = ops[2];
5301 else
5302 op1 = ops[1];
5303 }
5304
5305 if (slp_node)
5306 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5307 slp_node, -1);
5308 else
5309 {
5310 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5311 stmt, NULL);
5312 vec_oprnds0.quick_push (loop_vec_def0);
5313 if (op_type == ternary_op)
5314 {
5315 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5316 NULL);
5317 vec_oprnds1.quick_push (loop_vec_def1);
5318 }
5319 }
5320 }
5321 else
5322 {
5323 if (!slp_node)
5324 {
/* For later copies, derive each operand def from the previous copy's def.  */
5325 enum vect_def_type dt;
5326 gimple dummy_stmt;
5327 tree dummy;
5328
5329 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5330 &dummy_stmt, &dummy, &dt);
5331 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5332 loop_vec_def0);
5333 vec_oprnds0[0] = loop_vec_def0;
5334 if (op_type == ternary_op)
5335 {
5336 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5337 &dummy, &dt);
5338 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5339 loop_vec_def1);
5340 vec_oprnds1[0] = loop_vec_def1;
5341 }
5342 }
5343
/* With a single def-use cycle, chain on the result of the statement
   generated for the previous copy instead of on a new PHI.  */
5344 if (single_defuse_cycle)
5345 reduc_def = gimple_assign_lhs (new_stmt);
5346
5347 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5348 }
5349
5350 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5351 {
5352 if (slp_node)
5353 reduc_def = PHI_RESULT (phis[i]);
5354 else
5355 {
5356 if (!single_defuse_cycle || j == 0)
5357 reduc_def = PHI_RESULT (new_phi);
5358 }
5359
/* Build the vector expression with REDUC_DEF placed at operand position
   REDUC_INDEX.  */
5360 def1 = ((op_type == ternary_op)
5361 ? vec_oprnds1[i] : NULL);
5362 if (op_type == binary_op)
5363 {
5364 if (reduc_index == 0)
5365 expr = build2 (code, vectype_out, reduc_def, def0);
5366 else
5367 expr = build2 (code, vectype_out, def0, reduc_def);
5368 }
5369 else
5370 {
5371 if (reduc_index == 0)
5372 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5373 else
5374 {
5375 if (reduc_index == 1)
5376 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5377 else
5378 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5379 }
5380 }
5381
5382 new_stmt = gimple_build_assign (vec_dest, expr);
5383 new_temp = make_ssa_name (vec_dest, new_stmt);
5384 gimple_assign_set_lhs (new_stmt, new_temp);
5385 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5386
5387 if (slp_node)
5388 {
5389 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5390 vect_defs.quick_push (new_temp);
5391 }
5392 else
5393 vect_defs[0] = new_temp;
5394 }
5395
5396 if (slp_node)
5397 continue;
5398
/* Chain the copies: the first one is recorded as the vectorized stmt,
   later ones are linked through STMT_VINFO_RELATED_STMT of the previous
   copy.  */
5399 if (j == 0)
5400 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5401 else
5402 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5403
5404 prev_stmt_info = vinfo_for_stmt (new_stmt);
5405 prev_phi_info = vinfo_for_stmt (new_phi);
5406 }
5407
5408 /* Finalize the reduction-phi (set its arguments) and create the
5409 epilog reduction code. */
5410 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5411 {
5412 new_temp = gimple_assign_lhs (*vec_stmt);
5413 vect_defs[0] = new_temp;
5414 }
5415
5416 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5417 epilog_reduc_code, phis, reduc_index,
5418 double_reduc, slp_node);
5419
5420 return true;
5421 }
5422
5423 /* Function vect_min_worthwhile_factor.
5424
5425 For a loop where we could vectorize the operation indicated by CODE,
5426 return the minimum vectorization factor that makes it worthwhile
5427 to use generic vectors. */
5428 int
5429 vect_min_worthwhile_factor (enum tree_code code)
5430 {
5431 switch (code)
5432 {
5433 case PLUS_EXPR:
5434 case MINUS_EXPR:
5435 case NEGATE_EXPR:
5436 return 4;
5437
5438 case BIT_AND_EXPR:
5439 case BIT_IOR_EXPR:
5440 case BIT_XOR_EXPR:
5441 case BIT_NOT_EXPR:
5442 return 2;
5443
5444 default:
5445 return INT_MAX;
5446 }
5447 }
5448
5449
5450 /* Function vectorizable_induction
5451
5452 Check if PHI performs an induction computation that can be vectorized.
5453 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5454 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5455 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5456
5457 bool
5458 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5459 gimple *vec_stmt)
5460 {
5461 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5462 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5463 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5464 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5465 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5466 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5467 tree vec_def;
5468
5469 gcc_assert (ncopies >= 1);
5470 /* FORNOW. These restrictions should be relaxed. */
5471 if (nested_in_vect_loop_p (loop, phi))
5472 {
5473 imm_use_iterator imm_iter;
5474 use_operand_p use_p;
5475 gimple exit_phi;
5476 edge latch_e;
5477 tree loop_arg;
5478
5479 if (ncopies > 1)
5480 {
5481 if (dump_enabled_p ())
5482 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5483 "multiple types in nested loop.\n");
5484 return false;
5485 }
5486
5487 exit_phi = NULL;
5488 latch_e = loop_latch_edge (loop->inner);
5489 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5490 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5491 {
5492 gimple use_stmt = USE_STMT (use_p);
5493 if (is_gimple_debug (use_stmt))
5494 continue;
5495
5496 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5497 {
5498 exit_phi = use_stmt;
5499 break;
5500 }
5501 }
5502 if (exit_phi)
5503 {
5504 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5505 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5506 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5507 {
5508 if (dump_enabled_p ())
5509 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5510 "inner-loop induction only used outside "
5511 "of the outer vectorized loop.\n");
5512 return false;
5513 }
5514 }
5515 }
5516
5517 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5518 return false;
5519
5520 /* FORNOW: SLP not supported. */
5521 if (STMT_SLP_TYPE (stmt_info))
5522 return false;
5523
5524 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5525
5526 if (gimple_code (phi) != GIMPLE_PHI)
5527 return false;
5528
5529 if (!vec_stmt) /* transformation not required. */
5530 {
5531 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5532 if (dump_enabled_p ())
5533 dump_printf_loc (MSG_NOTE, vect_location,
5534 "=== vectorizable_induction ===\n");
5535 vect_model_induction_cost (stmt_info, ncopies);
5536 return true;
5537 }
5538
5539 /** Transform. **/
5540
5541 if (dump_enabled_p ())
5542 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5543
5544 vec_def = get_initial_def_for_induction (phi);
5545 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5546 return true;
5547 }
5548
5549 /* Function vectorizable_live_operation.
5550
5551 STMT computes a value that is used outside the loop. Check if
5552 it can be supported. */
5553
5554 bool
5555 vectorizable_live_operation (gimple stmt,
5556 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5557 gimple *vec_stmt)
5558 {
5559 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5560 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5561 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5562 int i;
5563 int op_type;
5564 tree op;
5565 tree def;
5566 gimple def_stmt;
5567 enum vect_def_type dt;
5568 enum tree_code code;
5569 enum gimple_rhs_class rhs_class;
5570
5571 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5572
5573 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5574 return false;
5575
5576 if (!is_gimple_assign (stmt))
5577 {
5578 if (gimple_call_internal_p (stmt)
5579 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5580 && gimple_call_lhs (stmt)
5581 && loop->simduid
5582 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5583 && loop->simduid
5584 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5585 {
5586 edge e = single_exit (loop);
5587 basic_block merge_bb = e->dest;
5588 imm_use_iterator imm_iter;
5589 use_operand_p use_p;
5590 tree lhs = gimple_call_lhs (stmt);
5591
5592 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5593 {
5594 gimple use_stmt = USE_STMT (use_p);
5595 if (gimple_code (use_stmt) == GIMPLE_PHI
5596 && gimple_bb (use_stmt) == merge_bb)
5597 {
5598 if (vec_stmt)
5599 {
5600 tree vfm1
5601 = build_int_cst (unsigned_type_node,
5602 loop_vinfo->vectorization_factor - 1);
5603 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5604 }
5605 return true;
5606 }
5607 }
5608 }
5609
5610 return false;
5611 }
5612
5613 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5614 return false;
5615
5616 /* FORNOW. CHECKME. */
5617 if (nested_in_vect_loop_p (loop, stmt))
5618 return false;
5619
5620 code = gimple_assign_rhs_code (stmt);
5621 op_type = TREE_CODE_LENGTH (code);
5622 rhs_class = get_gimple_rhs_class (code);
5623 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5624 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5625
5626 /* FORNOW: support only if all uses are invariant. This means
5627 that the scalar operations can remain in place, unvectorized.
5628 The original last scalar value that they compute will be used. */
5629
5630 for (i = 0; i < op_type; i++)
5631 {
5632 if (rhs_class == GIMPLE_SINGLE_RHS)
5633 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5634 else
5635 op = gimple_op (stmt, i + 1);
5636 if (op
5637 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5638 &dt))
5639 {
5640 if (dump_enabled_p ())
5641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5642 "use not simple.\n");
5643 return false;
5644 }
5645
5646 if (dt != vect_external_def && dt != vect_constant_def)
5647 return false;
5648 }
5649
5650 /* No transformation is required for the cases we currently support. */
5651 return true;
5652 }
5653
5654 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5655
5656 static void
5657 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5658 {
5659 ssa_op_iter op_iter;
5660 imm_use_iterator imm_iter;
5661 def_operand_p def_p;
5662 gimple ustmt;
5663
5664 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5665 {
5666 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5667 {
5668 basic_block bb;
5669
5670 if (!is_gimple_debug (ustmt))
5671 continue;
5672
5673 bb = gimple_bb (ustmt);
5674
5675 if (!flow_bb_inside_loop_p (loop, bb))
5676 {
5677 if (gimple_debug_bind_p (ustmt))
5678 {
5679 if (dump_enabled_p ())
5680 dump_printf_loc (MSG_NOTE, vect_location,
5681 "killing debug use\n");
5682
5683 gimple_debug_bind_reset_value (ustmt);
5684 update_stmt (ustmt);
5685 }
5686 else
5687 gcc_unreachable ();
5688 }
5689 }
5690 }
5691 }
5692
5693
5694 /* This function builds ni_name = number of iterations. Statements
5695 are emitted on the loop preheader edge. */
5696
5697 static tree
5698 vect_build_loop_niters (loop_vec_info loop_vinfo)
5699 {
5700 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5701 if (TREE_CODE (ni) == INTEGER_CST)
5702 return ni;
5703 else
5704 {
5705 tree ni_name, var;
5706 gimple_seq stmts = NULL;
5707 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5708
5709 var = create_tmp_var (TREE_TYPE (ni), "niters");
5710 ni_name = force_gimple_operand (ni, &stmts, false, var);
5711 if (stmts)
5712 gsi_insert_seq_on_edge_immediate (pe, stmts);
5713
5714 return ni_name;
5715 }
5716 }
5717
5718
5719 /* This function generates the following statements:
5720
5721 ni_name = number of iterations loop executes
5722 ratio = ni_name / vf
5723 ratio_mult_vf_name = ratio * vf
5724
5725 and places them on the loop preheader edge. */
5726
5727 static void
5728 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5729 tree ni_name,
5730 tree *ratio_mult_vf_name_ptr,
5731 tree *ratio_name_ptr)
5732 {
5733 tree ni_minus_gap_name;
5734 tree var;
5735 tree ratio_name;
5736 tree ratio_mult_vf_name;
5737 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5738 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5739 tree log_vf;
5740
5741 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5742
5743 /* If epilogue loop is required because of data accesses with gaps, we
5744 subtract one iteration from the total number of iterations here for
5745 correct calculation of RATIO. */
5746 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5747 {
5748 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5749 ni_name,
5750 build_one_cst (TREE_TYPE (ni_name)));
5751 if (!is_gimple_val (ni_minus_gap_name))
5752 {
5753 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5754 gimple stmts = NULL;
5755 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5756 true, var);
5757 gsi_insert_seq_on_edge_immediate (pe, stmts);
5758 }
5759 }
5760 else
5761 ni_minus_gap_name = ni_name;
5762
5763 /* Create: ratio = ni >> log2(vf) */
5764 /* ??? As we have ni == number of latch executions + 1, ni could
5765 have overflown to zero. So avoid computing ratio based on ni
5766 but compute it using the fact that we know ratio will be at least
5767 one, thus via (ni - vf) >> log2(vf) + 1. */
5768 ratio_name
5769 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5770 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5771 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5772 ni_minus_gap_name,
5773 build_int_cst
5774 (TREE_TYPE (ni_name), vf)),
5775 log_vf),
5776 build_int_cst (TREE_TYPE (ni_name), 1));
5777 if (!is_gimple_val (ratio_name))
5778 {
5779 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5780 gimple stmts = NULL;
5781 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5782 gsi_insert_seq_on_edge_immediate (pe, stmts);
5783 }
5784 *ratio_name_ptr = ratio_name;
5785
5786 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5787
5788 if (ratio_mult_vf_name_ptr)
5789 {
5790 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5791 ratio_name, log_vf);
5792 if (!is_gimple_val (ratio_mult_vf_name))
5793 {
5794 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5795 gimple stmts = NULL;
5796 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5797 true, var);
5798 gsi_insert_seq_on_edge_immediate (pe, stmts);
5799 }
5800 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5801 }
5802
5803 return;
5804 }
5805
5806
5807 /* Function vect_transform_loop.
5808
5809 The analysis phase has determined that the loop is vectorizable.
5810 Vectorize the loop - create vectorized stmts to replace the scalar
5811 stmts in the loop, and update the loop exit condition. */
5812
5813 void
5814 vect_transform_loop (loop_vec_info loop_vinfo)
5815 {
5816 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5817 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5818 int nbbs = loop->num_nodes;
5819 gimple_stmt_iterator si;
5820 int i;
5821 tree ratio = NULL;
5822 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5823 bool grouped_store;
5824 bool slp_scheduled = false;
5825 gimple stmt, pattern_stmt;
5826 gimple_seq pattern_def_seq = NULL;
5827 gimple_stmt_iterator pattern_def_si = gsi_none ();
5828 bool transform_pattern_stmt = false;
5829 bool check_profitability = false;
5830 int th;
5831 /* Record number of iterations before we started tampering with the profile. */
5832 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5833
5834 if (dump_enabled_p ())
5835 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5836
5837 /* If profile is inprecise, we have chance to fix it up. */
5838 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5839 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5840
5841 /* Use the more conservative vectorization threshold. If the number
5842 of iterations is constant assume the cost check has been performed
5843 by our caller. If the threshold makes all loops profitable that
5844 run at least the vectorization factor number of times checking
5845 is pointless, too. */
5846 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5847 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5848 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5849 {
5850 if (dump_enabled_p ())
5851 dump_printf_loc (MSG_NOTE, vect_location,
5852 "Profitability threshold is %d loop iterations.\n",
5853 th);
5854 check_profitability = true;
5855 }
5856
5857 /* Version the loop first, if required, so the profitability check
5858 comes first. */
5859
5860 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5861 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5862 {
5863 vect_loop_versioning (loop_vinfo, th, check_profitability);
5864 check_profitability = false;
5865 }
5866
5867 tree ni_name = vect_build_loop_niters (loop_vinfo);
5868 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5869
5870 /* Peel the loop if there are data refs with unknown alignment.
5871 Only one data ref with unknown store is allowed. */
5872
5873 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5874 {
5875 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5876 th, check_profitability);
5877 check_profitability = false;
5878 /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5879 be re-computed. */
5880 ni_name = NULL_TREE;
5881 }
5882
5883 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5884 compile time constant), or it is a constant that doesn't divide by the
5885 vectorization factor, then an epilog loop needs to be created.
5886 We therefore duplicate the loop: the original loop will be vectorized,
5887 and will compute the first (n/VF) iterations. The second copy of the loop
5888 will remain scalar and will compute the remaining (n%VF) iterations.
5889 (VF is the vectorization factor). */
5890
5891 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5892 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5893 {
5894 tree ratio_mult_vf;
5895 if (!ni_name)
5896 ni_name = vect_build_loop_niters (loop_vinfo);
5897 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5898 &ratio);
5899 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5900 th, check_profitability);
5901 }
5902 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5903 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5904 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5905 else
5906 {
5907 if (!ni_name)
5908 ni_name = vect_build_loop_niters (loop_vinfo);
5909 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5910 }
5911
5912 /* 1) Make sure the loop header has exactly two entries
5913 2) Make sure we have a preheader basic block. */
5914
5915 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5916
5917 split_edge (loop_preheader_edge (loop));
5918
5919 /* FORNOW: the vectorizer supports only loops which body consist
5920 of one basic block (header + empty latch). When the vectorizer will
5921 support more involved loop forms, the order by which the BBs are
5922 traversed need to be reconsidered. */
5923
5924 for (i = 0; i < nbbs; i++)
5925 {
5926 basic_block bb = bbs[i];
5927 stmt_vec_info stmt_info;
5928 gimple phi;
5929
5930 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5931 {
5932 phi = gsi_stmt (si);
5933 if (dump_enabled_p ())
5934 {
5935 dump_printf_loc (MSG_NOTE, vect_location,
5936 "------>vectorizing phi: ");
5937 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5938 dump_printf (MSG_NOTE, "\n");
5939 }
5940 stmt_info = vinfo_for_stmt (phi);
5941 if (!stmt_info)
5942 continue;
5943
5944 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5945 vect_loop_kill_debug_uses (loop, phi);
5946
5947 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5948 && !STMT_VINFO_LIVE_P (stmt_info))
5949 continue;
5950
5951 if (STMT_VINFO_VECTYPE (stmt_info)
5952 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5953 != (unsigned HOST_WIDE_INT) vectorization_factor)
5954 && dump_enabled_p ())
5955 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
5956
5957 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5958 {
5959 if (dump_enabled_p ())
5960 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
5961 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5962 }
5963 }
5964
5965 pattern_stmt = NULL;
5966 for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5967 {
5968 bool is_store;
5969
5970 if (transform_pattern_stmt)
5971 stmt = pattern_stmt;
5972 else
5973 {
5974 stmt = gsi_stmt (si);
5975 /* During vectorization remove existing clobber stmts. */
5976 if (gimple_clobber_p (stmt))
5977 {
5978 unlink_stmt_vdef (stmt);
5979 gsi_remove (&si, true);
5980 release_defs (stmt);
5981 continue;
5982 }
5983 }
5984
5985 if (dump_enabled_p ())
5986 {
5987 dump_printf_loc (MSG_NOTE, vect_location,
5988 "------>vectorizing statement: ");
5989 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5990 dump_printf (MSG_NOTE, "\n");
5991 }
5992
5993 stmt_info = vinfo_for_stmt (stmt);
5994
5995 /* vector stmts created in the outer-loop during vectorization of
5996 stmts in an inner-loop may not have a stmt_info, and do not
5997 need to be vectorized. */
5998 if (!stmt_info)
5999 {
6000 gsi_next (&si);
6001 continue;
6002 }
6003
6004 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6005 vect_loop_kill_debug_uses (loop, stmt);
6006
6007 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6008 && !STMT_VINFO_LIVE_P (stmt_info))
6009 {
6010 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6011 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6012 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6013 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6014 {
6015 stmt = pattern_stmt;
6016 stmt_info = vinfo_for_stmt (stmt);
6017 }
6018 else
6019 {
6020 gsi_next (&si);
6021 continue;
6022 }
6023 }
6024 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6025 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6026 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6027 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6028 transform_pattern_stmt = true;
6029
6030 /* If pattern statement has def stmts, vectorize them too. */
6031 if (is_pattern_stmt_p (stmt_info))
6032 {
6033 if (pattern_def_seq == NULL)
6034 {
6035 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6036 pattern_def_si = gsi_start (pattern_def_seq);
6037 }
6038 else if (!gsi_end_p (pattern_def_si))
6039 gsi_next (&pattern_def_si);
6040 if (pattern_def_seq != NULL)
6041 {
6042 gimple pattern_def_stmt = NULL;
6043 stmt_vec_info pattern_def_stmt_info = NULL;
6044
6045 while (!gsi_end_p (pattern_def_si))
6046 {
6047 pattern_def_stmt = gsi_stmt (pattern_def_si);
6048 pattern_def_stmt_info
6049 = vinfo_for_stmt (pattern_def_stmt);
6050 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6051 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6052 break;
6053 gsi_next (&pattern_def_si);
6054 }
6055
6056 if (!gsi_end_p (pattern_def_si))
6057 {
6058 if (dump_enabled_p ())
6059 {
6060 dump_printf_loc (MSG_NOTE, vect_location,
6061 "==> vectorizing pattern def "
6062 "stmt: ");
6063 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6064 pattern_def_stmt, 0);
6065 dump_printf (MSG_NOTE, "\n");
6066 }
6067
6068 stmt = pattern_def_stmt;
6069 stmt_info = pattern_def_stmt_info;
6070 }
6071 else
6072 {
6073 pattern_def_si = gsi_none ();
6074 transform_pattern_stmt = false;
6075 }
6076 }
6077 else
6078 transform_pattern_stmt = false;
6079 }
6080
6081 if (STMT_VINFO_VECTYPE (stmt_info))
6082 {
6083 unsigned int nunits
6084 = (unsigned int)
6085 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6086 if (!STMT_SLP_TYPE (stmt_info)
6087 && nunits != (unsigned int) vectorization_factor
6088 && dump_enabled_p ())
6089 /* For SLP VF is set according to unrolling factor, and not
6090 to vector size, hence for SLP this print is not valid. */
6091 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6092 }
6093
6094 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6095 reached. */
6096 if (STMT_SLP_TYPE (stmt_info))
6097 {
6098 if (!slp_scheduled)
6099 {
6100 slp_scheduled = true;
6101
6102 if (dump_enabled_p ())
6103 dump_printf_loc (MSG_NOTE, vect_location,
6104 "=== scheduling SLP instances ===\n");
6105
6106 vect_schedule_slp (loop_vinfo, NULL);
6107 }
6108
6109 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6110 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6111 {
6112 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6113 {
6114 pattern_def_seq = NULL;
6115 gsi_next (&si);
6116 }
6117 continue;
6118 }
6119 }
6120
6121 /* -------- vectorize statement ------------ */
6122 if (dump_enabled_p ())
6123 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6124
6125 grouped_store = false;
6126 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6127 if (is_store)
6128 {
6129 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6130 {
6131 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6132 interleaving chain was completed - free all the stores in
6133 the chain. */
6134 gsi_next (&si);
6135 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6136 }
6137 else
6138 {
6139 /* Free the attached stmt_vec_info and remove the stmt. */
6140 gimple store = gsi_stmt (si);
6141 free_stmt_vec_info (store);
6142 unlink_stmt_vdef (store);
6143 gsi_remove (&si, true);
6144 release_defs (store);
6145 }
6146
6147 /* Stores can only appear at the end of pattern statements. */
6148 gcc_assert (!transform_pattern_stmt);
6149 pattern_def_seq = NULL;
6150 }
6151 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6152 {
6153 pattern_def_seq = NULL;
6154 gsi_next (&si);
6155 }
6156 } /* stmts in BB */
6157 } /* BBs in loop */
6158
6159 slpeel_make_loop_iterate_ntimes (loop, ratio);
6160
6161 /* Reduce loop iterations by the vectorization factor. */
6162 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6163 expected_iterations / vectorization_factor);
6164 loop->nb_iterations_upper_bound
6165 = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6166 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6167 && loop->nb_iterations_upper_bound != 0)
6168 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6169 if (loop->any_estimate)
6170 {
6171 loop->nb_iterations_estimate
6172 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6173 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6174 && loop->nb_iterations_estimate != 0)
6175 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6176 }
6177
6178 if (dump_enabled_p ())
6179 {
6180 dump_printf_loc (MSG_NOTE, vect_location,
6181 "LOOP VECTORIZED\n");
6182 if (loop->inner)
6183 dump_printf_loc (MSG_NOTE, vect_location,
6184 "OUTER LOOP VECTORIZED\n");
6185 dump_printf (MSG_NOTE, "\n");
6186 }
6187 }