1 /* Loop Vectorization
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "cfgloop.h"
45 #include "params.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50
51 /* Loop Vectorization Pass.
52
53 This pass tries to vectorize loops.
54
55 For example, the vectorizer transforms the following simple loop:
56
57 short a[N]; short b[N]; short c[N]; int i;
58
59 for (i=0; i<N; i++){
60 a[i] = b[i] + c[i];
61 }
62
63 as if it were manually vectorized by rewriting the source code into:
64
65 typedef int __attribute__((mode(V8HI))) v8hi;
66 short a[N]; short b[N]; short c[N]; int i;
67 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
68 v8hi va, vb, vc;
69
70 for (i=0; i<N/8; i++){
71 vb = pb[i];
72 vc = pc[i];
73 va = vb + vc;
74 pa[i] = va;
75 }
76
77 The main entry to this pass is vectorize_loops(), in which
78 the vectorizer applies a set of analyses on a given set of loops,
79 followed by the actual vectorization transformation for the loops that
80 had successfully passed the analysis phase.
81 Throughout this pass we make a distinction between two types of
82 data: scalars (which are represented by SSA_NAMES), and memory references
83 ("data-refs"). These two types of data require different handling both
84 during analysis and transformation. The types of data-refs that the
85 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
86 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
87 accesses are required to have a simple (consecutive) access pattern.
88
89 Analysis phase:
90 ===============
91 The driver for the analysis phase is vect_analyze_loop().
92 It applies a set of analyses, some of which rely on the scalar evolution
93 analyzer (scev) developed by Sebastian Pop.
94
95 During the analysis phase the vectorizer records some information
96 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
97 loop, as well as general information about the loop as a whole, which is
98 recorded in a "loop_vec_info" struct attached to each loop.
99
100 Transformation phase:
101 =====================
102 The loop transformation phase scans all the stmts in the loop, and
103 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
104 the loop that needs to be vectorized. It inserts the vector code sequence
105 just before the scalar stmt S, and records a pointer to the vector code
106 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
107 attached to S). This pointer will be used for the vectorization of following
108 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
109 otherwise, we rely on dead code elimination for removing it.
110
111 For example, say stmt S1 was vectorized into stmt VS1:
112
113 VS1: vb = px[i];
114 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
115 S2: a = b;
116
117 To vectorize stmt S2, the vectorizer first finds the stmt that defines
118 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
119 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
120 resulting sequence would be:
121
122 VS1: vb = px[i];
123 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
124 VS2: va = vb;
125 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
126
127 Operands that are not SSA_NAMEs, are data-refs that appear in
128 load/store operations (like 'x[i]' in S1), and are handled differently.
129
130 Target modeling:
131 =================
132 Currently the only target specific information that is used is the
133 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
134 Targets that can support different sizes of vectors will, for now, need
135 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
136 flexibility will be added in the future.
137
138 Since we only vectorize operations whose vector form can be
139 expressed using existing tree codes, to verify that an operation is
140 supported, the vectorizer checks the relevant optab at the relevant
141 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
142 the value found is CODE_FOR_nothing, then there's no target support, and
143 we can't vectorize the stmt.
144
145 For additional information on this project see:
146 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
147 */
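/* Illustrative sketch (an editorial addition, not part of the pass itself):
   the target support check described above amounts to

     if (optab_handler (add_optab, TYPE_MODE (vectype)) == CODE_FOR_nothing)
       ... not vectorizable: no target instruction for this vector add ...

   where optab_handler and CODE_FOR_nothing belong to the existing optab
   query interface mentioned above, and "vectype" stands for the vector type
   chosen for the statement under analysis.  */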
148
149 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
150
151 /* Function vect_determine_vectorization_factor
152
153 Determine the vectorization factor (VF). VF is the number of data elements
154 that are operated upon in parallel in a single iteration of the vectorized
155 loop. For example, when vectorizing a loop that operates on 4-byte elements,
156 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
157 elements can fit in a single vector register.
158
159 We currently support vectorization of loops in which all types operated upon
160 are of the same size. Therefore this function currently sets VF according to
161 the size of the types operated upon, and fails if there are multiple sizes
162 in the loop.
163
164 VF is also the factor by which the loop iterations are strip-mined, e.g.:
165 original loop:
166 for (i=0; i<N; i++){
167 a[i] = b[i] + c[i];
168 }
169
170 vectorized loop:
171 for (i=0; i<N; i+=VF){
172 a[i:VF] = b[i:VF] + c[i:VF];
173 }
174 */
175
176 static bool
177 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
178 {
179 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
180 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
181 unsigned nbbs = loop->num_nodes;
182 unsigned int vectorization_factor = 0;
183 tree scalar_type;
184 gphi *phi;
185 tree vectype;
186 unsigned int nunits;
187 stmt_vec_info stmt_info;
188 unsigned i;
189 HOST_WIDE_INT dummy;
190 gimple *stmt, *pattern_stmt = NULL;
191 gimple_seq pattern_def_seq = NULL;
192 gimple_stmt_iterator pattern_def_si = gsi_none ();
193 bool analyze_pattern_stmt = false;
194 bool bool_result;
195 auto_vec<stmt_vec_info> mask_producers;
196
197 if (dump_enabled_p ())
198 dump_printf_loc (MSG_NOTE, vect_location,
199 "=== vect_determine_vectorization_factor ===\n");
200
201 for (i = 0; i < nbbs; i++)
202 {
203 basic_block bb = bbs[i];
204
205 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
206 gsi_next (&si))
207 {
208 phi = si.phi ();
209 stmt_info = vinfo_for_stmt (phi);
210 if (dump_enabled_p ())
211 {
212 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
213 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
214 dump_printf (MSG_NOTE, "\n");
215 }
216
217 gcc_assert (stmt_info);
218
219 if (STMT_VINFO_RELEVANT_P (stmt_info))
220 {
221 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
222 scalar_type = TREE_TYPE (PHI_RESULT (phi));
223
224 if (dump_enabled_p ())
225 {
226 dump_printf_loc (MSG_NOTE, vect_location,
227 "get vectype for scalar type: ");
228 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
229 dump_printf (MSG_NOTE, "\n");
230 }
231
232 vectype = get_vectype_for_scalar_type (scalar_type);
233 if (!vectype)
234 {
235 if (dump_enabled_p ())
236 {
237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
238 "not vectorized: unsupported "
239 "data-type ");
240 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
241 scalar_type);
242 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
243 }
244 return false;
245 }
246 STMT_VINFO_VECTYPE (stmt_info) = vectype;
247
248 if (dump_enabled_p ())
249 {
250 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
251 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
252 dump_printf (MSG_NOTE, "\n");
253 }
254
255 nunits = TYPE_VECTOR_SUBPARTS (vectype);
256 if (dump_enabled_p ())
257 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
258 nunits);
259
260 if (!vectorization_factor
261 || (nunits > vectorization_factor))
262 vectorization_factor = nunits;
263 }
264 }
265
266 for (gimple_stmt_iterator si = gsi_start_bb (bb);
267 !gsi_end_p (si) || analyze_pattern_stmt;)
268 {
269 tree vf_vectype;
270
271 if (analyze_pattern_stmt)
272 stmt = pattern_stmt;
273 else
274 stmt = gsi_stmt (si);
275
276 stmt_info = vinfo_for_stmt (stmt);
277
278 if (dump_enabled_p ())
279 {
280 dump_printf_loc (MSG_NOTE, vect_location,
281 "==> examining statement: ");
282 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
283 dump_printf (MSG_NOTE, "\n");
284 }
285
286 gcc_assert (stmt_info);
287
288 /* Skip stmts which do not need to be vectorized. */
289 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
290 && !STMT_VINFO_LIVE_P (stmt_info))
291 || gimple_clobber_p (stmt))
292 {
293 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
294 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
295 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
296 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
297 {
298 stmt = pattern_stmt;
299 stmt_info = vinfo_for_stmt (pattern_stmt);
300 if (dump_enabled_p ())
301 {
302 dump_printf_loc (MSG_NOTE, vect_location,
303 "==> examining pattern statement: ");
304 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
305 dump_printf (MSG_NOTE, "\n");
306 }
307 }
308 else
309 {
310 if (dump_enabled_p ())
311 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
312 gsi_next (&si);
313 continue;
314 }
315 }
316 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
317 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
318 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
319 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
320 analyze_pattern_stmt = true;
321
322 /* If a pattern statement has def stmts, analyze them too. */
323 if (is_pattern_stmt_p (stmt_info))
324 {
325 if (pattern_def_seq == NULL)
326 {
327 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
328 pattern_def_si = gsi_start (pattern_def_seq);
329 }
330 else if (!gsi_end_p (pattern_def_si))
331 gsi_next (&pattern_def_si);
332 if (pattern_def_seq != NULL)
333 {
334 gimple *pattern_def_stmt = NULL;
335 stmt_vec_info pattern_def_stmt_info = NULL;
336
337 while (!gsi_end_p (pattern_def_si))
338 {
339 pattern_def_stmt = gsi_stmt (pattern_def_si);
340 pattern_def_stmt_info
341 = vinfo_for_stmt (pattern_def_stmt);
342 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
343 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
344 break;
345 gsi_next (&pattern_def_si);
346 }
347
348 if (!gsi_end_p (pattern_def_si))
349 {
350 if (dump_enabled_p ())
351 {
352 dump_printf_loc (MSG_NOTE, vect_location,
353 "==> examining pattern def stmt: ");
354 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
355 pattern_def_stmt, 0);
356 dump_printf (MSG_NOTE, "\n");
357 }
358
359 stmt = pattern_def_stmt;
360 stmt_info = pattern_def_stmt_info;
361 }
362 else
363 {
364 pattern_def_si = gsi_none ();
365 analyze_pattern_stmt = false;
366 }
367 }
368 else
369 analyze_pattern_stmt = false;
370 }
371
372 if (gimple_get_lhs (stmt) == NULL_TREE
373 /* MASK_STORE has no lhs, but is ok. */
374 && (!is_gimple_call (stmt)
375 || !gimple_call_internal_p (stmt)
376 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
377 {
378 if (is_gimple_call (stmt))
379 {
380 /* Ignore calls with no lhs. These must be calls to
381 #pragma omp simd functions, and the vectorization factor
382 they really need can't be determined until
383 vectorizable_simd_clone_call. */
384 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
385 {
386 pattern_def_seq = NULL;
387 gsi_next (&si);
388 }
389 continue;
390 }
391 if (dump_enabled_p ())
392 {
393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
394 "not vectorized: irregular stmt.");
395 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
396 0);
397 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
398 }
399 return false;
400 }
401
402 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
403 {
404 if (dump_enabled_p ())
405 {
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "not vectorized: vector stmt in loop:");
408 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
409 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
410 }
411 return false;
412 }
413
414 bool_result = false;
415
416 if (STMT_VINFO_VECTYPE (stmt_info))
417 {
418 /* The only case when a vectype has already been set is for stmts
419 that contain a dataref, or for "pattern-stmts" (stmts
420 generated by the vectorizer to represent/replace a certain
421 idiom). */
422 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
423 || is_pattern_stmt_p (stmt_info)
424 || !gsi_end_p (pattern_def_si));
425 vectype = STMT_VINFO_VECTYPE (stmt_info);
426 }
427 else
428 {
429 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
430 if (is_gimple_call (stmt)
431 && gimple_call_internal_p (stmt)
432 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
433 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
434 else
435 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
436
437 /* Bool ops don't participate in vectorization factor
438 computation. For comparisons, use the compared types to
439 compute a factor. */
440 if (TREE_CODE (scalar_type) == BOOLEAN_TYPE)
441 {
442 mask_producers.safe_push (stmt_info);
443 bool_result = true;
444
445 if (gimple_code (stmt) == GIMPLE_ASSIGN
446 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
447 == tcc_comparison
448 && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt)))
449 != BOOLEAN_TYPE)
450 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
451 else
452 {
453 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
454 {
455 pattern_def_seq = NULL;
456 gsi_next (&si);
457 }
458 continue;
459 }
460 }
461
462 if (dump_enabled_p ())
463 {
464 dump_printf_loc (MSG_NOTE, vect_location,
465 "get vectype for scalar type: ");
466 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
467 dump_printf (MSG_NOTE, "\n");
468 }
469 vectype = get_vectype_for_scalar_type (scalar_type);
470 if (!vectype)
471 {
472 if (dump_enabled_p ())
473 {
474 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
475 "not vectorized: unsupported "
476 "data-type ");
477 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
478 scalar_type);
479 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
480 }
481 return false;
482 }
483
484 if (!bool_result)
485 STMT_VINFO_VECTYPE (stmt_info) = vectype;
486
487 if (dump_enabled_p ())
488 {
489 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
490 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
491 dump_printf (MSG_NOTE, "\n");
492 }
493 }
494
495 /* Don't try to compute VF from scalar types if the stmt
496 produces a boolean vector. Use the result vectype instead. */
497 if (VECTOR_BOOLEAN_TYPE_P (vectype))
498 vf_vectype = vectype;
499 else
500 {
501 /* The vectorization factor is determined by the smallest
502 scalar type (or the largest vector size, but we only
503 support one vector size per loop). */
504 if (!bool_result)
505 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
506 &dummy);
507 if (dump_enabled_p ())
508 {
509 dump_printf_loc (MSG_NOTE, vect_location,
510 "get vectype for scalar type: ");
511 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
512 dump_printf (MSG_NOTE, "\n");
513 }
514 vf_vectype = get_vectype_for_scalar_type (scalar_type);
515 }
516 if (!vf_vectype)
517 {
518 if (dump_enabled_p ())
519 {
520 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
521 "not vectorized: unsupported data-type ");
522 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
523 scalar_type);
524 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
525 }
526 return false;
527 }
528
529 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
530 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
531 {
532 if (dump_enabled_p ())
533 {
534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
535 "not vectorized: different sized vector "
536 "types in statement, ");
537 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
538 vectype);
539 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
540 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
541 vf_vectype);
542 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
543 }
544 return false;
545 }
546
547 if (dump_enabled_p ())
548 {
549 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
550 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
551 dump_printf (MSG_NOTE, "\n");
552 }
553
554 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
557 if (!vectorization_factor
558 || (nunits > vectorization_factor))
559 vectorization_factor = nunits;
560
561 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
562 {
563 pattern_def_seq = NULL;
564 gsi_next (&si);
565 }
566 }
567 }
568
569 /* TODO: Analyze cost. Decide if worth while to vectorize. */
570 if (dump_enabled_p ())
571 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
572 vectorization_factor);
573 if (vectorization_factor <= 1)
574 {
575 if (dump_enabled_p ())
576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
577 "not vectorized: unsupported data-type\n");
578 return false;
579 }
580 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
581
582 for (i = 0; i < mask_producers.length (); i++)
583 {
584 tree mask_type = NULL;
585
586 stmt = STMT_VINFO_STMT (mask_producers[i]);
587
588 if (gimple_code (stmt) == GIMPLE_ASSIGN
589 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
590 && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt))) != BOOLEAN_TYPE)
591 {
592 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
593 mask_type = get_mask_type_for_scalar_type (scalar_type);
594
595 if (!mask_type)
596 {
597 if (dump_enabled_p ())
598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
599 "not vectorized: unsupported mask\n");
600 return false;
601 }
602 }
603 else
604 {
605 tree rhs;
606 ssa_op_iter iter;
607 gimple *def_stmt;
608 enum vect_def_type dt;
609
610 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
611 {
612 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
613 &def_stmt, &dt, &vectype))
614 {
615 if (dump_enabled_p ())
616 {
617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
618 "not vectorized: can't compute mask type "
619 "for statement, ");
620 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
621 0);
622 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
623 }
624 return false;
625 }
626
627 /* No vectype probably means external definition.
628 Allow it in case there is another operand which
629 allows us to determine the mask type. */
630 if (!vectype)
631 continue;
632
633 if (!mask_type)
634 mask_type = vectype;
635 else if (TYPE_VECTOR_SUBPARTS (mask_type)
636 != TYPE_VECTOR_SUBPARTS (vectype))
637 {
638 if (dump_enabled_p ())
639 {
640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
641 "not vectorized: different sized masks "
642 "types in statement, ");
643 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
644 mask_type);
645 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
646 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
647 vectype);
648 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
649 }
650 return false;
651 }
652 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
653 != VECTOR_BOOLEAN_TYPE_P (vectype))
654 {
655 if (dump_enabled_p ())
656 {
657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
658 "not vectorized: mixed mask and "
659 "nonmask vector types in statement, ");
660 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
661 mask_type);
662 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
663 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
664 vectype);
665 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
666 }
667 return false;
668 }
669 }
670
671 /* We may compare a boolean value loaded as a vector of integers.
672 Fix mask_type in such a case. */
673 if (mask_type
674 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
675 && gimple_code (stmt) == GIMPLE_ASSIGN
676 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
677 mask_type = build_same_sized_truth_vector_type (mask_type);
678 }
679
680 /* A missing mask_type should mean a loop-invariant predicate.
681 This is probably a subject for optimization in
682 if-conversion. */
683 if (!mask_type)
684 {
685 if (dump_enabled_p ())
686 {
687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
688 "not vectorized: can't compute mask type "
689 "for statement, ");
690 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
691 0);
692 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
693 }
694 return false;
695 }
696
697 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
698 }
699
700 return true;
701 }
702
703
704 /* Function vect_is_simple_iv_evolution.
705
706 FORNOW: A simple evolution of an induction variable in the loop is
707 considered a polynomial evolution. */
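/* Illustration (an editorial sketch; the chrec notation below is the one
   used by the scalar evolution analyzer, and the loop numbers are made up):

     for (i = 0; i < N; i++)          i has access function {0, +, 1}_1

   Its evolution part is the step 1, which is not itself a chrec, so the
   evolution is "simple".  A second-order induction such as

     for (i = 0, j = 0; i < N; i++)
       j += i;                        j has access function {0, +, {0, +, 1}_1}_1

   has a chrec as its evolution part and is rejected by the degree >= 2
   check below.  */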
708
709 static bool
710 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
711 tree * step)
712 {
713 tree init_expr;
714 tree step_expr;
715 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
716 basic_block bb;
717
718 /* When there is no evolution in this loop, the evolution function
719 is not "simple". */
720 if (evolution_part == NULL_TREE)
721 return false;
722
723 /* When the evolution is a polynomial of degree >= 2
724 the evolution function is not "simple". */
725 if (tree_is_chrec (evolution_part))
726 return false;
727
728 step_expr = evolution_part;
729 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
730
731 if (dump_enabled_p ())
732 {
733 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
734 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
735 dump_printf (MSG_NOTE, ", init: ");
736 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
737 dump_printf (MSG_NOTE, "\n");
738 }
739
740 *init = init_expr;
741 *step = step_expr;
742
743 if (TREE_CODE (step_expr) != INTEGER_CST
744 && (TREE_CODE (step_expr) != SSA_NAME
745 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
746 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
747 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
748 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
749 || !flag_associative_math)))
750 && (TREE_CODE (step_expr) != REAL_CST
751 || !flag_associative_math))
752 {
753 if (dump_enabled_p ())
754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
755 "step unknown.\n");
756 return false;
757 }
758
759 return true;
760 }
761
762 /* Function vect_analyze_scalar_cycles_1.
763
764 Examine the cross iteration def-use cycles of scalar variables
765 in LOOP. LOOP_VINFO represents the loop that is now being
766 considered for vectorization (can be LOOP, or an outer-loop
767 enclosing LOOP). */
768
769 static void
770 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
771 {
772 basic_block bb = loop->header;
773 tree init, step;
774 auto_vec<gimple *, 64> worklist;
775 gphi_iterator gsi;
776 bool double_reduc;
777
778 if (dump_enabled_p ())
779 dump_printf_loc (MSG_NOTE, vect_location,
780 "=== vect_analyze_scalar_cycles ===\n");
781
782 /* First - identify all inductions. Reduction detection assumes that all the
783 inductions have been identified; therefore, this order must not be
784 changed. */
785 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
786 {
787 gphi *phi = gsi.phi ();
788 tree access_fn = NULL;
789 tree def = PHI_RESULT (phi);
790 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
791
792 if (dump_enabled_p ())
793 {
794 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
795 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
796 dump_printf (MSG_NOTE, "\n");
797 }
798
799 /* Skip virtual phi's. The data dependences that are associated with
800 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
801 if (virtual_operand_p (def))
802 continue;
803
804 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
805
806 /* Analyze the evolution function. */
807 access_fn = analyze_scalar_evolution (loop, def);
808 if (access_fn)
809 {
810 STRIP_NOPS (access_fn);
811 if (dump_enabled_p ())
812 {
813 dump_printf_loc (MSG_NOTE, vect_location,
814 "Access function of PHI: ");
815 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
816 dump_printf (MSG_NOTE, "\n");
817 }
818 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
819 = initial_condition_in_loop_num (access_fn, loop->num);
820 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
821 = evolution_part_in_loop_num (access_fn, loop->num);
822 }
823
824 if (!access_fn
825 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
826 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
827 && TREE_CODE (step) != INTEGER_CST))
828 {
829 worklist.safe_push (phi);
830 continue;
831 }
832
833 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
834 != NULL_TREE);
835 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
836
837 if (dump_enabled_p ())
838 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
839 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
840 }
841
842
843 /* Second - identify all reductions and nested cycles. */
844 while (worklist.length () > 0)
845 {
846 gimple *phi = worklist.pop ();
847 tree def = PHI_RESULT (phi);
848 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
849 gimple *reduc_stmt;
850 bool nested_cycle;
851
852 if (dump_enabled_p ())
853 {
854 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
855 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
856 dump_printf (MSG_NOTE, "\n");
857 }
858
859 gcc_assert (!virtual_operand_p (def)
860 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
861
862 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
863 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
864 &double_reduc, false);
865 if (reduc_stmt)
866 {
867 if (double_reduc)
868 {
869 if (dump_enabled_p ())
870 dump_printf_loc (MSG_NOTE, vect_location,
871 "Detected double reduction.\n");
872
873 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
874 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
875 vect_double_reduction_def;
876 }
877 else
878 {
879 if (nested_cycle)
880 {
881 if (dump_enabled_p ())
882 dump_printf_loc (MSG_NOTE, vect_location,
883 "Detected vectorizable nested cycle.\n");
884
885 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
886 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
887 vect_nested_cycle;
888 }
889 else
890 {
891 if (dump_enabled_p ())
892 dump_printf_loc (MSG_NOTE, vect_location,
893 "Detected reduction.\n");
894
895 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
896 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
897 vect_reduction_def;
898 /* Store the reduction cycles for possible vectorization in
899 loop-aware SLP. */
900 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
901 }
902 }
903 }
904 else
905 if (dump_enabled_p ())
906 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
907 "Unknown def-use cycle pattern.\n");
908 }
909 }
910
911
912 /* Function vect_analyze_scalar_cycles.
913
914 Examine the cross iteration def-use cycles of scalar variables, by
915 analyzing the loop-header PHIs of scalar variables. Classify each
916 cycle as one of the following: invariant, induction, reduction, unknown.
917 We do that for the loop represented by LOOP_VINFO, and also for its
918 inner-loop, if it exists.
919 Examples for scalar cycles:
920
921 Example1: reduction:
922
923 loop1:
924 for (i=0; i<N; i++)
925 sum += a[i];
926
927 Example2: induction:
928
929 loop2:
930 for (i=0; i<N; i++)
931 a[i] = i; */
932
933 static void
934 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
935 {
936 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
937
938 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
939
940 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
941 Reductions in such an inner-loop therefore have different properties than
942 the reductions in the nest that gets vectorized:
943 1. When vectorized, they are executed in the same order as in the original
944 scalar loop, so we can't change the order of computation when
945 vectorizing them.
946 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
947 current checks are too strict. */
948
949 if (loop->inner)
950 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
951 }
952
953 /* Transfer group and reduction information from STMT to its pattern stmt. */
954
955 static void
956 vect_fixup_reduc_chain (gimple *stmt)
957 {
958 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
959 gimple *stmtp;
960 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
961 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
962 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
963 do
964 {
965 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
966 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
967 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
968 if (stmt)
969 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
970 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
971 }
972 while (stmt);
973 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
974 }
975
976 /* Fixup scalar cycles that now have their stmts detected as patterns. */
977
978 static void
979 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
980 {
981 gimple *first;
982 unsigned i;
983
984 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
985 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
986 {
987 vect_fixup_reduc_chain (first);
988 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
989 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
990 }
991 }
992
993 /* Function vect_get_loop_niters.
994
995 Determine the number of iterations the loop executes and place it
996 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
997 in NUMBER_OF_ITERATIONSM1.
998
999 Return the loop exit condition. */
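/* Worked example (editorial addition): for a loop that runs exactly N > 0
   times, e.g. for (i = 0; i < N; i++), the latch is executed N - 1 times,
   so NUMBER_OF_ITERATIONSM1 is N - 1 and NUMBER_OF_ITERATIONS is
   (N - 1) + 1 == N, the number of header executions.  */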
1000
1001
1002 static gcond *
1003 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
1004 tree *number_of_iterationsm1)
1005 {
1006 tree niters;
1007
1008 if (dump_enabled_p ())
1009 dump_printf_loc (MSG_NOTE, vect_location,
1010 "=== get_loop_niters ===\n");
1011
1012 niters = number_of_latch_executions (loop);
1013 *number_of_iterationsm1 = niters;
1014
1015 /* We want the number of loop header executions which is the number
1016 of latch executions plus one.
1017 ??? For UINT_MAX latch executions this number overflows to zero
1018 for loops like do { n++; } while (n != 0); */
1019 if (niters && !chrec_contains_undetermined (niters))
1020 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
1021 build_int_cst (TREE_TYPE (niters), 1));
1022 *number_of_iterations = niters;
1023
1024 return get_loop_exit_condition (loop);
1025 }
1026
1027
1028 /* Function bb_in_loop_p
1029
1030 Used as predicate for dfs order traversal of the loop bbs. */
1031
1032 static bool
1033 bb_in_loop_p (const_basic_block bb, const void *data)
1034 {
1035 const struct loop *const loop = (const struct loop *)data;
1036 if (flow_bb_inside_loop_p (loop, bb))
1037 return true;
1038 return false;
1039 }
1040
1041
1042 /* Function new_loop_vec_info.
1043
1044 Create and initialize a new loop_vec_info struct for LOOP, as well as
1045 stmt_vec_info structs for all the stmts in LOOP. */
1046
1047 static loop_vec_info
1048 new_loop_vec_info (struct loop *loop)
1049 {
1050 loop_vec_info res;
1051 basic_block *bbs;
1052 gimple_stmt_iterator si;
1053 unsigned int i, nbbs;
1054
1055 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
1056 res->kind = vec_info::loop;
1057 LOOP_VINFO_LOOP (res) = loop;
1058
1059 bbs = get_loop_body (loop);
1060
1061 /* Create/Update stmt_info for all stmts in the loop. */
1062 for (i = 0; i < loop->num_nodes; i++)
1063 {
1064 basic_block bb = bbs[i];
1065
1066 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1067 {
1068 gimple *phi = gsi_stmt (si);
1069 gimple_set_uid (phi, 0);
1070 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
1071 }
1072
1073 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1074 {
1075 gimple *stmt = gsi_stmt (si);
1076 gimple_set_uid (stmt, 0);
1077 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
1078 }
1079 }
1080
1081 /* CHECKME: We want to visit all BBs before their successors (except for
1082 latch blocks, for which this assertion wouldn't hold). In the simple
1083 case of the loop forms we allow, a dfs order of the BBs would be the same
1084 as reversed postorder traversal, so we are safe. */
1085
1086 free (bbs);
1087 bbs = XCNEWVEC (basic_block, loop->num_nodes);
1088 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1089 bbs, loop->num_nodes, loop);
1090 gcc_assert (nbbs == loop->num_nodes);
1091
1092 LOOP_VINFO_BBS (res) = bbs;
1093 LOOP_VINFO_NITERSM1 (res) = NULL;
1094 LOOP_VINFO_NITERS (res) = NULL;
1095 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
1096 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
1097 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1098 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
1099 LOOP_VINFO_VECT_FACTOR (res) = 0;
1100 LOOP_VINFO_LOOP_NEST (res) = vNULL;
1101 LOOP_VINFO_DATAREFS (res) = vNULL;
1102 LOOP_VINFO_DDRS (res) = vNULL;
1103 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1104 LOOP_VINFO_MAY_MISALIGN_STMTS (res) = vNULL;
1105 LOOP_VINFO_MAY_ALIAS_DDRS (res) = vNULL;
1106 LOOP_VINFO_GROUPED_STORES (res) = vNULL;
1107 LOOP_VINFO_REDUCTIONS (res) = vNULL;
1108 LOOP_VINFO_REDUCTION_CHAINS (res) = vNULL;
1109 LOOP_VINFO_SLP_INSTANCES (res) = vNULL;
1110 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1111 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
1112 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
1113 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
1114 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
1115
1116 return res;
1117 }
1118
1119
1120 /* Function destroy_loop_vec_info.
1121
1122 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1123 stmts in the loop. */
1124
1125 void
1126 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1127 {
1128 struct loop *loop;
1129 basic_block *bbs;
1130 int nbbs;
1131 gimple_stmt_iterator si;
1132 int j;
1133 vec<slp_instance> slp_instances;
1134 slp_instance instance;
1135 bool swapped;
1136
1137 if (!loop_vinfo)
1138 return;
1139
1140 loop = LOOP_VINFO_LOOP (loop_vinfo);
1141
1142 bbs = LOOP_VINFO_BBS (loop_vinfo);
1143 nbbs = clean_stmts ? loop->num_nodes : 0;
1144 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1145
1146 for (j = 0; j < nbbs; j++)
1147 {
1148 basic_block bb = bbs[j];
1149 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1150 free_stmt_vec_info (gsi_stmt (si));
1151
1152 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1153 {
1154 gimple *stmt = gsi_stmt (si);
1155
1156 /* We may have broken canonical form by moving a constant
1157 into RHS1 of a commutative op. Fix such occurrences. */
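/* Editorial illustration: canonical GIMPLE keeps the constant operand
   last, so an assignment the vectorizer left as x_1 = 3 + y_2 is
   restored to x_1 = y_2 + 3 by the swap below.  */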
1158 if (swapped && is_gimple_assign (stmt))
1159 {
1160 enum tree_code code = gimple_assign_rhs_code (stmt);
1161
1162 if ((code == PLUS_EXPR
1163 || code == POINTER_PLUS_EXPR
1164 || code == MULT_EXPR)
1165 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1166 swap_ssa_operands (stmt,
1167 gimple_assign_rhs1_ptr (stmt),
1168 gimple_assign_rhs2_ptr (stmt));
1169 }
1170
1171 /* Free stmt_vec_info. */
1172 free_stmt_vec_info (stmt);
1173 gsi_next (&si);
1174 }
1175 }
1176
1177 free (LOOP_VINFO_BBS (loop_vinfo));
1178 vect_destroy_datarefs (loop_vinfo);
1179 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1180 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1181 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1182 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1183 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1184 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1185 vect_free_slp_instance (instance);
1186
1187 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1188 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1189 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1190 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1191
1192 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1193 loop_vinfo->scalar_cost_vec.release ();
1194
1195 free (loop_vinfo);
1196 loop->aux = NULL;
1197 }
1198
1199
1200 /* Calculate the cost of one scalar iteration of the loop. */
1201 static void
1202 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1203 {
1204 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1205 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1206 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1207 int innerloop_iters, i;
1208
1209 /* Count statements in scalar loop. Using this as scalar cost for a single
1210 iteration for now.
1211
1212 TODO: Add outer loop support.
1213
1214 TODO: Consider assigning different costs to different scalar
1215 statements. */
1216
1217 /* FORNOW. */
1218 innerloop_iters = 1;
1219 if (loop->inner)
1220 innerloop_iters = 50; /* FIXME */
1221
1222 for (i = 0; i < nbbs; i++)
1223 {
1224 gimple_stmt_iterator si;
1225 basic_block bb = bbs[i];
1226
1227 if (bb->loop_father == loop->inner)
1228 factor = innerloop_iters;
1229 else
1230 factor = 1;
1231
1232 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1233 {
1234 gimple *stmt = gsi_stmt (si);
1235 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1236
1237 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1238 continue;
1239
1240 /* Skip stmts that are not vectorized inside the loop. */
1241 if (stmt_info
1242 && !STMT_VINFO_RELEVANT_P (stmt_info)
1243 && (!STMT_VINFO_LIVE_P (stmt_info)
1244 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1245 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1246 continue;
1247
1248 vect_cost_for_stmt kind;
1249 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1250 {
1251 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1252 kind = scalar_load;
1253 else
1254 kind = scalar_store;
1255 }
1256 else
1257 kind = scalar_stmt;
1258
1259 scalar_single_iter_cost
1260 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1261 factor, kind, NULL, 0, vect_prologue);
1262 }
1263 }
1264 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1265 = scalar_single_iter_cost;
1266 }
1267
1268
1269 /* Function vect_analyze_loop_form_1.
1270
1271 Verify that certain CFG restrictions hold, including:
1272 - the loop has a pre-header
1273 - the loop has a single entry and exit
1274 - the loop exit condition is simple enough, and the number of iterations
1275 can be analyzed (a countable loop). */
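/* Editorial illustration of the last restriction: a countable loop such as

     while (p != q) { ...; p++; }

   has an iteration count the niter analysis can compute (q - p elements),
   whereas

     while (*p) p++;

   does not, and loops of the latter kind are rejected below.  */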
1276
1277 bool
1278 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1279 tree *number_of_iterationsm1,
1280 tree *number_of_iterations, gcond **inner_loop_cond)
1281 {
1282 if (dump_enabled_p ())
1283 dump_printf_loc (MSG_NOTE, vect_location,
1284 "=== vect_analyze_loop_form ===\n");
1285
1286 /* Different restrictions apply when we are considering an inner-most loop,
1287 vs. an outer (nested) loop.
1288 (FORNOW. May want to relax some of these restrictions in the future). */
1289
1290 if (!loop->inner)
1291 {
1292 /* Inner-most loop. We currently require that the number of BBs is
1293 exactly 2 (the header and latch). Vectorizable inner-most loops
1294 look like this:
1295
1296 (pre-header)
1297 |
1298 header <--------+
1299 | | |
1300 | +--> latch --+
1301 |
1302 (exit-bb) */
1303
1304 if (loop->num_nodes != 2)
1305 {
1306 if (dump_enabled_p ())
1307 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1308 "not vectorized: control flow in loop.\n");
1309 return false;
1310 }
1311
1312 if (empty_block_p (loop->header))
1313 {
1314 if (dump_enabled_p ())
1315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1316 "not vectorized: empty loop.\n");
1317 return false;
1318 }
1319 }
1320 else
1321 {
1322 struct loop *innerloop = loop->inner;
1323 edge entryedge;
1324
1325 /* Nested loop. We currently require that the loop is doubly-nested,
1326 contains a single inner loop, and the number of BBs is exactly 5.
1327 Vectorizable outer-loops look like this:
1328
1329 (pre-header)
1330 |
1331 header <---+
1332 | |
1333 inner-loop |
1334 | |
1335 tail ------+
1336 |
1337 (exit-bb)
1338
1339 The inner-loop has the properties expected of inner-most loops
1340 as described above. */
1341
1342 if ((loop->inner)->inner || (loop->inner)->next)
1343 {
1344 if (dump_enabled_p ())
1345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1346 "not vectorized: multiple nested loops.\n");
1347 return false;
1348 }
1349
1350 if (loop->num_nodes != 5)
1351 {
1352 if (dump_enabled_p ())
1353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1354 "not vectorized: control flow in loop.\n");
1355 return false;
1356 }
1357
1358 entryedge = loop_preheader_edge (innerloop);
1359 if (entryedge->src != loop->header
1360 || !single_exit (innerloop)
1361 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1362 {
1363 if (dump_enabled_p ())
1364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1365 "not vectorized: unsupported outerloop form.\n");
1366 return false;
1367 }
1368
1369 /* Analyze the inner-loop. */
1370 tree inner_niterm1, inner_niter;
1371 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1372 &inner_niterm1, &inner_niter, NULL))
1373 {
1374 if (dump_enabled_p ())
1375 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1376 "not vectorized: Bad inner loop.\n");
1377 return false;
1378 }
1379
1380 if (!expr_invariant_in_loop_p (loop, inner_niter))
1381 {
1382 if (dump_enabled_p ())
1383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1384 "not vectorized: inner-loop count not"
1385 " invariant.\n");
1386 return false;
1387 }
1388
1389 if (dump_enabled_p ())
1390 dump_printf_loc (MSG_NOTE, vect_location,
1391 "Considering outer-loop vectorization.\n");
1392 }
1393
1394 if (!single_exit (loop)
1395 || EDGE_COUNT (loop->header->preds) != 2)
1396 {
1397 if (dump_enabled_p ())
1398 {
1399 if (!single_exit (loop))
1400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1401 "not vectorized: multiple exits.\n");
1402 else if (EDGE_COUNT (loop->header->preds) != 2)
1403 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1404 "not vectorized: too many incoming edges.\n");
1405 }
1406 return false;
1407 }
1408
1409 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1410 that the loop is represented as a do-while (with a proper if-guard
1411 before the loop if needed), where the loop header contains all the
1412 executable statements, and the latch is empty. */
1413 if (!empty_block_p (loop->latch)
1414 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1415 {
1416 if (dump_enabled_p ())
1417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1418 "not vectorized: latch block not empty.\n");
1419 return false;
1420 }
1421
1422 /* Make sure there exists a single-predecessor exit bb: */
1423 if (!single_pred_p (single_exit (loop)->dest))
1424 {
1425 edge e = single_exit (loop);
1426 if (!(e->flags & EDGE_ABNORMAL))
1427 {
1428 split_loop_exit_edge (e);
1429 if (dump_enabled_p ())
1430 dump_printf (MSG_NOTE, "split exit edge.\n");
1431 }
1432 else
1433 {
1434 if (dump_enabled_p ())
1435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1436 "not vectorized: abnormal loop exit edge.\n");
1437 return false;
1438 }
1439 }
1440
1441 *loop_cond = vect_get_loop_niters (loop, number_of_iterations,
1442 number_of_iterationsm1);
1443 if (!*loop_cond)
1444 {
1445 if (dump_enabled_p ())
1446 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1447 "not vectorized: complicated exit condition.\n");
1448 return false;
1449 }
1450
1451 if (!*number_of_iterations
1452 || chrec_contains_undetermined (*number_of_iterations))
1453 {
1454 if (dump_enabled_p ())
1455 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1456 "not vectorized: number of iterations cannot be "
1457 "computed.\n");
1458 return false;
1459 }
1460
1461 if (integer_zerop (*number_of_iterations))
1462 {
1463 if (dump_enabled_p ())
1464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1465 "not vectorized: number of iterations = 0.\n");
1466 return false;
1467 }
1468
1469 return true;
1470 }
1471
1472 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1473
1474 loop_vec_info
1475 vect_analyze_loop_form (struct loop *loop)
1476 {
1477 tree number_of_iterations, number_of_iterationsm1;
1478 gcond *loop_cond, *inner_loop_cond = NULL;
1479
1480 if (! vect_analyze_loop_form_1 (loop, &loop_cond, &number_of_iterationsm1,
1481 &number_of_iterations, &inner_loop_cond))
1482 return NULL;
1483
1484 loop_vec_info loop_vinfo = new_loop_vec_info (loop);
1485 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1486 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1487 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1488
1489 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1490 {
1491 if (dump_enabled_p ())
1492 {
1493 dump_printf_loc (MSG_NOTE, vect_location,
1494 "Symbolic number of iterations is ");
1495 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1496 dump_printf (MSG_NOTE, "\n");
1497 }
1498 }
1499
1500 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1501 if (inner_loop_cond)
1502 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1503 = loop_exit_ctrl_vec_info_type;
1504
1505 gcc_assert (!loop->aux);
1506 loop->aux = loop_vinfo;
1507 return loop_vinfo;
1508 }
1509
1510
1511
1512 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1513 statements, update the vectorization factor. */
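/* Worked example (editorial addition): if the loop-based analysis chose
   VF = 4 and the SLP instances need an unrolling factor of 2, the hybrid
   case below sets VF = least_common_multiple (4, 2) = 4; if they need 8,
   VF becomes 8.  With pure SLP the SLP unrolling factor is used directly.  */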
1514
1515 static void
1516 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1517 {
1518 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1519 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1520 int nbbs = loop->num_nodes;
1521 unsigned int vectorization_factor;
1522 int i;
1523
1524 if (dump_enabled_p ())
1525 dump_printf_loc (MSG_NOTE, vect_location,
1526 "=== vect_update_vf_for_slp ===\n");
1527
1528 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1529 gcc_assert (vectorization_factor != 0);
1530
1531 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1532 vectorization factor of the loop is the unrolling factor required by
1533 the SLP instances. If that unrolling factor is 1, we say that we
1534 perform pure SLP on the loop; cross-iteration parallelism is not
1535 exploited. */
1536 bool only_slp_in_loop = true;
1537 for (i = 0; i < nbbs; i++)
1538 {
1539 basic_block bb = bbs[i];
1540 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1541 gsi_next (&si))
1542 {
1543 gimple *stmt = gsi_stmt (si);
1544 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1545 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1546 && STMT_VINFO_RELATED_STMT (stmt_info))
1547 {
1548 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1549 stmt_info = vinfo_for_stmt (stmt);
1550 }
1551 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1552 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1553 && !PURE_SLP_STMT (stmt_info))
1554 /* STMT needs both SLP and loop-based vectorization. */
1555 only_slp_in_loop = false;
1556 }
1557 }
1558
1559 if (only_slp_in_loop)
1560 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1561 else
1562 vectorization_factor
1563 = least_common_multiple (vectorization_factor,
1564 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1565
1566 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1567 if (dump_enabled_p ())
1568 dump_printf_loc (MSG_NOTE, vect_location,
1569 "Updating vectorization factor to %d\n",
1570 vectorization_factor);
1571 }
1572
1573 /* Function vect_analyze_loop_operations.
1574
1575 Scan the loop stmts and make sure they are all vectorizable. */
1576
1577 static bool
1578 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1579 {
1580 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1581 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1582 int nbbs = loop->num_nodes;
1583 int i;
1584 stmt_vec_info stmt_info;
1585 bool need_to_vectorize = false;
1586 bool ok;
1587
1588 if (dump_enabled_p ())
1589 dump_printf_loc (MSG_NOTE, vect_location,
1590 "=== vect_analyze_loop_operations ===\n");
1591
1592 for (i = 0; i < nbbs; i++)
1593 {
1594 basic_block bb = bbs[i];
1595
1596 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1597 gsi_next (&si))
1598 {
1599 gphi *phi = si.phi ();
1600 ok = true;
1601
1602 stmt_info = vinfo_for_stmt (phi);
1603 if (dump_enabled_p ())
1604 {
1605 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1606 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1607 dump_printf (MSG_NOTE, "\n");
1608 }
1609 if (virtual_operand_p (gimple_phi_result (phi)))
1610 continue;
1611
1612 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1613 (i.e., a phi in the tail of the outer-loop). */
1614 if (! is_loop_header_bb_p (bb))
1615 {
1616 /* FORNOW: we currently don't support the case that these phis
1617 are not used in the outer loop (unless it is a double reduction,
1618 i.e., this phi is vect_reduction_def), because this case
1619 would require us to actually do something here. */
1620 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1621 || STMT_VINFO_LIVE_P (stmt_info))
1622 && STMT_VINFO_DEF_TYPE (stmt_info)
1623 != vect_double_reduction_def)
1624 {
1625 if (dump_enabled_p ())
1626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1627 "Unsupported loop-closed phi in "
1628 "outer-loop.\n");
1629 return false;
1630 }
1631
1632 /* If PHI is used in the outer loop, we check that its operand
1633 is defined in the inner loop. */
1634 if (STMT_VINFO_RELEVANT_P (stmt_info))
1635 {
1636 tree phi_op;
1637 gimple *op_def_stmt;
1638
1639 if (gimple_phi_num_args (phi) != 1)
1640 return false;
1641
1642 phi_op = PHI_ARG_DEF (phi, 0);
1643 if (TREE_CODE (phi_op) != SSA_NAME)
1644 return false;
1645
1646 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1647 if (gimple_nop_p (op_def_stmt)
1648 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1649 || !vinfo_for_stmt (op_def_stmt))
1650 return false;
1651
1652 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1653 != vect_used_in_outer
1654 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1655 != vect_used_in_outer_by_reduction)
1656 return false;
1657 }
1658
1659 continue;
1660 }
1661
1662 gcc_assert (stmt_info);
1663
1664 if (STMT_VINFO_LIVE_P (stmt_info))
1665 {
1666 /* FORNOW: not yet supported. */
1667 if (dump_enabled_p ())
1668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1669 "not vectorized: value used after loop.\n");
1670 return false;
1671 }
1672
1673 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1674 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1675 {
1676 /* A scalar-dependence cycle that we don't support. */
1677 if (dump_enabled_p ())
1678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1679 "not vectorized: scalar dependence cycle.\n");
1680 return false;
1681 }
1682
1683 if (STMT_VINFO_RELEVANT_P (stmt_info))
1684 {
1685 need_to_vectorize = true;
1686 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1687 ok = vectorizable_induction (phi, NULL, NULL);
1688 }
1689
1690 if (!ok)
1691 {
1692 if (dump_enabled_p ())
1693 {
1694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1695 "not vectorized: relevant phi not "
1696 "supported: ");
1697 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1698 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1699 }
1700 return false;
1701 }
1702 }
1703
1704 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1705 gsi_next (&si))
1706 {
1707 gimple *stmt = gsi_stmt (si);
1708 if (!gimple_clobber_p (stmt)
1709 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1710 return false;
1711 }
1712 } /* bbs */
1713
1714 /* All operations in the loop are either irrelevant (deal with loop
1715 control, or dead), or only used outside the loop and can be moved
1716 out of the loop (e.g. invariants, inductions). The loop can be
1717 optimized away by scalar optimizations. We're better off not
1718 touching this loop. */
1719 if (!need_to_vectorize)
1720 {
1721 if (dump_enabled_p ())
1722 dump_printf_loc (MSG_NOTE, vect_location,
1723 "All the computation can be taken out of the loop.\n");
1724 if (dump_enabled_p ())
1725 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1726 "not vectorized: redundant loop. no profit to "
1727 "vectorize.\n");
1728 return false;
1729 }
1730
1731 return true;
1732 }
1733
1734
1735 /* Function vect_analyze_loop_2.
1736
1737 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1738 for it. The different analyses will record information in the
1739 loop_vec_info struct. */
1740 static bool
1741 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1742 {
1743 bool ok;
1744 int max_vf = MAX_VECTORIZATION_FACTOR;
1745 int min_vf = 2;
1746 unsigned int n_stmts = 0;
1747
1748 /* The first group of checks is independent of the vector size. */
1749 fatal = true;
1750
1751 /* Find all data references in the loop (which correspond to vdefs/vuses)
1752 and analyze their evolution in the loop. */
1753
1754 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1755
1756 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1757 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1758 {
1759 if (dump_enabled_p ())
1760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1761 "not vectorized: loop contains function calls"
1762 " or data references that cannot be analyzed\n");
1763 return false;
1764 }
1765
1766 for (unsigned i = 0; i < loop->num_nodes; i++)
1767 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1768 !gsi_end_p (gsi); gsi_next (&gsi))
1769 {
1770 gimple *stmt = gsi_stmt (gsi);
1771 if (is_gimple_debug (stmt))
1772 continue;
1773 ++n_stmts;
1774 if (!find_data_references_in_stmt (loop, stmt,
1775 &LOOP_VINFO_DATAREFS (loop_vinfo)))
1776 {
1777 if (is_gimple_call (stmt) && loop->safelen)
1778 {
1779 tree fndecl = gimple_call_fndecl (stmt), op;
1780 if (fndecl != NULL_TREE)
1781 {
1782 cgraph_node *node = cgraph_node::get (fndecl);
1783 if (node != NULL && node->simd_clones != NULL)
1784 {
1785 unsigned int j, n = gimple_call_num_args (stmt);
1786 for (j = 0; j < n; j++)
1787 {
1788 op = gimple_call_arg (stmt, j);
1789 if (DECL_P (op)
1790 || (REFERENCE_CLASS_P (op)
1791 && get_base_address (op)))
1792 break;
1793 }
1794 op = gimple_call_lhs (stmt);
1795 /* Ignore #pragma omp declare simd functions
1796 if they don't have data references in the
1797 call stmt itself. */
1798 if (j == n
1799 && !(op
1800 && (DECL_P (op)
1801 || (REFERENCE_CLASS_P (op)
1802 && get_base_address (op)))))
1803 continue;
1804 }
1805 }
1806 }
1807 if (dump_enabled_p ())
1808 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1809 "not vectorized: loop contains function "
1810 "calls or data references that cannot "
1811 "be analyzed\n");
1812 return false;
1813 }
1814 }
1815
1816 /* Analyze the data references and also adjust the minimal
1817 vectorization factor according to the loads and stores. */
1818
1819 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1820 if (!ok)
1821 {
1822 if (dump_enabled_p ())
1823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1824 "bad data references.\n");
1825 return false;
1826 }
1827
1828 /* Classify all cross-iteration scalar data-flow cycles.
1829 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1830 vect_analyze_scalar_cycles (loop_vinfo);
1831
1832 vect_pattern_recog (loop_vinfo);
1833
1834 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1835
1836 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1837 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1838
1839 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1840 if (!ok)
1841 {
1842 if (dump_enabled_p ())
1843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1844 "bad data access.\n");
1845 return false;
1846 }
1847
1848 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1849
1850 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1851 if (!ok)
1852 {
1853 if (dump_enabled_p ())
1854 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1855 "unexpected pattern.\n");
1856 return false;
1857 }
1858
1859 /* The rest of the analysis below depends on the vector size in some way;
failures from here on are not fatal, since a different vector size may succeed. */
1860 fatal = false;
1861
1862 /* Analyze data dependences between the data-refs in the loop
1863 and adjust the maximum vectorization factor according to
1864 the dependences.
1865 FORNOW: fail at the first data dependence that we encounter. */
1866
1867 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1868 if (!ok
1869 || max_vf < min_vf)
1870 {
1871 if (dump_enabled_p ())
1872 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1873 "bad data dependence.\n");
1874 return false;
1875 }
1876
1877 ok = vect_determine_vectorization_factor (loop_vinfo);
1878 if (!ok)
1879 {
1880 if (dump_enabled_p ())
1881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1882 "can't determine vectorization factor.\n");
1883 return false;
1884 }
1885 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1886 {
1887 if (dump_enabled_p ())
1888 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1889 "bad data dependence.\n");
1890 return false;
1891 }
1892
1893 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1894 ok = vect_analyze_slp (loop_vinfo, n_stmts);
1895 if (!ok)
1896 return false;
1897
1898 /* If there are any SLP instances mark them as pure_slp. */
1899 bool slp = vect_make_slp_decision (loop_vinfo);
1900 if (slp)
1901 {
1902 /* Find stmts that need to be both vectorized and SLPed. */
1903 vect_detect_hybrid_slp (loop_vinfo);
1904
1905 /* Update the vectorization factor based on the SLP decision. */
1906 vect_update_vf_for_slp (loop_vinfo);
1907 }
1908
1909 /* Now the vectorization factor is final. */
1910 unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1911 gcc_assert (vectorization_factor != 0);
1912
1913 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1914 dump_printf_loc (MSG_NOTE, vect_location,
1915 "vectorization_factor = %d, niters = "
1916 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1917 LOOP_VINFO_INT_NITERS (loop_vinfo));
1918
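/* For example, with a vectorization factor of 8, a loop known (or bounded)
   to run at most 5 iterations can never complete a single vector iteration,
   so there is nothing to vectorize. */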
1919 HOST_WIDE_INT max_niter
1920 = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1921 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1922 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1923 || (max_niter != -1
1924 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1925 {
1926 if (dump_enabled_p ())
1927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1928 "not vectorized: iteration count too small.\n");
1929 if (dump_enabled_p ())
1930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1931 "not vectorized: iteration count smaller than "
1932 "vectorization factor.\n");
1933 return false;
1934 }
1935
1936 /* Analyze the alignment of the data-refs in the loop.
1937 Fail if a data reference is found that cannot be vectorized. */
1938
1939 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1940 if (!ok)
1941 {
1942 if (dump_enabled_p ())
1943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1944 "bad data alignment.\n");
1945 return false;
1946 }
1947
1948 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1949 It is important to call pruning after vect_analyze_data_ref_accesses,
1950 since we use grouping information gathered by interleaving analysis. */
1951 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1952 if (!ok)
1953 {
1954 if (dump_enabled_p ())
1955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1956 "number of versioning for alias "
1957 "run-time tests exceeds %d "
1958 "(--param vect-max-version-for-alias-checks)\n",
1959 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1960 return false;
1961 }
1962
1963 /* Compute the scalar iteration cost. */
1964 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1965
1966 /* This pass will decide on using loop versioning and/or loop peeling in
1967 order to enhance the alignment of data references in the loop. */
1968
1969 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1970 if (!ok)
1971 {
1972 if (dump_enabled_p ())
1973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1974 "bad data alignment.\n");
1975 return false;
1976 }
1977
1978 if (slp)
1979 {
1980 /* Analyze operations in the SLP instances. Note this may
1981 remove unsupported SLP instances which makes the above
1982 SLP kind detection invalid. */
1983 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1984 vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
1985 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1986 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1987 return false;
1988 }
1989
1990 /* Scan all the remaining operations in the loop that are not subject
1991 to SLP and make sure they are vectorizable. */
1992 ok = vect_analyze_loop_operations (loop_vinfo);
1993 if (!ok)
1994 {
1995 if (dump_enabled_p ())
1996 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1997 "bad operation or unsupported loop bound.\n");
1998 return false;
1999 }
2000
2001 /* Analyze cost. Decide if worth while to vectorize. */
2002 int min_profitable_estimate, min_profitable_iters;
2003 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2004 &min_profitable_estimate);
2005
2006 if (min_profitable_iters < 0)
2007 {
2008 if (dump_enabled_p ())
2009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2010 "not vectorized: vectorization not profitable.\n");
2011 if (dump_enabled_p ())
2012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2013 "not vectorized: vector version will never be "
2014 "profitable.\n");
2015 return false;
2016 }
2017
2018 int min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2019 * vectorization_factor) - 1);
2020
2021 /* Use the cost model only if it is more conservative than user specified
2022 threshold. */
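/* For instance, assuming --param min-vect-loop-bound=2 and a vectorization
   factor of 4, min_scalar_loop_bound is 2*4 - 1 = 7; if the cost model asks
   for at least 10 profitable iterations the threshold becomes 10, while a
   cost-model answer of 5 would leave it at 7. */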
2023 unsigned th = (unsigned) min_scalar_loop_bound;
2024 if (min_profitable_iters
2025 && (!min_scalar_loop_bound
2026 || min_profitable_iters > min_scalar_loop_bound))
2027 th = (unsigned) min_profitable_iters;
2028
2029 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2030
2031 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2032 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
2033 {
2034 if (dump_enabled_p ())
2035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2036 "not vectorized: vectorization not profitable.\n");
2037 if (dump_enabled_p ())
2038 dump_printf_loc (MSG_NOTE, vect_location,
2039 "not vectorized: iteration count smaller than user "
2040 "specified loop bound parameter or minimum profitable "
2041 "iterations (whichever is more conservative).\n");
2042 return false;
2043 }
2044
2045 HOST_WIDE_INT estimated_niter
2046 = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2047 if (estimated_niter != -1
2048 && ((unsigned HOST_WIDE_INT) estimated_niter
2049 <= MAX (th, (unsigned)min_profitable_estimate)))
2050 {
2051 if (dump_enabled_p ())
2052 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2053 "not vectorized: estimated iteration count too "
2054 "small.\n");
2055 if (dump_enabled_p ())
2056 dump_printf_loc (MSG_NOTE, vect_location,
2057 "not vectorized: estimated iteration count smaller "
2058 "than specified loop bound parameter or minimum "
2059 "profitable iterations (whichever is more "
2060 "conservative).\n");
2061 return false;
2062 }
2063
2064 /* Decide whether we need to create an epilogue loop to handle
2065 remaining scalar iterations. */
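/* For example, with a vectorization factor of 4, 10 known iterations and no
   peeling for alignment, two scalar iterations are left over after the two
   vector iterations, so an epilogue loop is needed; with 12 iterations it
   would not be. */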
2066 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
2067 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2068 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2069
2070 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2071 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2072 {
2073 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2074 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2075 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2076 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2077 }
2078 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2079 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2080 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2081 /* In case of versioning, check if the maximum number of
2082 iterations is greater than th. If they are identical,
2083 the epilogue is unnecessary. */
2084 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
2085 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2086 || (unsigned HOST_WIDE_INT) max_niter > th)))
2087 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2088
2089 /* If an epilogue loop is required make sure we can create one. */
2090 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2091 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2092 {
2093 if (dump_enabled_p ())
2094 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2095 if (!vect_can_advance_ivs_p (loop_vinfo)
2096 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2097 single_exit (LOOP_VINFO_LOOP
2098 (loop_vinfo))))
2099 {
2100 if (dump_enabled_p ())
2101 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2102 "not vectorized: can't create required "
2103 "epilog loop\n");
2104 return false;
2105 }
2106 }
2107
2108 gcc_assert (vectorization_factor
2109 == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2110
2111 return true;
2112 }
2113
2114 /* Function vect_analyze_loop.
2115
2116 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2117 for it. The different analyses will record information in the
2118 loop_vec_info struct. */
2119 loop_vec_info
2120 vect_analyze_loop (struct loop *loop)
2121 {
2122 loop_vec_info loop_vinfo;
2123 unsigned int vector_sizes;
2124
2125 /* Autodetect first vector size we try. */
2126 current_vector_size = 0;
2127 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2128
2129 if (dump_enabled_p ())
2130 dump_printf_loc (MSG_NOTE, vect_location,
2131 "===== analyze_loop_nest =====\n");
2132
2133 if (loop_outer (loop)
2134 && loop_vec_info_for_loop (loop_outer (loop))
2135 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2136 {
2137 if (dump_enabled_p ())
2138 dump_printf_loc (MSG_NOTE, vect_location,
2139 "outer-loop already vectorized.\n");
2140 return NULL;
2141 }
2142
2143 while (1)
2144 {
2145 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2146 loop_vinfo = vect_analyze_loop_form (loop);
2147 if (!loop_vinfo)
2148 {
2149 if (dump_enabled_p ())
2150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2151 "bad loop form.\n");
2152 return NULL;
2153 }
2154
2155 bool fatal = false;
2156 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2157 {
2158 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2159
2160 return loop_vinfo;
2161 }
2162
2163 destroy_loop_vec_info (loop_vinfo, true);
2164
2165 vector_sizes &= ~current_vector_size;
2166 if (fatal
2167 || vector_sizes == 0
2168 || current_vector_size == 0)
2169 return NULL;
2170
2171 /* Try the largest remaining vector size. */
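/* For example, if the target advertises 32-byte and 16-byte vectors and the
   32-byte attempt failed, the next attempt uses 16-byte vectors. */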
2172 current_vector_size = 1 << floor_log2 (vector_sizes);
2173 if (dump_enabled_p ())
2174 dump_printf_loc (MSG_NOTE, vect_location,
2175 "***** Re-trying analysis with "
2176 "vector size %d\n", current_vector_size);
2177 }
2178 }
2179
2180
2181 /* Function reduction_code_for_scalar_code
2182
2183 Input:
2184 CODE - tree_code of a reduction operations.
2185
2186 Output:
2187 REDUC_CODE - the corresponding tree-code to be used to reduce the
2188 vector of partial results into a single scalar result, or ERROR_MARK
2189 if the operation is a supported reduction operation, but does not have
2190 such a tree-code.
2191
2192 Return FALSE if CODE currently cannot be vectorized as reduction. */
2193
2194 static bool
2195 reduction_code_for_scalar_code (enum tree_code code,
2196 enum tree_code *reduc_code)
2197 {
2198 switch (code)
2199 {
2200 case MAX_EXPR:
2201 *reduc_code = REDUC_MAX_EXPR;
2202 return true;
2203
2204 case MIN_EXPR:
2205 *reduc_code = REDUC_MIN_EXPR;
2206 return true;
2207
2208 case PLUS_EXPR:
2209 *reduc_code = REDUC_PLUS_EXPR;
2210 return true;
2211
2212 case MULT_EXPR:
2213 case MINUS_EXPR:
2214 case BIT_IOR_EXPR:
2215 case BIT_XOR_EXPR:
2216 case BIT_AND_EXPR:
2217 *reduc_code = ERROR_MARK;
2218 return true;
2219
2220 default:
2221 return false;
2222 }
2223 }
2224
2225
2226 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2227 STMT is printed with a message MSG. */
2228
2229 static void
2230 report_vect_op (int msg_type, gimple *stmt, const char *msg)
2231 {
2232 dump_printf_loc (msg_type, vect_location, "%s", msg);
2233 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2234 dump_printf (msg_type, "\n");
2235 }
2236
2237
2238 /* Detect SLP reduction of the form:
2239
2240 #a1 = phi <a5, a0>
2241 a2 = operation (a1)
2242 a3 = operation (a2)
2243 a4 = operation (a3)
2244 a5 = operation (a4)
2245
2246 #a = phi <a5>
2247
2248 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2249 FIRST_STMT is the first reduction stmt in the chain
2250 (a2 = operation (a1)).
2251
2252 Return TRUE if a reduction chain was detected. */
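/* Such a chain typically comes from a manually unrolled reduction, e.g.

     for (i = 0; i < n; i++)
       s = s + a[4*i] + a[4*i+1] + a[4*i+2] + a[4*i+3];

   where gimplification turns the four additions feeding S into a chain of
   the form shown above. */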
2253
2254 static bool
2255 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2256 gimple *first_stmt)
2257 {
2258 struct loop *loop = (gimple_bb (phi))->loop_father;
2259 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2260 enum tree_code code;
2261 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2262 stmt_vec_info use_stmt_info, current_stmt_info;
2263 tree lhs;
2264 imm_use_iterator imm_iter;
2265 use_operand_p use_p;
2266 int nloop_uses, size = 0, n_out_of_loop_uses;
2267 bool found = false;
2268
2269 if (loop != vect_loop)
2270 return false;
2271
2272 lhs = PHI_RESULT (phi);
2273 code = gimple_assign_rhs_code (first_stmt);
2274 while (1)
2275 {
2276 nloop_uses = 0;
2277 n_out_of_loop_uses = 0;
2278 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2279 {
2280 gimple *use_stmt = USE_STMT (use_p);
2281 if (is_gimple_debug (use_stmt))
2282 continue;
2283
2284 /* Check if we got back to the reduction phi. */
2285 if (use_stmt == phi)
2286 {
2287 loop_use_stmt = use_stmt;
2288 found = true;
2289 break;
2290 }
2291
2292 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2293 {
2294 loop_use_stmt = use_stmt;
2295 nloop_uses++;
2296 }
2297 else
2298 n_out_of_loop_uses++;
2299
2300 /* There can be either a single use in the loop or two uses in
2301 phi nodes. */
2302 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2303 return false;
2304 }
2305
2306 if (found)
2307 break;
2308
2309 /* We reached a statement with no loop uses. */
2310 if (nloop_uses == 0)
2311 return false;
2312
2313 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2314 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2315 return false;
2316
2317 if (!is_gimple_assign (loop_use_stmt)
2318 || code != gimple_assign_rhs_code (loop_use_stmt)
2319 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2320 return false;
2321
2322 /* Insert USE_STMT into reduction chain. */
2323 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2324 if (current_stmt)
2325 {
2326 current_stmt_info = vinfo_for_stmt (current_stmt);
2327 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2328 GROUP_FIRST_ELEMENT (use_stmt_info)
2329 = GROUP_FIRST_ELEMENT (current_stmt_info);
2330 }
2331 else
2332 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2333
2334 lhs = gimple_assign_lhs (loop_use_stmt);
2335 current_stmt = loop_use_stmt;
2336 size++;
2337 }
2338
2339 if (!found || loop_use_stmt != phi || size < 2)
2340 return false;
2341
2342 /* Swap the operands, if needed, to make the reduction operand be the second
2343 operand. */
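/* E.g., a chain statement of the form a3 = a2 + b, where a2 carries the
   reduction value, is rewritten as a3 = b + a2. */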
2344 lhs = PHI_RESULT (phi);
2345 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2346 while (next_stmt)
2347 {
2348 if (gimple_assign_rhs2 (next_stmt) == lhs)
2349 {
2350 tree op = gimple_assign_rhs1 (next_stmt);
2351 gimple *def_stmt = NULL;
2352
2353 if (TREE_CODE (op) == SSA_NAME)
2354 def_stmt = SSA_NAME_DEF_STMT (op);
2355
2356 /* Check that the other def is either defined in the loop
2357 ("vect_internal_def"), or it's an induction (defined by a
2358 loop-header phi-node). */
2359 if (def_stmt
2360 && gimple_bb (def_stmt)
2361 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2362 && (is_gimple_assign (def_stmt)
2363 || is_gimple_call (def_stmt)
2364 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2365 == vect_induction_def
2366 || (gimple_code (def_stmt) == GIMPLE_PHI
2367 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2368 == vect_internal_def
2369 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2370 {
2371 lhs = gimple_assign_lhs (next_stmt);
2372 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2373 continue;
2374 }
2375
2376 return false;
2377 }
2378 else
2379 {
2380 tree op = gimple_assign_rhs2 (next_stmt);
2381 gimple *def_stmt = NULL;
2382
2383 if (TREE_CODE (op) == SSA_NAME)
2384 def_stmt = SSA_NAME_DEF_STMT (op);
2385
2386 /* Check that the other def is either defined in the loop
2387 ("vect_internal_def"), or it's an induction (defined by a
2388 loop-header phi-node). */
2389 if (def_stmt
2390 && gimple_bb (def_stmt)
2391 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2392 && (is_gimple_assign (def_stmt)
2393 || is_gimple_call (def_stmt)
2394 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2395 == vect_induction_def
2396 || (gimple_code (def_stmt) == GIMPLE_PHI
2397 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2398 == vect_internal_def
2399 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2400 {
2401 if (dump_enabled_p ())
2402 {
2403 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2404 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2405 dump_printf (MSG_NOTE, "\n");
2406 }
2407
2408 swap_ssa_operands (next_stmt,
2409 gimple_assign_rhs1_ptr (next_stmt),
2410 gimple_assign_rhs2_ptr (next_stmt));
2411 update_stmt (next_stmt);
2412
2413 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2414 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2415 }
2416 else
2417 return false;
2418 }
2419
2420 lhs = gimple_assign_lhs (next_stmt);
2421 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2422 }
2423
2424 /* Save the chain for further analysis in SLP detection. */
2425 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2426 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2427 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2428
2429 return true;
2430 }
2431
2432
2433 /* Function vect_is_simple_reduction
2434
2435 (1) Detect a cross-iteration def-use cycle that represents a simple
2436 reduction computation. We look for the following pattern:
2437
2438 loop_header:
2439 a1 = phi < a0, a2 >
2440 a3 = ...
2441 a2 = operation (a3, a1)
2442
2443 or
2444
2445 a3 = ...
2446 loop_header:
2447 a1 = phi < a0, a2 >
2448 a2 = operation (a3, a1)
2449
2450 such that:
2451 1. operation is commutative and associative and it is safe to
2452 change the order of the computation (if CHECK_REDUCTION is true)
2453 2. no uses for a2 in the loop (a2 is used out of the loop)
2454 3. no uses of a1 in the loop besides the reduction operation
2455 4. no uses of a1 outside the loop.
2456
2457 Conditions 1,4 are tested here.
2458 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2459
2460 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2461 nested cycles, if CHECK_REDUCTION is false.
2462
2463 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2464 reductions:
2465
2466 a1 = phi < a0, a2 >
2467 inner loop (def of a3)
2468 a2 = phi < a3 >
2469
2470 (4) Detect condition expressions, i.e.:
2471 for (int i = 0; i < N; i++)
2472 if (a[i] < val)
2473 ret_val = a[i];
2474
2475 */
2476
2477 static gimple *
2478 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2479 bool check_reduction, bool *double_reduc,
2480 bool need_wrapping_integral_overflow,
2481 enum vect_reduction_type *v_reduc_type)
2482 {
2483 struct loop *loop = (gimple_bb (phi))->loop_father;
2484 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2485 edge latch_e = loop_latch_edge (loop);
2486 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2487 gimple *def_stmt, *def1 = NULL, *def2 = NULL;
2488 enum tree_code orig_code, code;
2489 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2490 tree type;
2491 int nloop_uses;
2492 tree name;
2493 imm_use_iterator imm_iter;
2494 use_operand_p use_p;
2495 bool phi_def;
2496
2497 *double_reduc = false;
2498 *v_reduc_type = TREE_CODE_REDUCTION;
2499
2500 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2501 otherwise, we assume outer loop vectorization. */
2502 gcc_assert ((check_reduction && loop == vect_loop)
2503 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2504
2505 name = PHI_RESULT (phi);
2506 /* ??? If there are no uses of the PHI result the inner loop reduction
2507 won't be detected as possibly double-reduction by vectorizable_reduction
2508 because that tries to walk the PHI arg from the preheader edge which
2509 can be constant. See PR60382. */
2510 if (has_zero_uses (name))
2511 return NULL;
2512 nloop_uses = 0;
2513 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2514 {
2515 gimple *use_stmt = USE_STMT (use_p);
2516 if (is_gimple_debug (use_stmt))
2517 continue;
2518
2519 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2520 {
2521 if (dump_enabled_p ())
2522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2523 "intermediate value used outside loop.\n");
2524
2525 return NULL;
2526 }
2527
2528 nloop_uses++;
2529 if (nloop_uses > 1)
2530 {
2531 if (dump_enabled_p ())
2532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2533 "reduction used in loop.\n");
2534 return NULL;
2535 }
2536 }
2537
2538 if (TREE_CODE (loop_arg) != SSA_NAME)
2539 {
2540 if (dump_enabled_p ())
2541 {
2542 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2543 "reduction: not ssa_name: ");
2544 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2545 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2546 }
2547 return NULL;
2548 }
2549
2550 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2551 if (!def_stmt)
2552 {
2553 if (dump_enabled_p ())
2554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2555 "reduction: no def_stmt.\n");
2556 return NULL;
2557 }
2558
2559 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2560 {
2561 if (dump_enabled_p ())
2562 {
2563 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2564 dump_printf (MSG_NOTE, "\n");
2565 }
2566 return NULL;
2567 }
2568
2569 if (is_gimple_assign (def_stmt))
2570 {
2571 name = gimple_assign_lhs (def_stmt);
2572 phi_def = false;
2573 }
2574 else
2575 {
2576 name = PHI_RESULT (def_stmt);
2577 phi_def = true;
2578 }
2579
2580 nloop_uses = 0;
2581 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2582 {
2583 gimple *use_stmt = USE_STMT (use_p);
2584 if (is_gimple_debug (use_stmt))
2585 continue;
2586 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2587 nloop_uses++;
2588 if (nloop_uses > 1)
2589 {
2590 if (dump_enabled_p ())
2591 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2592 "reduction used in loop.\n");
2593 return NULL;
2594 }
2595 }
2596
2597 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2598 defined in the inner loop. */
2599 if (phi_def)
2600 {
2601 op1 = PHI_ARG_DEF (def_stmt, 0);
2602
2603 if (gimple_phi_num_args (def_stmt) != 1
2604 || TREE_CODE (op1) != SSA_NAME)
2605 {
2606 if (dump_enabled_p ())
2607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2608 "unsupported phi node definition.\n");
2609
2610 return NULL;
2611 }
2612
2613 def1 = SSA_NAME_DEF_STMT (op1);
2614 if (gimple_bb (def1)
2615 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2616 && loop->inner
2617 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2618 && is_gimple_assign (def1))
2619 {
2620 if (dump_enabled_p ())
2621 report_vect_op (MSG_NOTE, def_stmt,
2622 "detected double reduction: ");
2623
2624 *double_reduc = true;
2625 return def_stmt;
2626 }
2627
2628 return NULL;
2629 }
2630
2631 code = orig_code = gimple_assign_rhs_code (def_stmt);
2632
2633 /* We can handle "res -= x[i]", which is non-associative by
2634 simply rewriting this into "res += -x[i]". Avoid changing
2635 gimple instruction for the first simple tests and only do this
2636 if we're allowed to change code at all. */
2637 if (code == MINUS_EXPR
2638 && (op1 = gimple_assign_rhs1 (def_stmt))
2639 && TREE_CODE (op1) == SSA_NAME
2640 && SSA_NAME_DEF_STMT (op1) == phi)
2641 code = PLUS_EXPR;
2642
2643 if (check_reduction)
2644 {
2645 if (code == COND_EXPR)
2646 *v_reduc_type = COND_REDUCTION;
2647 else if (!commutative_tree_code (code) || !associative_tree_code (code))
2648 {
2649 if (dump_enabled_p ())
2650 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2651 "reduction: not commutative/associative: ");
2652 return NULL;
2653 }
2654 }
2655
2656 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2657 {
2658 if (code != COND_EXPR)
2659 {
2660 if (dump_enabled_p ())
2661 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2662 "reduction: not binary operation: ");
2663
2664 return NULL;
2665 }
2666
2667 op3 = gimple_assign_rhs1 (def_stmt);
2668 if (COMPARISON_CLASS_P (op3))
2669 {
2670 op4 = TREE_OPERAND (op3, 1);
2671 op3 = TREE_OPERAND (op3, 0);
2672 }
2673
2674 op1 = gimple_assign_rhs2 (def_stmt);
2675 op2 = gimple_assign_rhs3 (def_stmt);
2676
2677 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2678 {
2679 if (dump_enabled_p ())
2680 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2681 "reduction: uses not ssa_names: ");
2682
2683 return NULL;
2684 }
2685 }
2686 else
2687 {
2688 op1 = gimple_assign_rhs1 (def_stmt);
2689 op2 = gimple_assign_rhs2 (def_stmt);
2690
2691 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2692 {
2693 if (dump_enabled_p ())
2694 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2695 "reduction: uses not ssa_names: ");
2696
2697 return NULL;
2698 }
2699 }
2700
2701 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2702 if ((TREE_CODE (op1) == SSA_NAME
2703 && !types_compatible_p (type, TREE_TYPE (op1)))
2704 || (TREE_CODE (op2) == SSA_NAME
2705 && !types_compatible_p (type, TREE_TYPE (op2)))
2706 || (op3 && TREE_CODE (op3) == SSA_NAME
2707 && !types_compatible_p (type, TREE_TYPE (op3)))
2708 || (op4 && TREE_CODE (op4) == SSA_NAME
2709 && !types_compatible_p (type, TREE_TYPE (op4))))
2710 {
2711 if (dump_enabled_p ())
2712 {
2713 dump_printf_loc (MSG_NOTE, vect_location,
2714 "reduction: multiple types: operation type: ");
2715 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2716 dump_printf (MSG_NOTE, ", operands types: ");
2717 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2718 TREE_TYPE (op1));
2719 dump_printf (MSG_NOTE, ",");
2720 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2721 TREE_TYPE (op2));
2722 if (op3)
2723 {
2724 dump_printf (MSG_NOTE, ",");
2725 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2726 TREE_TYPE (op3));
2727 }
2728
2729 if (op4)
2730 {
2731 dump_printf (MSG_NOTE, ",");
2732 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2733 TREE_TYPE (op4));
2734 }
2735 dump_printf (MSG_NOTE, "\n");
2736 }
2737
2738 return NULL;
2739 }
2740
2741 /* Check that it's ok to change the order of the computation.
2742 Generally, when vectorizing a reduction we change the order of the
2743 computation. This may change the behavior of the program in some
2744 cases, so we need to check that this is ok. One exception is when
2745 vectorizing an outer-loop: the inner-loop is executed sequentially,
2746 and therefore vectorizing reductions in the inner-loop during
2747 outer-loop vectorization is safe. */
2748
2749 if (*v_reduc_type != COND_REDUCTION)
2750 {
2751 /* CHECKME: check for !flag_finite_math_only too? */
2752 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2753 && check_reduction)
2754 {
2755 /* Changing the order of operations changes the semantics. */
2756 if (dump_enabled_p ())
2757 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2758 "reduction: unsafe fp math optimization: ");
2759 return NULL;
2760 }
2761 else if (INTEGRAL_TYPE_P (type) && check_reduction)
2762 {
2763 if (!operation_no_trapping_overflow (type, code))
2764 {
2765 /* Changing the order of operations changes the semantics. */
2766 if (dump_enabled_p ())
2767 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2768 "reduction: unsafe int math optimization"
2769 " (overflow traps): ");
2770 return NULL;
2771 }
2772 if (need_wrapping_integral_overflow
2773 && !TYPE_OVERFLOW_WRAPS (type)
2774 && operation_can_overflow (code))
2775 {
2776 /* Changing the order of operations changes the semantics. */
2777 if (dump_enabled_p ())
2778 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2779 "reduction: unsafe int math optimization"
2780 " (overflow doesn't wrap): ");
2781 return NULL;
2782 }
2783 }
2784 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2785 {
2786 /* Changing the order of operations changes the semantics. */
2787 if (dump_enabled_p ())
2788 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2789 "reduction: unsafe fixed-point math optimization: ");
2790 return NULL;
2791 }
2792 }
2793
2794 /* Reduction is safe. We're dealing with one of the following:
2795 1) integer arithmetic and no trapv
2796 2) floating point arithmetic, and special flags permit this optimization
2797 3) nested cycle (i.e., outer loop vectorization). */
2798 if (TREE_CODE (op1) == SSA_NAME)
2799 def1 = SSA_NAME_DEF_STMT (op1);
2800
2801 if (TREE_CODE (op2) == SSA_NAME)
2802 def2 = SSA_NAME_DEF_STMT (op2);
2803
2804 if (code != COND_EXPR
2805 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2806 {
2807 if (dump_enabled_p ())
2808 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2809 return NULL;
2810 }
2811
2812 /* Check that one def is the reduction def, defined by PHI,
2813 the other def is either defined in the loop ("vect_internal_def"),
2814 or it's an induction (defined by a loop-header phi-node). */
2815
2816 if (def2 && def2 == phi
2817 && (code == COND_EXPR
2818 || !def1 || gimple_nop_p (def1)
2819 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2820 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2821 && (is_gimple_assign (def1)
2822 || is_gimple_call (def1)
2823 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2824 == vect_induction_def
2825 || (gimple_code (def1) == GIMPLE_PHI
2826 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2827 == vect_internal_def
2828 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2829 {
2830 if (dump_enabled_p ())
2831 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2832 return def_stmt;
2833 }
2834
2835 if (def1 && def1 == phi
2836 && (code == COND_EXPR
2837 || !def2 || gimple_nop_p (def2)
2838 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2839 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2840 && (is_gimple_assign (def2)
2841 || is_gimple_call (def2)
2842 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2843 == vect_induction_def
2844 || (gimple_code (def2) == GIMPLE_PHI
2845 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2846 == vect_internal_def
2847 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2848 {
2849 if (check_reduction
2850 && orig_code != MINUS_EXPR)
2851 {
2852 if (code == COND_EXPR)
2853 {
2854 /* No current known use where this case would be useful. */
2855 if (dump_enabled_p ())
2856 report_vect_op (MSG_NOTE, def_stmt,
2857 "detected reduction: cannot currently swap "
2858 "operands for cond_expr");
2859 return NULL;
2860 }
2861
2862 /* Swap operands (just for simplicity - so that the rest of the code
2863 can assume that the reduction variable is always the last (second)
2864 argument). */
2865 if (dump_enabled_p ())
2866 report_vect_op (MSG_NOTE, def_stmt,
2867 "detected reduction: need to swap operands: ");
2868
2869 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2870 gimple_assign_rhs2_ptr (def_stmt));
2871
2872 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2873 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2874 }
2875 else
2876 {
2877 if (dump_enabled_p ())
2878 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2879 }
2880
2881 return def_stmt;
2882 }
2883
2884 /* Try to find SLP reduction chain. */
2885 if (check_reduction && code != COND_EXPR
2886 && vect_is_slp_reduction (loop_info, phi, def_stmt))
2887 {
2888 if (dump_enabled_p ())
2889 report_vect_op (MSG_NOTE, def_stmt,
2890 "reduction: detected reduction chain: ");
2891
2892 return def_stmt;
2893 }
2894
2895 if (dump_enabled_p ())
2896 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2897 "reduction: unknown pattern: ");
2898
2899 return NULL;
2900 }
2901
2902 /* Wrapper around vect_is_simple_reduction, which will modify code
2903 in-place if it enables detection of more reductions. Arguments
2904 as there. */
2905
2906 gimple *
2907 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
2908 bool check_reduction, bool *double_reduc,
2909 bool need_wrapping_integral_overflow)
2910 {
2911 enum vect_reduction_type v_reduc_type;
2912 return vect_is_simple_reduction (loop_info, phi, check_reduction,
2913 double_reduc,
2914 need_wrapping_integral_overflow,
2915 &v_reduc_type);
2916 }
2917
2918 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
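/* For instance, with 23 known scalar iterations, PEEL_ITERS_PROLOGUE == 2
   and a vectorization factor of 4, the epilogue gets (23 - 2) % 4 == 1
   iteration (bumped to 4 if peeling for gaps is required); when the
   iteration count is unknown, the epilogue count is guessed as VF/2. */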
2919 int
2920 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2921 int *peel_iters_epilogue,
2922 stmt_vector_for_cost *scalar_cost_vec,
2923 stmt_vector_for_cost *prologue_cost_vec,
2924 stmt_vector_for_cost *epilogue_cost_vec)
2925 {
2926 int retval = 0;
2927 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2928
2929 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2930 {
2931 *peel_iters_epilogue = vf/2;
2932 if (dump_enabled_p ())
2933 dump_printf_loc (MSG_NOTE, vect_location,
2934 "cost model: epilogue peel iters set to vf/2 "
2935 "because loop iterations are unknown .\n");
2936
2937 /* If peeled iterations are known but number of scalar loop
2938 iterations are unknown, count a taken branch per peeled loop. */
2939 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2940 NULL, 0, vect_prologue);
2941 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2942 NULL, 0, vect_epilogue);
2943 }
2944 else
2945 {
2946 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2947 peel_iters_prologue = niters < peel_iters_prologue ?
2948 niters : peel_iters_prologue;
2949 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2950 /* If we need to peel for gaps, but no peeling is required, we have to
2951 peel VF iterations. */
2952 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2953 *peel_iters_epilogue = vf;
2954 }
2955
2956 stmt_info_for_cost *si;
2957 int j;
2958 if (peel_iters_prologue)
2959 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2960 retval += record_stmt_cost (prologue_cost_vec,
2961 si->count * peel_iters_prologue,
2962 si->kind, NULL, si->misalign,
2963 vect_prologue);
2964 if (*peel_iters_epilogue)
2965 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2966 retval += record_stmt_cost (epilogue_cost_vec,
2967 si->count * *peel_iters_epilogue,
2968 si->kind, NULL, si->misalign,
2969 vect_epilogue);
2970
2971 return retval;
2972 }
2973
2974 /* Function vect_estimate_min_profitable_iters
2975
2976 Return the number of iterations required for the vector version of the
2977 loop to be profitable relative to the cost of the scalar version of the
2978 loop. */
2979
2980 static void
2981 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2982 int *ret_min_profitable_niters,
2983 int *ret_min_profitable_estimate)
2984 {
2985 int min_profitable_iters;
2986 int min_profitable_estimate;
2987 int peel_iters_prologue;
2988 int peel_iters_epilogue;
2989 unsigned vec_inside_cost = 0;
2990 int vec_outside_cost = 0;
2991 unsigned vec_prologue_cost = 0;
2992 unsigned vec_epilogue_cost = 0;
2993 int scalar_single_iter_cost = 0;
2994 int scalar_outside_cost = 0;
2995 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2996 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2997 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2998
2999 /* Cost model disabled. */
3000 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3001 {
3002 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3003 *ret_min_profitable_niters = 0;
3004 *ret_min_profitable_estimate = 0;
3005 return;
3006 }
3007
3008 /* Requires loop versioning tests to handle misalignment. */
3009 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3010 {
3011 /* FIXME: Make cost depend on complexity of individual check. */
3012 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3013 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3014 vect_prologue);
3015 dump_printf (MSG_NOTE,
3016 "cost model: Adding cost of checks for loop "
3017 "versioning to treat misalignment.\n");
3018 }
3019
3020 /* Requires loop versioning with alias checks. */
3021 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3022 {
3023 /* FIXME: Make cost depend on complexity of individual check. */
3024 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3025 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3026 vect_prologue);
3027 dump_printf (MSG_NOTE,
3028 "cost model: Adding cost of checks for loop "
3029 "versioning aliasing.\n");
3030 }
3031
3032 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3033 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3034 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3035 vect_prologue);
3036
3037 /* Count statements in scalar loop. Using this as scalar cost for a single
3038 iteration for now.
3039
3040 TODO: Add outer loop support.
3041
3042 TODO: Consider assigning different costs to different scalar
3043 statements. */
3044
3045 scalar_single_iter_cost
3046 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3047
3048 /* Add additional cost for the peeled instructions in prologue and epilogue
3049 loop.
3050
3051 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3052 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3053
3054 TODO: Build an expression that represents peel_iters for prologue and
3055 epilogue to be used in a run-time test. */
3056
3057 if (npeel < 0)
3058 {
3059 peel_iters_prologue = vf/2;
3060 dump_printf (MSG_NOTE, "cost model: "
3061 "prologue peel iters set to vf/2.\n");
3062
3063 /* If peeling for alignment is unknown, loop bound of main loop becomes
3064 unknown. */
3065 peel_iters_epilogue = vf/2;
3066 dump_printf (MSG_NOTE, "cost model: "
3067 "epilogue peel iters set to vf/2 because "
3068 "peeling for alignment is unknown.\n");
3069
3070 /* If peeled iterations are unknown, count a taken branch and a not taken
3071 branch per peeled loop. Even if scalar loop iterations are known,
3072 vector iterations are not known since peeled prologue iterations are
3073 not known. Hence guards remain the same. */
3074 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3075 NULL, 0, vect_prologue);
3076 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3077 NULL, 0, vect_prologue);
3078 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3079 NULL, 0, vect_epilogue);
3080 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3081 NULL, 0, vect_epilogue);
3082 stmt_info_for_cost *si;
3083 int j;
3084 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3085 {
3086 struct _stmt_vec_info *stmt_info
3087 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3088 (void) add_stmt_cost (target_cost_data,
3089 si->count * peel_iters_prologue,
3090 si->kind, stmt_info, si->misalign,
3091 vect_prologue);
3092 (void) add_stmt_cost (target_cost_data,
3093 si->count * peel_iters_epilogue,
3094 si->kind, stmt_info, si->misalign,
3095 vect_epilogue);
3096 }
3097 }
3098 else
3099 {
3100 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3101 stmt_info_for_cost *si;
3102 int j;
3103 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3104
3105 prologue_cost_vec.create (2);
3106 epilogue_cost_vec.create (2);
3107 peel_iters_prologue = npeel;
3108
3109 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3110 &peel_iters_epilogue,
3111 &LOOP_VINFO_SCALAR_ITERATION_COST
3112 (loop_vinfo),
3113 &prologue_cost_vec,
3114 &epilogue_cost_vec);
3115
3116 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3117 {
3118 struct _stmt_vec_info *stmt_info
3119 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3120 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3121 si->misalign, vect_prologue);
3122 }
3123
3124 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3125 {
3126 struct _stmt_vec_info *stmt_info
3127 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3128 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3129 si->misalign, vect_epilogue);
3130 }
3131
3132 prologue_cost_vec.release ();
3133 epilogue_cost_vec.release ();
3134 }
3135
3136 /* FORNOW: The scalar outside cost is incremented in one of the
3137 following ways:
3138
3139 1. The vectorizer checks for alignment and aliasing and generates
3140 a condition that allows dynamic vectorization. A cost model
3141 check is ANDED with the versioning condition. Hence scalar code
3142 path now has the added cost of the versioning check.
3143
3144 if (cost > th & versioning_check)
3145 jmp to vector code
3146
3147 Hence run-time scalar is incremented by not-taken branch cost.
3148
3149 2. The vectorizer then checks if a prologue is required. If the
3150 cost model check was not done before during versioning, it has to
3151 be done before the prologue check.
3152
3153 if (cost <= th)
3154 prologue = scalar_iters
3155 if (prologue == 0)
3156 jmp to vector code
3157 else
3158 execute prologue
3159 if (prologue == num_iters)
3160 go to exit
3161
3162 Hence the run-time scalar cost is incremented by a taken branch,
3163 plus a not-taken branch, plus a taken branch cost.
3164
3165 3. The vectorizer then checks if an epilogue is required. If the
3166 cost model check was not done before during prologue check, it
3167 has to be done with the epilogue check.
3168
3169 if (prologue == 0)
3170 jmp to vector code
3171 else
3172 execute prologue
3173 if (prologue == num_iters)
3174 go to exit
3175 vector code:
3176 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3177 jmp to epilogue
3178
3179 Hence the run-time scalar cost should be incremented by 2 taken
3180 branches.
3181
3182 TODO: The back end may reorder the BBS's differently and reverse
3183 conditions/branch directions. Change the estimates below to
3184 something more reasonable. */
3185
3186 /* If the number of iterations is known and we do not do versioning, we can
3187 decide whether to vectorize at compile time. Hence the scalar version
3188 does not carry cost model guard costs. */
3189 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3190 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3191 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3192 {
3193 /* Cost model check occurs at versioning. */
3194 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3195 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3196 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3197 else
3198 {
3199 /* Cost model check occurs at prologue generation. */
3200 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3201 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3202 + vect_get_stmt_cost (cond_branch_not_taken);
3203 /* Cost model check occurs at epilogue generation. */
3204 else
3205 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3206 }
3207 }
3208
3209 /* Complete the target-specific cost calculations. */
3210 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3211 &vec_inside_cost, &vec_epilogue_cost);
3212
3213 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3214
3215 if (dump_enabled_p ())
3216 {
3217 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3218 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3219 vec_inside_cost);
3220 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3221 vec_prologue_cost);
3222 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3223 vec_epilogue_cost);
3224 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3225 scalar_single_iter_cost);
3226 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3227 scalar_outside_cost);
3228 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3229 vec_outside_cost);
3230 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3231 peel_iters_prologue);
3232 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3233 peel_iters_epilogue);
3234 }
3235
3236 /* Calculate number of iterations required to make the vector version
3237 profitable, relative to the loop bodies only. The following condition
3238 must hold true:
3239 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3240 where
3241 SIC = scalar iteration cost, VIC = vector iteration cost,
3242 VOC = vector outside cost, VF = vectorization factor,
3243 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3244 SOC = scalar outside cost for run time cost model check. */
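/* For instance, assuming SIC == 4, VIC == 6, VF == 4, VOC == 20 and no
   peeling or scalar outside cost, the inequality 4*n > 6*n/4 + 20 first
   holds at n == 9, so roughly 9 iterations are needed before the vector
   loop pays off. */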
3245
3246 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3247 {
3248 if (vec_outside_cost <= 0)
3249 min_profitable_iters = 1;
3250 else
3251 {
3252 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3253 - vec_inside_cost * peel_iters_prologue
3254 - vec_inside_cost * peel_iters_epilogue)
3255 / ((scalar_single_iter_cost * vf)
3256 - vec_inside_cost);
3257
3258 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3259 <= (((int) vec_inside_cost * min_profitable_iters)
3260 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3261 min_profitable_iters++;
3262 }
3263 }
3264 /* vector version will never be profitable. */
3265 else
3266 {
3267 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3268 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3269 "did not happen for a simd loop");
3270
3271 if (dump_enabled_p ())
3272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3273 "cost model: the vector iteration cost = %d "
3274 "divided by the scalar iteration cost = %d "
3275 "is greater or equal to the vectorization factor = %d"
3276 ".\n",
3277 vec_inside_cost, scalar_single_iter_cost, vf);
3278 *ret_min_profitable_niters = -1;
3279 *ret_min_profitable_estimate = -1;
3280 return;
3281 }
3282
3283 dump_printf (MSG_NOTE,
3284 " Calculated minimum iters for profitability: %d\n",
3285 min_profitable_iters);
3286
3287 min_profitable_iters =
3288 min_profitable_iters < vf ? vf : min_profitable_iters;
3289
3290 /* Because the condition we create is:
3291 if (niters <= min_profitable_iters)
3292 then skip the vectorized loop. */
3293 min_profitable_iters--;
3294
3295 if (dump_enabled_p ())
3296 dump_printf_loc (MSG_NOTE, vect_location,
3297 " Runtime profitability threshold = %d\n",
3298 min_profitable_iters);
3299
3300 *ret_min_profitable_niters = min_profitable_iters;
3301
3302 /* Calculate number of iterations required to make the vector version
3303 profitable, relative to the loop bodies only.
3304
3305 Non-vectorized variant is SIC * niters and it must win over vector
3306 variant on the expected loop trip count. The following condition must hold true:
3307 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3308
3309 if (vec_outside_cost <= 0)
3310 min_profitable_estimate = 1;
3311 else
3312 {
3313 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3314 - vec_inside_cost * peel_iters_prologue
3315 - vec_inside_cost * peel_iters_epilogue)
3316 / ((scalar_single_iter_cost * vf)
3317 - vec_inside_cost);
3318 }
3319 min_profitable_estimate--;
3320 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3321 if (dump_enabled_p ())
3322 dump_printf_loc (MSG_NOTE, vect_location,
3323 " Static estimate profitability threshold = %d\n",
3324 min_profitable_estimate);
3325
3326 *ret_min_profitable_estimate = min_profitable_estimate;
3327 }
3328
3329 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3330 vector elements (not bits) for a vector of mode MODE. */
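/* E.g., for a 4-element vector and OFFSET == 2 this writes the selector
   { 2, 3, 4, 5 }: the two high elements of the first input followed by the
   two low elements of the second input. */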
3331 static void
3332 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3333 unsigned char *sel)
3334 {
3335 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3336
3337 for (i = 0; i < nelt; i++)
3338 sel[i] = (i + offset) & (2*nelt - 1);
3339 }
3340
3341 /* Checks whether the target supports whole-vector shifts for vectors of mode
3342 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3343 it supports vec_perm_const with masks for all necessary shift amounts. */
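/* E.g., for an 8-element vector this checks element shifts by 4, 2 and 1,
   which matches the shifts used by the log2-step reduction epilogue. */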
3344 static bool
3345 have_whole_vector_shift (enum machine_mode mode)
3346 {
3347 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3348 return true;
3349
3350 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3351 return false;
3352
3353 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3354 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3355
3356 for (i = nelt/2; i >= 1; i/=2)
3357 {
3358 calc_vec_perm_mask_for_shift (mode, i, sel);
3359 if (!can_vec_perm_p (mode, false, sel))
3360 return false;
3361 }
3362 return true;
3363 }
3364
3365 /* Return the reduction operand (with index REDUC_INDEX) of STMT. */
3366
3367 static tree
3368 get_reduction_op (gimple *stmt, int reduc_index)
3369 {
3370 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3371 {
3372 case GIMPLE_SINGLE_RHS:
3373 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3374 == ternary_op);
3375 return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3376 case GIMPLE_UNARY_RHS:
3377 return gimple_assign_rhs1 (stmt);
3378 case GIMPLE_BINARY_RHS:
3379 return (reduc_index
3380 ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3381 case GIMPLE_TERNARY_RHS:
3382 return gimple_op (stmt, reduc_index + 1);
3383 default:
3384 gcc_unreachable ();
3385 }
3386 }
3387
3388 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3389 functions. Design better to avoid maintenance issues. */
3390
3391 /* Function vect_model_reduction_cost.
3392
3393 Models cost for a reduction operation, including the vector ops
3394 generated within the strip-mine loop, the initial definition before
3395 the loop, and the epilogue code that must be generated. */
3396
3397 static bool
3398 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3399 int ncopies, int reduc_index)
3400 {
3401 int prologue_cost = 0, epilogue_cost = 0;
3402 enum tree_code code;
3403 optab optab;
3404 tree vectype;
3405 gimple *stmt, *orig_stmt;
3406 tree reduction_op;
3407 machine_mode mode;
3408 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3409 struct loop *loop = NULL;
3410 void *target_cost_data;
3411
3412 if (loop_vinfo)
3413 {
3414 loop = LOOP_VINFO_LOOP (loop_vinfo);
3415 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3416 }
3417 else
3418 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3419
3420 /* Condition reductions generate two reductions in the loop. */
3421 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3422 ncopies *= 2;
3423
3424 /* Cost of reduction op inside loop. */
3425 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3426 stmt_info, 0, vect_body);
3427 stmt = STMT_VINFO_STMT (stmt_info);
3428
3429 reduction_op = get_reduction_op (stmt, reduc_index);
3430
3431 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3432 if (!vectype)
3433 {
3434 if (dump_enabled_p ())
3435 {
3436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3437 "unsupported data-type ");
3438 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3439 TREE_TYPE (reduction_op));
3440 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3441 }
3442 return false;
3443 }
3444
3445 mode = TYPE_MODE (vectype);
3446 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3447
3448 if (!orig_stmt)
3449 orig_stmt = STMT_VINFO_STMT (stmt_info);
3450
3451 code = gimple_assign_rhs_code (orig_stmt);
3452
3453 /* Add in cost for initial definition.
3454 For cond reduction we have four vectors: initial index, step, initial
3455 result of the data reduction, initial value of the index reduction. */
3456 int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3457 == COND_REDUCTION ? 4 : 1;
3458 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3459 scalar_to_vec, stmt_info, 0,
3460 vect_prologue);
3461
3462 /* Determine cost of epilogue code.
3463
3464 We have a reduction operator that will reduce the vector in one statement.
3465 Also requires scalar extract. */
3466
3467 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3468 {
3469 if (reduc_code != ERROR_MARK)
3470 {
3471 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3472 {
3473 /* An EQ stmt and an COND_EXPR stmt. */
3474 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3475 vector_stmt, stmt_info, 0,
3476 vect_epilogue);
3477 /* Reduction of the max index and a reduction of the found
3478 values. */
3479 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3480 vec_to_scalar, stmt_info, 0,
3481 vect_epilogue);
3482 /* A broadcast of the max value. */
3483 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3484 scalar_to_vec, stmt_info, 0,
3485 vect_epilogue);
3486 }
3487 else
3488 {
3489 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3490 stmt_info, 0, vect_epilogue);
3491 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3492 vec_to_scalar, stmt_info, 0,
3493 vect_epilogue);
3494 }
3495 }
3496 else
3497 {
3498 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3499 tree bitsize =
3500 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3501 int element_bitsize = tree_to_uhwi (bitsize);
3502 int nelements = vec_size_in_bits / element_bitsize;
3503
3504 optab = optab_for_tree_code (code, vectype, optab_default);
3505
3506 /* We have a whole vector shift available. */
3507 if (VECTOR_MODE_P (mode)
3508 && optab_handler (optab, mode) != CODE_FOR_nothing
3509 && have_whole_vector_shift (mode))
3510 {
3511 /* Final reduction via vector shifts and the reduction operator.
3512 Also requires scalar extract. */
3513 epilogue_cost += add_stmt_cost (target_cost_data,
3514 exact_log2 (nelements) * 2,
3515 vector_stmt, stmt_info, 0,
3516 vect_epilogue);
3517 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3518 vec_to_scalar, stmt_info, 0,
3519 vect_epilogue);
3520 }
3521 else
3522 /* Use extracts and reduction op for final reduction. For N
3523 elements, we have N extracts and N-1 reduction ops. */
3524 epilogue_cost += add_stmt_cost (target_cost_data,
3525 nelements + nelements - 1,
3526 vector_stmt, stmt_info, 0,
3527 vect_epilogue);
3528 }
3529 }
3530
3531 if (dump_enabled_p ())
3532 dump_printf (MSG_NOTE,
3533 "vect_model_reduction_cost: inside_cost = %d, "
3534 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3535 prologue_cost, epilogue_cost);
3536
3537 return true;
3538 }
3539
3540
3541 /* Function vect_model_induction_cost.
3542
3543 Models cost for induction operations. */
3544
3545 static void
3546 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3547 {
3548 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3549 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3550 unsigned inside_cost, prologue_cost;
3551
3552 /* loop cost for vec_loop. */
3553 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3554 stmt_info, 0, vect_body);
3555
3556 /* prologue cost for vec_init and vec_step. */
3557 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3558 stmt_info, 0, vect_prologue);
3559
3560 if (dump_enabled_p ())
3561 dump_printf_loc (MSG_NOTE, vect_location,
3562 "vect_model_induction_cost: inside_cost = %d, "
3563 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3564 }
3565
3566
3567 /* Function get_initial_def_for_induction
3568
3569 Input:
3570 IV_PHI - the phi node of an induction variable in the loop that is being
3571 vectorized; its loop-preheader argument is the initial value.
3572
3573 Output:
3574 Return a vector variable, initialized with the first VF values of
3575 the induction variable. E.g., for an iv with IV_PHI='X' and
3576 evolution S, for a vector of 4 units, we want to return:
3577 [X, X + S, X + 2*S, X + 3*S]. */
3578
3579 static tree
3580 get_initial_def_for_induction (gimple *iv_phi)
3581 {
3582 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3583 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3584 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3585 tree vectype;
3586 int nunits;
3587 edge pe = loop_preheader_edge (loop);
3588 struct loop *iv_loop;
3589 basic_block new_bb;
3590 tree new_vec, vec_init, vec_step, t;
3591 tree new_name;
3592 gimple *new_stmt;
3593 gphi *induction_phi;
3594 tree induc_def, vec_def, vec_dest;
3595 tree init_expr, step_expr;
3596 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3597 int i;
3598 int ncopies;
3599 tree expr;
3600 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3601 bool nested_in_vect_loop = false;
3602 gimple_seq stmts;
3603 imm_use_iterator imm_iter;
3604 use_operand_p use_p;
3605 gimple *exit_phi;
3606 edge latch_e;
3607 tree loop_arg;
3608 gimple_stmt_iterator si;
3609 basic_block bb = gimple_bb (iv_phi);
3610 tree stepvectype;
3611 tree resvectype;
3612
3613 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3614 if (nested_in_vect_loop_p (loop, iv_phi))
3615 {
3616 nested_in_vect_loop = true;
3617 iv_loop = loop->inner;
3618 }
3619 else
3620 iv_loop = loop;
3621 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3622
3623 latch_e = loop_latch_edge (iv_loop);
3624 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3625
3626 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3627 gcc_assert (step_expr != NULL_TREE);
3628
3629 pe = loop_preheader_edge (iv_loop);
3630 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3631 loop_preheader_edge (iv_loop));
3632
3633 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3634 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3635 gcc_assert (vectype);
3636 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3637 ncopies = vf / nunits;
3638
3639 gcc_assert (phi_info);
3640 gcc_assert (ncopies >= 1);
3641
3642 /* Convert the step to the desired type. */
3643 stmts = NULL;
3644 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
3645 if (stmts)
3646 {
3647 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3648 gcc_assert (!new_bb);
3649 }
3650
3651 /* Find the first insertion point in the BB. */
3652 si = gsi_after_labels (bb);
3653
3654 /* Create the vector that holds the initial_value of the induction. */
3655 if (nested_in_vect_loop)
3656 {
3657 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3658 been created during vectorization of previous stmts. We obtain it
3659 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3660 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi);
3661 /* If the initial value is not of proper type, convert it. */
3662 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3663 {
3664 new_stmt
3665 = gimple_build_assign (vect_get_new_ssa_name (vectype,
3666 vect_simple_var,
3667 "vec_iv_"),
3668 VIEW_CONVERT_EXPR,
3669 build1 (VIEW_CONVERT_EXPR, vectype,
3670 vec_init));
3671 vec_init = gimple_assign_lhs (new_stmt);
3672 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3673 new_stmt);
3674 gcc_assert (!new_bb);
3675 set_vinfo_for_stmt (new_stmt,
3676 new_stmt_vec_info (new_stmt, loop_vinfo));
3677 }
3678 }
3679 else
3680 {
3681 vec<constructor_elt, va_gc> *v;
3682
3683 /* iv_loop is the loop to be vectorized. Create:
3684 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3685 stmts = NULL;
3686 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
3687
3688 vec_alloc (v, nunits);
3689 bool constant_p = is_gimple_min_invariant (new_name);
3690 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3691 for (i = 1; i < nunits; i++)
3692 {
3693 /* Create: new_name_i = new_name + step_expr */
3694 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
3695 new_name, step_expr);
3696 if (!is_gimple_min_invariant (new_name))
3697 constant_p = false;
3698 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3699 }
3700 if (stmts)
3701 {
3702 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3703 gcc_assert (!new_bb);
3704 }
3705
3706 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3707 if (constant_p)
3708 new_vec = build_vector_from_ctor (vectype, v);
3709 else
3710 new_vec = build_constructor (vectype, v);
3711 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3712 }
3713
3714
3715 /* Create the vector that holds the step of the induction. */
3716 if (nested_in_vect_loop)
3717 /* iv_loop is nested in the loop to be vectorized. Generate:
3718 vec_step = [S, S, S, S] */
3719 new_name = step_expr;
3720 else
3721 {
3722 /* iv_loop is the loop to be vectorized. Generate:
3723 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3724 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3725 {
3726 expr = build_int_cst (integer_type_node, vf);
3727 expr = fold_convert (TREE_TYPE (step_expr), expr);
3728 }
3729 else
3730 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3731 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3732 expr, step_expr);
3733 if (TREE_CODE (step_expr) == SSA_NAME)
3734 new_name = vect_init_vector (iv_phi, new_name,
3735 TREE_TYPE (step_expr), NULL);
3736 }
3737
3738 t = unshare_expr (new_name);
3739 gcc_assert (CONSTANT_CLASS_P (new_name)
3740 || TREE_CODE (new_name) == SSA_NAME);
3741 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3742 gcc_assert (stepvectype);
3743 new_vec = build_vector_from_val (stepvectype, t);
3744 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3745
3746
3747 /* Create the following def-use cycle:
3748 loop prolog:
3749 vec_init = ...
3750 vec_step = ...
3751 loop:
3752 vec_iv = PHI <vec_init, vec_loop>
3753 ...
3754 STMT
3755 ...
3756 vec_loop = vec_iv + vec_step; */
3757
3758 /* Create the induction-phi that defines the induction-operand. */
3759 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3760 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3761 set_vinfo_for_stmt (induction_phi,
3762 new_stmt_vec_info (induction_phi, loop_vinfo));
3763 induc_def = PHI_RESULT (induction_phi);
3764
3765 /* Create the iv update inside the loop */
3766 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3767 vec_def = make_ssa_name (vec_dest, new_stmt);
3768 gimple_assign_set_lhs (new_stmt, vec_def);
3769 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3770 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
3771
3772 /* Set the arguments of the phi node: */
3773 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3774 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3775 UNKNOWN_LOCATION);
3776
3777
3778 /* If the vectorization factor (VF) is bigger than the number
3779 of elements that we can fit in a vectype (nunits), we have to generate
3780 more than one vector stmt - i.e., we need to "unroll" the
3781 vector stmt by a factor of VF/nunits. For more details see the
3782 documentation in vectorizable_operation. */
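  /* For example (illustration only), with VF == 8 and a 4-element vectype
     we need ncopies == 2; the additional copy generated below is advanced
     by a vector step of [nunits*S, ..., nunits*S], i.e. 4*S per element.  */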
3783
3784 if (ncopies > 1)
3785 {
3786 stmt_vec_info prev_stmt_vinfo;
3787 /* FORNOW. This restriction should be relaxed. */
3788 gcc_assert (!nested_in_vect_loop);
3789
3790 /* Create the vector that holds the step of the induction. */
3791 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3792 {
3793 expr = build_int_cst (integer_type_node, nunits);
3794 expr = fold_convert (TREE_TYPE (step_expr), expr);
3795 }
3796 else
3797 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3798 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3799 expr, step_expr);
3800 if (TREE_CODE (step_expr) == SSA_NAME)
3801 new_name = vect_init_vector (iv_phi, new_name,
3802 TREE_TYPE (step_expr), NULL);
3803 t = unshare_expr (new_name);
3804 gcc_assert (CONSTANT_CLASS_P (new_name)
3805 || TREE_CODE (new_name) == SSA_NAME);
3806 new_vec = build_vector_from_val (stepvectype, t);
3807 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3808
3809 vec_def = induc_def;
3810 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3811 for (i = 1; i < ncopies; i++)
3812 {
3813 /* vec_i = vec_prev + vec_step */
3814 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3815 vec_def, vec_step);
3816 vec_def = make_ssa_name (vec_dest, new_stmt);
3817 gimple_assign_set_lhs (new_stmt, vec_def);
3818
3819 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3820 if (!useless_type_conversion_p (resvectype, vectype))
3821 {
3822 new_stmt
3823 = gimple_build_assign
3824 (vect_get_new_vect_var (resvectype, vect_simple_var,
3825 "vec_iv_"),
3826 VIEW_CONVERT_EXPR,
3827 build1 (VIEW_CONVERT_EXPR, resvectype,
3828 gimple_assign_lhs (new_stmt)));
3829 gimple_assign_set_lhs (new_stmt,
3830 make_ssa_name
3831 (gimple_assign_lhs (new_stmt), new_stmt));
3832 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3833 }
3834 set_vinfo_for_stmt (new_stmt,
3835 new_stmt_vec_info (new_stmt, loop_vinfo));
3836 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3837 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3838 }
3839 }
3840
3841 if (nested_in_vect_loop)
3842 {
3843 /* Find the loop-closed exit-phi of the induction, and record
3844 the final vector of induction results: */
3845 exit_phi = NULL;
3846 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3847 {
3848 gimple *use_stmt = USE_STMT (use_p);
3849 if (is_gimple_debug (use_stmt))
3850 continue;
3851
3852 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3853 {
3854 exit_phi = use_stmt;
3855 break;
3856 }
3857 }
3858 if (exit_phi)
3859 {
3860 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3861 /* FORNOW. Currently not supporting the case that an inner-loop induction
3862 is not used in the outer-loop (i.e. only outside the outer-loop). */
3863 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3864 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3865
3866 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3867 if (dump_enabled_p ())
3868 {
3869 dump_printf_loc (MSG_NOTE, vect_location,
3870 "vector of inductions after inner-loop:");
3871 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3872 dump_printf (MSG_NOTE, "\n");
3873 }
3874 }
3875 }
3876
3877
3878 if (dump_enabled_p ())
3879 {
3880 dump_printf_loc (MSG_NOTE, vect_location,
3881 "transform induction: created def-use cycle: ");
3882 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3883 dump_printf (MSG_NOTE, "\n");
3884 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3885 SSA_NAME_DEF_STMT (vec_def), 0);
3886 dump_printf (MSG_NOTE, "\n");
3887 }
3888
3889 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3890 if (!useless_type_conversion_p (resvectype, vectype))
3891 {
3892 new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3893 vect_simple_var,
3894 "vec_iv_"),
3895 VIEW_CONVERT_EXPR,
3896 build1 (VIEW_CONVERT_EXPR, resvectype,
3897 induc_def));
3898 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3899 gimple_assign_set_lhs (new_stmt, induc_def);
3900 si = gsi_after_labels (bb);
3901 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3902 set_vinfo_for_stmt (new_stmt,
3903 new_stmt_vec_info (new_stmt, loop_vinfo));
3904 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3905 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3906 }
3907
3908 return induc_def;
3909 }
3910
3911
3912 /* Function get_initial_def_for_reduction
3913
3914 Input:
3915 STMT - a stmt that performs a reduction operation in the loop.
3916 INIT_VAL - the initial value of the reduction variable
3917
3918 Output:
3919 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3920 of the reduction (used for adjusting the epilog - see below).
3921 Return a vector variable, initialized according to the operation that STMT
3922 performs. This vector will be used as the initial value of the
3923 vector of partial results.
3924
3925 Option1 (adjust in epilog): Initialize the vector as follows:
3926 add/bit or/xor: [0,0,...,0,0]
3927 mult/bit and: [1,1,...,1,1]
3928 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3929 and when necessary (e.g. add/mult case) let the caller know
3930 that it needs to adjust the result by init_val.
3931
3932 Option2: Initialize the vector as follows:
3933 add/bit or/xor: [init_val,0,0,...,0]
3934 mult/bit and: [init_val,1,1,...,1]
3935 min/max/cond_expr: [init_val,init_val,...,init_val]
3936 and no adjustments are needed.
3937
3938 For example, for the following code:
3939
3940 s = init_val;
3941 for (i=0;i<n;i++)
3942 s = s + a[i];
3943
3944 STMT is 's = s + a[i]', and the reduction variable is 's'.
3945 For a vector of 4 units, we want to return either [0,0,0,init_val],
3946 or [0,0,0,0] and let the caller know that it needs to adjust
3947 the result at the end by 'init_val'.
3948
3949 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3950 is not NULL, because the initialization vector is then simpler (the same
3951 element in all entries); otherwise we use Option2.
3952
3953 A cost model should help decide between these two schemes. */
3954
3955 tree
3956 get_initial_def_for_reduction (gimple *stmt, tree init_val,
3957 tree *adjustment_def)
3958 {
3959 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3960 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3961 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3962 tree scalar_type = TREE_TYPE (init_val);
3963 tree vectype = get_vectype_for_scalar_type (scalar_type);
3964 int nunits;
3965 enum tree_code code = gimple_assign_rhs_code (stmt);
3966 tree def_for_init;
3967 tree init_def;
3968 tree *elts;
3969 int i;
3970 bool nested_in_vect_loop = false;
3971 tree init_value;
3972 REAL_VALUE_TYPE real_init_val = dconst0;
3973 int int_init_val = 0;
3974 gimple *def_stmt = NULL;
3975
3976 gcc_assert (vectype);
3977 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3978
3979 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3980 || SCALAR_FLOAT_TYPE_P (scalar_type));
3981
3982 if (nested_in_vect_loop_p (loop, stmt))
3983 nested_in_vect_loop = true;
3984 else
3985 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3986
3987 /* In case of double reduction we only create a vector variable to be put
3988 in the reduction phi node. The actual statement creation is done in
3989 vect_create_epilog_for_reduction. */
3990 if (adjustment_def && nested_in_vect_loop
3991 && TREE_CODE (init_val) == SSA_NAME
3992 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3993 && gimple_code (def_stmt) == GIMPLE_PHI
3994 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3995 && vinfo_for_stmt (def_stmt)
3996 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3997 == vect_double_reduction_def)
3998 {
3999 *adjustment_def = NULL;
4000 return vect_create_destination_var (init_val, vectype);
4001 }
4002
4003 if (TREE_CONSTANT (init_val))
4004 {
4005 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4006 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
4007 else
4008 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
4009 }
4010 else
4011 init_value = init_val;
4012
4013 switch (code)
4014 {
4015 case WIDEN_SUM_EXPR:
4016 case DOT_PROD_EXPR:
4017 case SAD_EXPR:
4018 case PLUS_EXPR:
4019 case MINUS_EXPR:
4020 case BIT_IOR_EXPR:
4021 case BIT_XOR_EXPR:
4022 case MULT_EXPR:
4023 case BIT_AND_EXPR:
4024 /* ADJUSTMENT_DEF is NULL when called from
4025 vect_create_epilog_for_reduction to vectorize double reduction. */
4026 if (adjustment_def)
4027 {
4028 if (nested_in_vect_loop)
4029 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt);
4030 else
4031 *adjustment_def = init_val;
4032 }
4033
4034 if (code == MULT_EXPR)
4035 {
4036 real_init_val = dconst1;
4037 int_init_val = 1;
4038 }
4039
4040 if (code == BIT_AND_EXPR)
4041 int_init_val = -1;
4042
4043 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4044 def_for_init = build_real (scalar_type, real_init_val);
4045 else
4046 def_for_init = build_int_cst (scalar_type, int_init_val);
4047
4048 /* Fill all vector elements but the first with '0' or '1'. */
4049 elts = XALLOCAVEC (tree, nunits);
4050 for (i = nunits - 2; i >= 0; --i)
4051 elts[i + 1] = def_for_init;
4052
4053 /* Option1: the first element is '0' or '1' as well. */
4054 if (adjustment_def)
4055 {
4056 elts[0] = def_for_init;
4057 init_def = build_vector (vectype, elts);
4058 break;
4059 }
4060
4061 /* Option2: the first element is INIT_VAL. */
4062 elts[0] = init_val;
4063 if (TREE_CONSTANT (init_val))
4064 init_def = build_vector (vectype, elts);
4065 else
4066 {
4067 vec<constructor_elt, va_gc> *v;
4068 vec_alloc (v, nunits);
4069 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
4070 for (i = 1; i < nunits; ++i)
4071 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
4072 init_def = build_constructor (vectype, v);
4073 }
4074
4075 break;
4076
4077 case MIN_EXPR:
4078 case MAX_EXPR:
4079 case COND_EXPR:
4080 if (adjustment_def)
4081 {
4082 *adjustment_def = NULL_TREE;
4083 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4084 {
4085 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4086 break;
4087 }
4088 }
4089 init_def = build_vector_from_val (vectype, init_value);
4090 break;
4091
4092 default:
4093 gcc_unreachable ();
4094 }
4095
4096 return init_def;
4097 }
4098
4099 /* Function vect_create_epilog_for_reduction
4100
4101 Create code at the loop-epilog to finalize the result of a reduction
4102 computation.
4103
4104 VECT_DEFS is the list of vectors of partial results, i.e., the lhs's of the
4105 vector reduction statements.
4106 STMT is the scalar reduction stmt that is being vectorized.
4107 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4108 number of elements that we can fit in a vectype (nunits). In this case
4109 we have to generate more than one vector stmt - i.e., we need to "unroll"
4110 the vector stmt by a factor VF/nunits. For more details see documentation
4111 in vectorizable_operation.
4112 REDUC_CODE is the tree-code for the epilog reduction.
4113 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4114 computation.
4115 REDUC_INDEX is the index of the operand in the right hand side of the
4116 statement that is defined by REDUCTION_PHI.
4117 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4118 SLP_NODE is an SLP node containing a group of reduction statements. The
4119 first one in this group is STMT.
4120 INDUCTION_INDEX is the vector of loop index values recording where the
4121 condition matched (used for condition reductions); otherwise undefined.
4122
4123 This function:
4124 1. Creates the reduction def-use cycles: sets the arguments for
4125 REDUCTION_PHIS:
4126 The loop-entry argument is the vectorized initial-value of the reduction.
4127 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4128 sums.
4129 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4130 by applying the operation specified by REDUC_CODE if available, or by
4131 other means (whole-vector shifts or a scalar loop).
4132 The function also creates a new phi node at the loop exit to preserve
4133 loop-closed form, as illustrated below.
4134
4135 The flow at the entry to this function:
4136
4137 loop:
4138 vec_def = phi <null, null> # REDUCTION_PHI
4139 VECT_DEF = vector_stmt # vectorized form of STMT
4140 s_loop = scalar_stmt # (scalar) STMT
4141 loop_exit:
4142 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4143 use <s_out0>
4144 use <s_out0>
4145
4146 The above is transformed by this function into:
4147
4148 loop:
4149 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4150 VECT_DEF = vector_stmt # vectorized form of STMT
4151 s_loop = scalar_stmt # (scalar) STMT
4152 loop_exit:
4153 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4154 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4155 v_out2 = reduce <v_out1>
4156 s_out3 = extract_field <v_out2, 0>
4157 s_out4 = adjust_result <s_out3>
4158 use <s_out4>
4159 use <s_out4>
4160 */
4161
4162 static void
4163 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4164 int ncopies, enum tree_code reduc_code,
4165 vec<gimple *> reduction_phis,
4166 int reduc_index, bool double_reduc,
4167 slp_tree slp_node, tree induction_index)
4168 {
4169 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4170 stmt_vec_info prev_phi_info;
4171 tree vectype;
4172 machine_mode mode;
4173 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4174 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4175 basic_block exit_bb;
4176 tree scalar_dest;
4177 tree scalar_type;
4178 gimple *new_phi = NULL, *phi;
4179 gimple_stmt_iterator exit_gsi;
4180 tree vec_dest;
4181 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4182 gimple *epilog_stmt = NULL;
4183 enum tree_code code = gimple_assign_rhs_code (stmt);
4184 gimple *exit_phi;
4185 tree bitsize;
4186 tree adjustment_def = NULL;
4187 tree vec_initial_def = NULL;
4188 tree reduction_op, expr, def, initial_def = NULL;
4189 tree orig_name, scalar_result;
4190 imm_use_iterator imm_iter, phi_imm_iter;
4191 use_operand_p use_p, phi_use_p;
4192 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4193 bool nested_in_vect_loop = false;
4194 auto_vec<gimple *> new_phis;
4195 auto_vec<gimple *> inner_phis;
4196 enum vect_def_type dt = vect_unknown_def_type;
4197 int j, i;
4198 auto_vec<tree> scalar_results;
4199 unsigned int group_size = 1, k, ratio;
4200 auto_vec<tree> vec_initial_defs;
4201 auto_vec<gimple *> phis;
4202 bool slp_reduc = false;
4203 tree new_phi_result;
4204 gimple *inner_phi = NULL;
4205
4206 if (slp_node)
4207 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4208
4209 if (nested_in_vect_loop_p (loop, stmt))
4210 {
4211 outer_loop = loop;
4212 loop = loop->inner;
4213 nested_in_vect_loop = true;
4214 gcc_assert (!slp_node);
4215 }
4216
4217 reduction_op = get_reduction_op (stmt, reduc_index);
4218
4219 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4220 gcc_assert (vectype);
4221 mode = TYPE_MODE (vectype);
4222
4223 /* 1. Create the reduction def-use cycle:
4224 Set the arguments of REDUCTION_PHIS, i.e., transform
4225
4226 loop:
4227 vec_def = phi <null, null> # REDUCTION_PHI
4228 VECT_DEF = vector_stmt # vectorized form of STMT
4229 ...
4230
4231 into:
4232
4233 loop:
4234 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4235 VECT_DEF = vector_stmt # vectorized form of STMT
4236 ...
4237
4238 (in case of SLP, do it for all the phis). */
4239
4240 /* Get the loop-entry arguments. */
4241 if (slp_node)
4242 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4243 NULL, slp_node, reduc_index);
4244 else
4245 {
4246 /* Get the scalar def before the loop that defines the initial value
4247 of the reduction variable. */
4248 gimple *def_stmt = SSA_NAME_DEF_STMT (reduction_op);
4249 initial_def = PHI_ARG_DEF_FROM_EDGE (def_stmt,
4250 loop_preheader_edge (loop));
4251 vec_initial_defs.create (1);
4252 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4253 &adjustment_def);
4254 vec_initial_defs.quick_push (vec_initial_def);
4255 }
4256
4257 /* Set phi nodes arguments. */
4258 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4259 {
4260 tree vec_init_def, def;
4261 gimple_seq stmts;
4262 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4263 true, NULL_TREE);
4264 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4265 def = vect_defs[i];
4266 for (j = 0; j < ncopies; j++)
4267 {
4268 /* Set the loop-entry arg of the reduction-phi. */
4269
4270 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4271 == INTEGER_INDUC_COND_REDUCTION)
4272 {
4273 /* Initialise the reduction phi to zero. This prevents non-zero
4274 initial values from interfering with the reduction op. */
4275 gcc_assert (ncopies == 1);
4276 gcc_assert (i == 0);
4277
4278 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4279 tree zero_vec = build_zero_cst (vec_init_def_type);
4280
4281 add_phi_arg (as_a <gphi *> (phi), zero_vec,
4282 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4283 }
4284 else
4285 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4286 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4287
4288 /* Set the loop-latch arg for the reduction-phi. */
4289 if (j > 0)
4290 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4291
4292 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4293 UNKNOWN_LOCATION);
4294
4295 if (dump_enabled_p ())
4296 {
4297 dump_printf_loc (MSG_NOTE, vect_location,
4298 "transform reduction: created def-use cycle: ");
4299 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4300 dump_printf (MSG_NOTE, "\n");
4301 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4302 dump_printf (MSG_NOTE, "\n");
4303 }
4304
4305 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4306 }
4307 }
4308
4309 /* 2. Create epilog code.
4310 The reduction epilog code operates across the elements of the vector
4311 of partial results computed by the vectorized loop.
4312 The reduction epilog code consists of:
4313
4314 step 1: compute the scalar result in a vector (v_out2)
4315 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4316 step 3: adjust the scalar result (s_out3) if needed.
4317
4318 Step 1 can be accomplished using one of the following three schemes:
4319 (scheme 1) using reduc_code, if available.
4320 (scheme 2) using whole-vector shifts, if available.
4321 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4322 combined.
4323
4324 The overall epilog code looks like this:
4325
4326 s_out0 = phi <s_loop> # original EXIT_PHI
4327 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4328 v_out2 = reduce <v_out1> # step 1
4329 s_out3 = extract_field <v_out2, 0> # step 2
4330 s_out4 = adjust_result <s_out3> # step 3
4331
4332 (step 3 is optional, and steps 1 and 2 may be combined).
4333 Lastly, the uses of s_out0 are replaced by s_out4. */
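  /* As a concrete illustration (not tied to any particular target), for an
     addition reduction over a 4-element vector v_out1 = {a0, a1, a2, a3},
     scheme 2 generates roughly:
         t1 = vec_shift <v_out1, 2 elements>    -> {a2, a3, 0, 0}
         t2 = t1 + v_out1                       -> {a0+a2, a1+a3, ., .}
         t3 = vec_shift <t2, 1 element>         -> {a1+a3, ., ., 0}
         t4 = t3 + t2                           -> {a0+a1+a2+a3, ., ., .}
         s_out3 = extract_field <t4, 0>
     while scheme 1 emits a single v_out2 = reduc_expr <v_out1> followed by
     the scalar extract.  */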
4334
4335
4336 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4337 v_out1 = phi <VECT_DEF>
4338 Store them in NEW_PHIS. */
4339
4340 exit_bb = single_exit (loop)->dest;
4341 prev_phi_info = NULL;
4342 new_phis.create (vect_defs.length ());
4343 FOR_EACH_VEC_ELT (vect_defs, i, def)
4344 {
4345 for (j = 0; j < ncopies; j++)
4346 {
4347 tree new_def = copy_ssa_name (def);
4348 phi = create_phi_node (new_def, exit_bb);
4349 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4350 if (j == 0)
4351 new_phis.quick_push (phi);
4352 else
4353 {
4354 def = vect_get_vec_def_for_stmt_copy (dt, def);
4355 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4356 }
4357
4358 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4359 prev_phi_info = vinfo_for_stmt (phi);
4360 }
4361 }
4362
4363 /* The epilogue is created for the outer-loop, i.e., for the loop being
4364 vectorized. Create exit phis for the outer loop. */
4365 if (double_reduc)
4366 {
4367 loop = outer_loop;
4368 exit_bb = single_exit (loop)->dest;
4369 inner_phis.create (vect_defs.length ());
4370 FOR_EACH_VEC_ELT (new_phis, i, phi)
4371 {
4372 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4373 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4374 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4375 PHI_RESULT (phi));
4376 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4377 loop_vinfo));
4378 inner_phis.quick_push (phi);
4379 new_phis[i] = outer_phi;
4380 prev_phi_info = vinfo_for_stmt (outer_phi);
4381 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4382 {
4383 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4384 new_result = copy_ssa_name (PHI_RESULT (phi));
4385 outer_phi = create_phi_node (new_result, exit_bb);
4386 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4387 PHI_RESULT (phi));
4388 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4389 loop_vinfo));
4390 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4391 prev_phi_info = vinfo_for_stmt (outer_phi);
4392 }
4393 }
4394 }
4395
4396 exit_gsi = gsi_after_labels (exit_bb);
4397
4398 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4399 (i.e. when reduc_code is not available) and in the final adjustment
4400 code (if needed). Also get the original scalar reduction variable as
4401 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4402 represents a reduction pattern), the tree-code and scalar-def are
4403 taken from the original stmt that the pattern-stmt (STMT) replaces.
4404 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4405 are taken from STMT. */
4406
4407 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4408 if (!orig_stmt)
4409 {
4410 /* Regular reduction */
4411 orig_stmt = stmt;
4412 }
4413 else
4414 {
4415 /* Reduction pattern */
4416 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4417 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4418 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4419 }
4420
4421 code = gimple_assign_rhs_code (orig_stmt);
4422 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4423 partial results are added and not subtracted. */
4424 if (code == MINUS_EXPR)
4425 code = PLUS_EXPR;
4426
4427 scalar_dest = gimple_assign_lhs (orig_stmt);
4428 scalar_type = TREE_TYPE (scalar_dest);
4429 scalar_results.create (group_size);
4430 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4431 bitsize = TYPE_SIZE (scalar_type);
4432
4433 /* In case this is a reduction in an inner-loop while vectorizing an outer
4434 loop - we don't need to extract a single scalar result at the end of the
4435 inner-loop (unless it is double reduction, i.e., the use of reduction is
4436 outside the outer-loop). The final vector of partial results will be used
4437 in the vectorized outer-loop, or reduced to a scalar result at the end of
4438 the outer-loop. */
4439 if (nested_in_vect_loop && !double_reduc)
4440 goto vect_finalize_reduction;
4441
4442 /* SLP reduction without reduction chain, e.g.,
4443 # a1 = phi <a2, a0>
4444 # b1 = phi <b2, b0>
4445 a2 = operation (a1)
4446 b2 = operation (b1) */
4447 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4448
4449 /* In case of reduction chain, e.g.,
4450 # a1 = phi <a3, a0>
4451 a2 = operation (a1)
4452 a3 = operation (a2),
4453
4454 we may end up with more than one vector result. Here we reduce them to
4455 one vector. */
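  /* For instance (illustration only), with two partial result vectors va
     and vb the loop below emits a single vector statement v = va OP vb and
     the rest of the epilogue then operates on v.  */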
4456 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4457 {
4458 tree first_vect = PHI_RESULT (new_phis[0]);
4459 tree tmp;
4460 gassign *new_vec_stmt = NULL;
4461
4462 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4463 for (k = 1; k < new_phis.length (); k++)
4464 {
4465 gimple *next_phi = new_phis[k];
4466 tree second_vect = PHI_RESULT (next_phi);
4467
4468 tmp = build2 (code, vectype, first_vect, second_vect);
4469 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4470 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4471 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4472 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4473 }
4474
4475 new_phi_result = first_vect;
4476 if (new_vec_stmt)
4477 {
4478 new_phis.truncate (0);
4479 new_phis.safe_push (new_vec_stmt);
4480 }
4481 }
4482 else
4483 new_phi_result = PHI_RESULT (new_phis[0]);
4484
4485 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4486 {
4487 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4488 various data values where the condition matched and another vector
4489 (INDUCTION_INDEX) containing all the indexes of those matches. We
4490 need to extract the last matching index (which will be the index with
4491 highest value) and use this to index into the data vector.
4492 For the case where there were no matches, the data vector will contain
4493 all default values and the index vector will be all zeros. */
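      /* As an illustration with hypothetical values:
           NEW_PHI_RESULT  = {v0, v1, v2, v3}
           INDUCTION_INDEX = {0, 3, 0, 2}
         The largest recorded index is 3, so MAX_INDEX_VEC is {3, 3, 3, 3},
         the comparison selects {0, v1, 0, 0} from the data vector, and the
         final scalar result is v1.  */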
4494
4495 /* Get various versions of the type of the vector of indexes. */
4496 tree index_vec_type = TREE_TYPE (induction_index);
4497 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4498 tree index_scalar_type = TREE_TYPE (index_vec_type);
4499 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4500 (index_vec_type);
4501
4502 /* Get an unsigned integer version of the type of the data vector. */
4503 int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
4504 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4505 tree vectype_unsigned = build_vector_type
4506 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4507
4508 /* First we need to create a vector (ZERO_VEC) of zeros and another
4509 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4510 can create using a MAX reduction and then expanding.
4511 In the case where the loop never made any matches, the max index will
4512 be zero. */
4513
4514 /* Vector of {0, 0, 0,...}. */
4515 tree zero_vec = make_ssa_name (vectype);
4516 tree zero_vec_rhs = build_zero_cst (vectype);
4517 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4518 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4519
4520 /* Find maximum value from the vector of found indexes. */
4521 tree max_index = make_ssa_name (index_scalar_type);
4522 gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4523 induction_index);
4524 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4525
4526 /* Vector of {max_index, max_index, max_index,...}. */
4527 tree max_index_vec = make_ssa_name (index_vec_type);
4528 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4529 max_index);
4530 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4531 max_index_vec_rhs);
4532 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4533
4534 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4535 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4536 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4537 otherwise. Only one value should match, resulting in a vector
4538 (VEC_COND) with one data value and the rest zeros.
4539 In the case where the loop never made any matches, every index will
4540 match, resulting in a vector with all data values (which will all be
4541 the default value). */
4542
4543 /* Compare the max index vector to the vector of found indexes to find
4544 the position of the max value. */
4545 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4546 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4547 induction_index,
4548 max_index_vec);
4549 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4550
4551 /* Use the compare to choose either values from the data vector or
4552 zero. */
4553 tree vec_cond = make_ssa_name (vectype);
4554 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4555 vec_compare, new_phi_result,
4556 zero_vec);
4557 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4558
4559 /* Finally we need to extract the data value from the vector (VEC_COND)
4560 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4561 reduction, but because this doesn't exist, we can use a MAX reduction
4562 instead. The data value might be signed or a float so we need to cast
4563 it first.
4564 In the case where the loop never made any matches, the data values are
4565 all identical, and so will reduce down correctly. */
4566
4567 /* Make the matched data values unsigned. */
4568 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4569 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4570 vec_cond);
4571 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4572 VIEW_CONVERT_EXPR,
4573 vec_cond_cast_rhs);
4574 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4575
4576 /* Reduce down to a scalar value. */
4577 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4578 optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4579 optab_default);
4580 gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4581 != CODE_FOR_nothing);
4582 gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4583 REDUC_MAX_EXPR,
4584 vec_cond_cast);
4585 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4586
4587 /* Convert the reduced value back to the result type and set as the
4588 result. */
4589 tree data_reduc_cast = build1 (VIEW_CONVERT_EXPR, scalar_type,
4590 data_reduc);
4591 epilog_stmt = gimple_build_assign (new_scalar_dest, data_reduc_cast);
4592 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4593 gimple_assign_set_lhs (epilog_stmt, new_temp);
4594 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4595 scalar_results.safe_push (new_temp);
4596 }
4597
4598 /* 2.3 Create the reduction code, using one of the three schemes described
4599 above. In SLP we simply need to extract all the elements from the
4600 vector (without reducing them), so we use scalar shifts. */
4601 else if (reduc_code != ERROR_MARK && !slp_reduc)
4602 {
4603 tree tmp;
4604 tree vec_elem_type;
4605
4606 /*** Case 1: Create:
4607 v_out2 = reduc_expr <v_out1> */
4608
4609 if (dump_enabled_p ())
4610 dump_printf_loc (MSG_NOTE, vect_location,
4611 "Reduce using direct vector reduction.\n");
4612
4613 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4614 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4615 {
4616 tree tmp_dest =
4617 vect_create_destination_var (scalar_dest, vec_elem_type);
4618 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4619 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4620 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4621 gimple_assign_set_lhs (epilog_stmt, new_temp);
4622 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4623
4624 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4625 }
4626 else
4627 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4628
4629 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4630 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4631 gimple_assign_set_lhs (epilog_stmt, new_temp);
4632 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4633
4634 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4635 == INTEGER_INDUC_COND_REDUCTION)
4636 {
4637 /* Earlier we set the initial value to be zero. Check the result
4638 and if it is zero then replace with the original initial
4639 value. */
4640 tree zero = build_zero_cst (scalar_type);
4641 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4642
4643 tmp = make_ssa_name (new_scalar_dest);
4644 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4645 initial_def, new_temp);
4646 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4647 new_temp = tmp;
4648 }
4649
4650 scalar_results.safe_push (new_temp);
4651 }
4652 else
4653 {
4654 bool reduce_with_shift = have_whole_vector_shift (mode);
4655 int element_bitsize = tree_to_uhwi (bitsize);
4656 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4657 tree vec_temp;
4658
4659 /* Regardless of whether we have a whole vector shift, if we're
4660 emulating the operation via tree-vect-generic, we don't want
4661 to use it. Only the first round of the reduction is likely
4662 to still be profitable via emulation. */
4663 /* ??? It might be better to emit a reduction tree code here, so that
4664 tree-vect-generic can expand the first round via bit tricks. */
4665 if (!VECTOR_MODE_P (mode))
4666 reduce_with_shift = false;
4667 else
4668 {
4669 optab optab = optab_for_tree_code (code, vectype, optab_default);
4670 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4671 reduce_with_shift = false;
4672 }
4673
4674 if (reduce_with_shift && !slp_reduc)
4675 {
4676 int nelements = vec_size_in_bits / element_bitsize;
4677 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4678
4679 int elt_offset;
4680
4681 tree zero_vec = build_zero_cst (vectype);
4682 /*** Case 2: Create:
4683 for (offset = nelements/2; offset >= 1; offset/=2)
4684 {
4685 Create: va' = vec_shift <va, offset>
4686 Create: va = vop <va, va'>
4687 } */
4688
4689 tree rhs;
4690
4691 if (dump_enabled_p ())
4692 dump_printf_loc (MSG_NOTE, vect_location,
4693 "Reduce using vector shifts\n");
4694
4695 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4696 new_temp = new_phi_result;
4697 for (elt_offset = nelements / 2;
4698 elt_offset >= 1;
4699 elt_offset /= 2)
4700 {
4701 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4702 tree mask = vect_gen_perm_mask_any (vectype, sel);
4703 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4704 new_temp, zero_vec, mask);
4705 new_name = make_ssa_name (vec_dest, epilog_stmt);
4706 gimple_assign_set_lhs (epilog_stmt, new_name);
4707 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4708
4709 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4710 new_temp);
4711 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4712 gimple_assign_set_lhs (epilog_stmt, new_temp);
4713 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4714 }
4715
4716 /* 2.4 Extract the final scalar result. Create:
4717 s_out3 = extract_field <v_out2, bitpos> */
4718
4719 if (dump_enabled_p ())
4720 dump_printf_loc (MSG_NOTE, vect_location,
4721 "extract scalar result\n");
4722
4723 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4724 bitsize, bitsize_zero_node);
4725 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4726 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4727 gimple_assign_set_lhs (epilog_stmt, new_temp);
4728 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4729 scalar_results.safe_push (new_temp);
4730 }
4731 else
4732 {
4733 /*** Case 3: Create:
4734 s = extract_field <v_out2, 0>
4735 for (offset = element_size;
4736 offset < vector_size;
4737 offset += element_size;)
4738 {
4739 Create: s' = extract_field <v_out2, offset>
4740 Create: s = op <s, s'> // For non SLP cases
4741 } */
4742
4743 if (dump_enabled_p ())
4744 dump_printf_loc (MSG_NOTE, vect_location,
4745 "Reduce using scalar code.\n");
4746
4747 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4748 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4749 {
4750 int bit_offset;
4751 if (gimple_code (new_phi) == GIMPLE_PHI)
4752 vec_temp = PHI_RESULT (new_phi);
4753 else
4754 vec_temp = gimple_assign_lhs (new_phi);
4755 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4756 bitsize_zero_node);
4757 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4758 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4759 gimple_assign_set_lhs (epilog_stmt, new_temp);
4760 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4761
4762 /* In SLP we don't need to apply reduction operation, so we just
4763 collect s' values in SCALAR_RESULTS. */
4764 if (slp_reduc)
4765 scalar_results.safe_push (new_temp);
4766
4767 for (bit_offset = element_bitsize;
4768 bit_offset < vec_size_in_bits;
4769 bit_offset += element_bitsize)
4770 {
4771 tree bitpos = bitsize_int (bit_offset);
4772 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4773 bitsize, bitpos);
4774
4775 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4776 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4777 gimple_assign_set_lhs (epilog_stmt, new_name);
4778 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4779
4780 if (slp_reduc)
4781 {
4782 /* In SLP we don't need to apply reduction operation, so
4783 we just collect s' values in SCALAR_RESULTS. */
4784 new_temp = new_name;
4785 scalar_results.safe_push (new_name);
4786 }
4787 else
4788 {
4789 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4790 new_name, new_temp);
4791 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4792 gimple_assign_set_lhs (epilog_stmt, new_temp);
4793 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4794 }
4795 }
4796 }
4797
4798 /* The only case where we need to reduce scalar results in SLP is
4799 unrolling. If the size of SCALAR_RESULTS is greater than
4800 GROUP_SIZE, we reduce them by combining elements modulo
4801 GROUP_SIZE. */
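          /* For instance (illustration only), with GROUP_SIZE == 2 and
             SCALAR_RESULTS == {s0, s1, s2, s3} the loop below combines s2
             into s0 and s3 into s1, leaving one result per group member.  */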
4802 if (slp_reduc)
4803 {
4804 tree res, first_res, new_res;
4805 gimple *new_stmt;
4806
4807 /* Reduce multiple scalar results in case of SLP unrolling. */
4808 for (j = group_size; scalar_results.iterate (j, &res);
4809 j++)
4810 {
4811 first_res = scalar_results[j % group_size];
4812 new_stmt = gimple_build_assign (new_scalar_dest, code,
4813 first_res, res);
4814 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4815 gimple_assign_set_lhs (new_stmt, new_res);
4816 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4817 scalar_results[j % group_size] = new_res;
4818 }
4819 }
4820 else
4821 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4822 scalar_results.safe_push (new_temp);
4823 }
4824 }
4825
4826 vect_finalize_reduction:
4827
4828 if (double_reduc)
4829 loop = loop->inner;
4830
4831 /* 2.5 Adjust the final result by the initial value of the reduction
4832 variable. (When such adjustment is not needed, then
4833 'adjustment_def' is zero). For example, if code is PLUS we create:
4834 new_temp = loop_exit_def + adjustment_def */
4835
4836 if (adjustment_def)
4837 {
4838 gcc_assert (!slp_reduc);
4839 if (nested_in_vect_loop)
4840 {
4841 new_phi = new_phis[0];
4842 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4843 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4844 new_dest = vect_create_destination_var (scalar_dest, vectype);
4845 }
4846 else
4847 {
4848 new_temp = scalar_results[0];
4849 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4850 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4851 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4852 }
4853
4854 epilog_stmt = gimple_build_assign (new_dest, expr);
4855 new_temp = make_ssa_name (new_dest, epilog_stmt);
4856 gimple_assign_set_lhs (epilog_stmt, new_temp);
4857 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4858 if (nested_in_vect_loop)
4859 {
4860 set_vinfo_for_stmt (epilog_stmt,
4861 new_stmt_vec_info (epilog_stmt, loop_vinfo));
4862 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4863 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4864
4865 if (!double_reduc)
4866 scalar_results.quick_push (new_temp);
4867 else
4868 scalar_results[0] = new_temp;
4869 }
4870 else
4871 scalar_results[0] = new_temp;
4872
4873 new_phis[0] = epilog_stmt;
4874 }
4875
4876 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4877 phis with new adjusted scalar results, i.e., replace use <s_out0>
4878 with use <s_out4>.
4879
4880 Transform:
4881 loop_exit:
4882 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4883 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4884 v_out2 = reduce <v_out1>
4885 s_out3 = extract_field <v_out2, 0>
4886 s_out4 = adjust_result <s_out3>
4887 use <s_out0>
4888 use <s_out0>
4889
4890 into:
4891
4892 loop_exit:
4893 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4894 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4895 v_out2 = reduce <v_out1>
4896 s_out3 = extract_field <v_out2, 0>
4897 s_out4 = adjust_result <s_out3>
4898 use <s_out4>
4899 use <s_out4> */
4900
4901
4902 /* In an SLP reduction chain we reduce the vector results into one vector
4903 if necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS
4904 of the last stmt in the reduction chain, since we are looking for the
4905 loop exit phi node. */
4906 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4907 {
4908 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
4909 /* Handle reduction patterns. */
4910 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
4911 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
4912
4913 scalar_dest = gimple_assign_lhs (dest_stmt);
4914 group_size = 1;
4915 }
4916
4917 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4918 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4919 need to match SCALAR_RESULTS with corresponding statements. The first
4920 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4921 the first vector stmt, etc.
4922 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
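  /* E.g. (illustration only), GROUP_SIZE == 4 with two new vector stmts
     gives RATIO == 2: scalar results 0 and 1 correspond to the first
     vector stmt, scalar results 2 and 3 to the second.  */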
4923 if (group_size > new_phis.length ())
4924 {
4925 ratio = group_size / new_phis.length ();
4926 gcc_assert (!(group_size % new_phis.length ()));
4927 }
4928 else
4929 ratio = 1;
4930
4931 for (k = 0; k < group_size; k++)
4932 {
4933 if (k % ratio == 0)
4934 {
4935 epilog_stmt = new_phis[k / ratio];
4936 reduction_phi = reduction_phis[k / ratio];
4937 if (double_reduc)
4938 inner_phi = inner_phis[k / ratio];
4939 }
4940
4941 if (slp_reduc)
4942 {
4943 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4944
4945 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4946 /* SLP statements can't participate in patterns. */
4947 gcc_assert (!orig_stmt);
4948 scalar_dest = gimple_assign_lhs (current_stmt);
4949 }
4950
4951 phis.create (3);
4952 /* Find the loop-closed-use at the loop exit of the original scalar
4953 result. (The reduction result is expected to have two immediate uses -
4954 one at the latch block, and one at the loop exit). */
4955 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4956 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4957 && !is_gimple_debug (USE_STMT (use_p)))
4958 phis.safe_push (USE_STMT (use_p));
4959
4960 /* While we expect to have found an exit_phi because of loop-closed-ssa
4961 form we can end up without one if the scalar cycle is dead. */
4962
4963 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4964 {
4965 if (outer_loop)
4966 {
4967 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4968 gphi *vect_phi;
4969
4970 /* FORNOW. Currently not supporting the case that an inner-loop
4971 reduction is not used in the outer-loop (but only outside the
4972 outer-loop), unless it is double reduction. */
4973 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4974 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4975 || double_reduc);
4976
4977 if (double_reduc)
4978 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4979 else
4980 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4981 if (!double_reduc
4982 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4983 != vect_double_reduction_def)
4984 continue;
4985
4986 /* Handle double reduction:
4987
4988 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4989 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4990 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4991 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4992
4993 At that point the regular reduction (stmt2 and stmt3) is
4994 already vectorized, as well as the exit phi node, stmt4.
4995 Here we vectorize the phi node of double reduction, stmt1, and
4996 update all relevant statements. */
4997
4998 /* Go through all the uses of s2 to find double reduction phi
4999 node, i.e., stmt1 above. */
5000 orig_name = PHI_RESULT (exit_phi);
5001 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5002 {
5003 stmt_vec_info use_stmt_vinfo;
5004 stmt_vec_info new_phi_vinfo;
5005 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
5006 basic_block bb = gimple_bb (use_stmt);
5007 gimple *use;
5008
5009 /* Check that USE_STMT is really double reduction phi
5010 node. */
5011 if (gimple_code (use_stmt) != GIMPLE_PHI
5012 || gimple_phi_num_args (use_stmt) != 2
5013 || bb->loop_father != outer_loop)
5014 continue;
5015 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5016 if (!use_stmt_vinfo
5017 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5018 != vect_double_reduction_def)
5019 continue;
5020
5021 /* Create vector phi node for double reduction:
5022 vs1 = phi <vs0, vs2>
5023 vs1 was created previously in this function by a call to
5024 vect_get_vec_def_for_operand and is stored in
5025 vec_initial_def;
5026 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5027 vs0 is created here. */
5028
5029 /* Create vector phi node. */
5030 vect_phi = create_phi_node (vec_initial_def, bb);
5031 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5032 loop_vec_info_for_loop (outer_loop));
5033 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5034
5035 /* Create vs0 - initial def of the double reduction phi. */
5036 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5037 loop_preheader_edge (outer_loop));
5038 init_def = get_initial_def_for_reduction (stmt,
5039 preheader_arg, NULL);
5040 vect_phi_init = vect_init_vector (use_stmt, init_def,
5041 vectype, NULL);
5042
5043 /* Update phi node arguments with vs0 and vs2. */
5044 add_phi_arg (vect_phi, vect_phi_init,
5045 loop_preheader_edge (outer_loop),
5046 UNKNOWN_LOCATION);
5047 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5048 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5049 if (dump_enabled_p ())
5050 {
5051 dump_printf_loc (MSG_NOTE, vect_location,
5052 "created double reduction phi node: ");
5053 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5054 dump_printf (MSG_NOTE, "\n");
5055 }
5056
5057 vect_phi_res = PHI_RESULT (vect_phi);
5058
5059 /* Replace the use, i.e., set the correct vs1 in the regular
5060 reduction phi node. FORNOW, NCOPIES is always 1, so the
5061 loop is redundant. */
5062 use = reduction_phi;
5063 for (j = 0; j < ncopies; j++)
5064 {
5065 edge pr_edge = loop_preheader_edge (loop);
5066 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5067 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5068 }
5069 }
5070 }
5071 }
5072
5073 phis.release ();
5074 if (nested_in_vect_loop)
5075 {
5076 if (double_reduc)
5077 loop = outer_loop;
5078 else
5079 continue;
5080 }
5081
5082 phis.create (3);
5083 /* Find the loop-closed-use at the loop exit of the original scalar
5084 result. (The reduction result is expected to have two immediate uses,
5085 one at the latch block, and one at the loop exit). For double
5086 reductions we are looking for exit phis of the outer loop. */
5087 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5088 {
5089 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5090 {
5091 if (!is_gimple_debug (USE_STMT (use_p)))
5092 phis.safe_push (USE_STMT (use_p));
5093 }
5094 else
5095 {
5096 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5097 {
5098 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5099
5100 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5101 {
5102 if (!flow_bb_inside_loop_p (loop,
5103 gimple_bb (USE_STMT (phi_use_p)))
5104 && !is_gimple_debug (USE_STMT (phi_use_p)))
5105 phis.safe_push (USE_STMT (phi_use_p));
5106 }
5107 }
5108 }
5109 }
5110
5111 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5112 {
5113 /* Replace the uses: */
5114 orig_name = PHI_RESULT (exit_phi);
5115 scalar_result = scalar_results[k];
5116 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5117 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5118 SET_USE (use_p, scalar_result);
5119 }
5120
5121 phis.release ();
5122 }
5123 }
5124
5125
5126 /* Function is_nonwrapping_integer_induction.
5127
5128 Check if STMT (which is part of loop LOOP) defines an induction that
5129 increments and is known not to overflow. */
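/* For example (illustration only), for a 16-bit unsigned induction with
   base 0 and step 3 in a loop that executes at most 1000 times, the
   largest value reached is 3 * 1000 + 0 = 3000, which needs fewer than 16
   bits, so the induction is known not to wrap.  */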
5130
5131 static bool
5132 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5133 {
5134 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5135 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5136 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5137 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5138 widest_int ni, max_loop_value, lhs_max;
5139 bool overflow = false;
5140
5141 /* Make sure the loop is integer based. */
5142 if (TREE_CODE (base) != INTEGER_CST
5143 || TREE_CODE (step) != INTEGER_CST)
5144 return false;
5145
5146 /* Check that the induction increments. */
5147 if (tree_int_cst_sgn (step) == -1)
5148 return false;
5149
5150 /* Check that the maximum value the induction reaches in the loop will not wrap. */
5151
5152 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5153 return true;
5154
5155 if (! max_stmt_executions (loop, &ni))
5156 return false;
5157
5158 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5159 &overflow);
5160 if (overflow)
5161 return false;
5162
5163 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5164 TYPE_SIGN (lhs_type), &overflow);
5165 if (overflow)
5166 return false;
5167
5168 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5169 <= TYPE_PRECISION (lhs_type));
5170 }
5171
5172 /* Function vectorizable_reduction.
5173
5174 Check if STMT performs a reduction operation that can be vectorized.
5175 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5176 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5177 Return FALSE if not a vectorizable STMT, TRUE otherwise.
5178
5179 This function also handles reduction idioms (patterns) that have been
5180 recognized in advance during vect_pattern_recog. In this case, STMT may be
5181 of this form:
5182 X = pattern_expr (arg0, arg1, ..., X)
5183 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5184 sequence that had been detected and replaced by the pattern-stmt (STMT).
5185
5186 This function also handles reduction of condition expressions, for example:
5187 for (int i = 0; i < N; i++)
5188 if (a[i] < value)
5189 last = a[i];
5190 This is handled by vectorising the loop and creating an additional vector
5191 containing the loop indexes for which "a[i] < value" was true. In the
5192 function epilogue this is reduced to a single max value and then used to
5193 index into the vector of results.
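   As a rough illustration only (not the exact code generated), with a
   4-lane vector the loop would track, per lane, the index of the last
   iteration for which "a[i] < value" held (0 if it never held) alongside
   the per-lane "last" values; the epilogue reduces the index vector with
   a max and uses the result to select the matching lane of the value
   vector.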
5194
5195 In some cases of reduction patterns, the type of the reduction variable X is
5196 different than the type of the other arguments of STMT.
5197 In such cases, the vectype that is used when transforming STMT into a vector
5198 stmt is different than the vectype that is used to determine the
5199 vectorization factor, because it consists of a different number of elements
5200 than the actual number of elements that are being operated upon in parallel.
5201
5202 For example, consider an accumulation of shorts into an int accumulator.
5203 On some targets it's possible to vectorize this pattern operating on 8
5204 shorts at a time (hence, the vectype for purposes of determining the
5205 vectorization factor should be V8HI); on the other hand, the vectype that
5206 is used to create the vector form is actually V4SI (the type of the result).
5207
5208 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5209 indicates what is the actual level of parallelism (V8HI in the example), so
5210 that the right vectorization factor would be derived. This vectype
5211 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5212 be used to create the vectorized stmt. The right vectype for the vectorized
5213 stmt is obtained from the type of the result X:
5214 get_vectype_for_scalar_type (TREE_TYPE (X))
5215
5216 This means that, contrary to "regular" reductions (or "regular" stmts in
5217 general), the following equation:
5218 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5219 does *NOT* necessarily hold for reduction patterns. */
5220
5221 bool
5222 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5223 gimple **vec_stmt, slp_tree slp_node)
5224 {
5225 tree vec_dest;
5226 tree scalar_dest;
5227 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
5228 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5229 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5230 tree vectype_in = NULL_TREE;
5231 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5232 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5233 enum tree_code code, orig_code, epilog_reduc_code;
5234 machine_mode vec_mode;
5235 int op_type;
5236 optab optab, reduc_optab;
5237 tree new_temp = NULL_TREE;
5238 gimple *def_stmt;
5239 enum vect_def_type dt;
5240 gphi *new_phi = NULL;
5241 tree scalar_type;
5242 bool is_simple_use;
5243 gimple *orig_stmt;
5244 stmt_vec_info orig_stmt_info;
5245 tree expr = NULL_TREE;
5246 int i;
5247 int ncopies;
5248 int epilog_copies;
5249 stmt_vec_info prev_stmt_info, prev_phi_info;
5250 bool single_defuse_cycle = false;
5251 tree reduc_def = NULL_TREE;
5252 gimple *new_stmt = NULL;
5253 int j;
5254 tree ops[3];
5255 bool nested_cycle = false, found_nested_cycle_def = false;
5256 gimple *reduc_def_stmt = NULL;
5257 bool double_reduc = false, dummy;
5258 basic_block def_bb;
5259 struct loop * def_stmt_loop, *outer_loop = NULL;
5260 tree def_arg;
5261 gimple *def_arg_stmt;
5262 auto_vec<tree> vec_oprnds0;
5263 auto_vec<tree> vec_oprnds1;
5264 auto_vec<tree> vect_defs;
5265 auto_vec<gimple *> phis;
5266 int vec_num;
5267 tree def0, def1, tem, op0, op1 = NULL_TREE;
5268 bool first_p = true;
5269 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5270 gimple *cond_expr_induction_def_stmt = NULL;
5271
5272 /* In case of reduction chain we switch to the first stmt in the chain, but
5273 we don't update STMT_INFO, since only the last stmt is marked as reduction
5274 and has reduction properties. */
5275 if (GROUP_FIRST_ELEMENT (stmt_info)
5276 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5277 {
5278 stmt = GROUP_FIRST_ELEMENT (stmt_info);
5279 first_p = false;
5280 }
5281
5282 if (nested_in_vect_loop_p (loop, stmt))
5283 {
5284 outer_loop = loop;
5285 loop = loop->inner;
5286 nested_cycle = true;
5287 }
5288
5289 /* 1. Is vectorizable reduction? */
5290 /* Not supportable if the reduction variable is used in the loop, unless
5291 it's a reduction chain. */
5292 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5293 && !GROUP_FIRST_ELEMENT (stmt_info))
5294 return false;
5295
5296 /* Reductions that are not used even in an enclosing outer-loop
5297 are expected to be "live" (used out of the loop). */
5298 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5299 && !STMT_VINFO_LIVE_P (stmt_info))
5300 return false;
5301
5302 /* Make sure it was already recognized as a reduction computation. */
5303 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5304 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5305 return false;
5306
5307 /* 2. Has this been recognized as a reduction pattern?
5308
5309 Check if STMT represents a pattern that has been recognized
5310 in earlier analysis stages. For stmts that represent a pattern,
5311 the STMT_VINFO_RELATED_STMT field records the last stmt in
5312 the original sequence that constitutes the pattern. */
5313
5314 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5315 if (orig_stmt)
5316 {
5317 orig_stmt_info = vinfo_for_stmt (orig_stmt);
5318 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5319 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5320 }
5321
5322 /* 3. Check the operands of the operation. The first operands are defined
5323 inside the loop body. The last operand is the reduction variable,
5324 which is defined by the loop-header-phi. */
5325
5326 gcc_assert (is_gimple_assign (stmt));
5327
5328 /* Flatten RHS. */
5329 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5330 {
5331 case GIMPLE_SINGLE_RHS:
5332 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
5333 if (op_type == ternary_op)
5334 {
5335 tree rhs = gimple_assign_rhs1 (stmt);
5336 ops[0] = TREE_OPERAND (rhs, 0);
5337 ops[1] = TREE_OPERAND (rhs, 1);
5338 ops[2] = TREE_OPERAND (rhs, 2);
5339 code = TREE_CODE (rhs);
5340 }
5341 else
5342 return false;
5343 break;
5344
5345 case GIMPLE_BINARY_RHS:
5346 code = gimple_assign_rhs_code (stmt);
5347 op_type = TREE_CODE_LENGTH (code);
5348 gcc_assert (op_type == binary_op);
5349 ops[0] = gimple_assign_rhs1 (stmt);
5350 ops[1] = gimple_assign_rhs2 (stmt);
5351 break;
5352
5353 case GIMPLE_TERNARY_RHS:
5354 code = gimple_assign_rhs_code (stmt);
5355 op_type = TREE_CODE_LENGTH (code);
5356 gcc_assert (op_type == ternary_op);
5357 ops[0] = gimple_assign_rhs1 (stmt);
5358 ops[1] = gimple_assign_rhs2 (stmt);
5359 ops[2] = gimple_assign_rhs3 (stmt);
5360 break;
5361
5362 case GIMPLE_UNARY_RHS:
5363 return false;
5364
5365 default:
5366 gcc_unreachable ();
5367 }
5368 /* By default the reduction variable is the last operand of the statement. */
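/* An illustrative example: for  s_1 = x + s_0  the reduction operand s_0
   is ops[1] (the last operand), whereas for  s_1 = s_0 - x  it has to come
   first, hence reduc_index 0 for MINUS_EXPR below.  */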
5369 int reduc_index = op_type - 1;
5370 if (code == MINUS_EXPR)
5371 reduc_index = 0;
5372
5373 if (code == COND_EXPR && slp_node)
5374 return false;
5375
5376 scalar_dest = gimple_assign_lhs (stmt);
5377 scalar_type = TREE_TYPE (scalar_dest);
5378 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5379 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5380 return false;
5381
5382 /* Do not try to vectorize bit-precision reductions. */
5383 if ((TYPE_PRECISION (scalar_type)
5384 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5385 return false;
5386
5387 /* All uses but the last are expected to be defined in the loop.
5388 The last use is the reduction variable. In case of nested cycle this
5389 assumption is not true: we use reduc_index to record the index of the
5390 reduction variable. */
5391 for (i = 0; i < op_type; i++)
5392 {
5393 if (i == reduc_index)
5394 continue;
5395
5396 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5397 if (i == 0 && code == COND_EXPR)
5398 continue;
5399
5400 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5401 &def_stmt, &dt, &tem);
5402 if (!vectype_in)
5403 vectype_in = tem;
5404 gcc_assert (is_simple_use);
5405
5406 if (dt != vect_internal_def
5407 && dt != vect_external_def
5408 && dt != vect_constant_def
5409 && dt != vect_induction_def
5410 && !(dt == vect_nested_cycle && nested_cycle))
5411 return false;
5412
5413 if (dt == vect_nested_cycle)
5414 {
5415 found_nested_cycle_def = true;
5416 reduc_def_stmt = def_stmt;
5417 reduc_index = i;
5418 }
5419
5420 if (i == 1 && code == COND_EXPR && dt == vect_induction_def)
5421 cond_expr_induction_def_stmt = def_stmt;
5422 }
5423
5424 is_simple_use = vect_is_simple_use (ops[reduc_index], loop_vinfo,
5425 &def_stmt, &dt, &tem);
5426 if (!vectype_in)
5427 vectype_in = tem;
5428 gcc_assert (is_simple_use);
5429 if (!found_nested_cycle_def)
5430 reduc_def_stmt = def_stmt;
5431
5432 if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5433 return false;
5434
5435 if (!(dt == vect_reduction_def
5436 || dt == vect_nested_cycle
5437 || ((dt == vect_internal_def || dt == vect_external_def
5438 || dt == vect_constant_def || dt == vect_induction_def)
5439 && nested_cycle && found_nested_cycle_def)))
5440 {
5441 /* For pattern recognized stmts, orig_stmt might be a reduction,
5442 but some helper statements for the pattern might not be, or
5443 might be COND_EXPRs with reduction uses in the condition. */
5444 gcc_assert (orig_stmt);
5445 return false;
5446 }
5447
5448 enum vect_reduction_type v_reduc_type;
5449 gimple *tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5450 !nested_cycle, &dummy, false,
5451 &v_reduc_type);
5452
5453 /* If we have a condition reduction, see if we can simplify it further. */
5454 if (v_reduc_type == COND_REDUCTION
5455 && cond_expr_induction_def_stmt != NULL
5456 && is_nonwrapping_integer_induction (cond_expr_induction_def_stmt, loop))
5457 {
5458 if (dump_enabled_p ())
5459 dump_printf_loc (MSG_NOTE, vect_location,
5460 "condition expression based on integer induction.\n");
5461 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = INTEGER_INDUC_COND_REDUCTION;
5462 }
5463 else
5464 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5465
5466 if (orig_stmt)
5467 gcc_assert (tmp == orig_stmt
5468 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5469 else
5470 /* We changed STMT to be the first stmt in reduction chain, hence we
5471 check that in this case the first element in the chain is STMT. */
5472 gcc_assert (stmt == tmp
5473 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5474
5475 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5476 return false;
5477
5478 if (slp_node || PURE_SLP_STMT (stmt_info))
5479 ncopies = 1;
5480 else
5481 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5482 / TYPE_VECTOR_SUBPARTS (vectype_in));
5483
5484 gcc_assert (ncopies >= 1);
5485
5486 vec_mode = TYPE_MODE (vectype_in);
5487
5488 if (code == COND_EXPR)
5489 {
5490 /* Only call during the analysis stage, otherwise we'll lose
5491 STMT_VINFO_TYPE. */
5492 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
5493 ops[reduc_index], 0, NULL))
5494 {
5495 if (dump_enabled_p ())
5496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5497 "unsupported condition in reduction\n");
5498 return false;
5499 }
5500 }
5501 else
5502 {
5503 /* 4. Supportable by target? */
5504
5505 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5506 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5507 {
5508 /* Shifts and rotates are only supported by vectorizable_shifts,
5509 not vectorizable_reduction. */
5510 if (dump_enabled_p ())
5511 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5512 "unsupported shift or rotation.\n");
5513 return false;
5514 }
5515
5516 /* 4.1. check support for the operation in the loop */
5517 optab = optab_for_tree_code (code, vectype_in, optab_default);
5518 if (!optab)
5519 {
5520 if (dump_enabled_p ())
5521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5522 "no optab.\n");
5523
5524 return false;
5525 }
5526
5527 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5528 {
5529 if (dump_enabled_p ())
5530 dump_printf (MSG_NOTE, "op not supported by target.\n");
5531
5532 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5533 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5534 < vect_min_worthwhile_factor (code))
5535 return false;
5536
5537 if (dump_enabled_p ())
5538 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5539 }
5540
5541 /* Worthwhile without SIMD support? */
5542 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5543 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5544 < vect_min_worthwhile_factor (code))
5545 {
5546 if (dump_enabled_p ())
5547 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5548 "not worthwhile without SIMD support.\n");
5549
5550 return false;
5551 }
5552 }
5553
5554 /* 4.2. Check support for the epilog operation.
5555
5556 If STMT represents a reduction pattern, then the type of the
5557 reduction variable may be different than the type of the rest
5558 of the arguments. For example, consider the case of accumulation
5559 of shorts into an int accumulator; The original code:
5560 S1: int_a = (int) short_a;
5561 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5562
5563 was replaced with:
5564 STMT: int_acc = widen_sum <short_a, int_acc>
5565
5566 This means that:
5567 1. The tree-code that is used to create the vector operation in the
5568 epilog code (that reduces the partial results) is not the
5569 tree-code of STMT, but is rather the tree-code of the original
5570 stmt from the pattern that STMT is replacing. I.e, in the example
5571 above we want to use 'widen_sum' in the loop, but 'plus' in the
5572 epilog.
5573 2. The type (mode) we use to check available target support
5574 for the vector operation to be created in the *epilog*, is
5575 determined by the type of the reduction variable (in the example
5576 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5577 However the type (mode) we use to check available target support
5578 for the vector operation to be created *inside the loop*, is
5579 determined by the type of the other arguments to STMT (in the
5580 example we'd check this: optab_handler (widen_sum_optab,
5581 vect_short_mode)).
5582
5583 This is contrary to "regular" reductions, in which the types of all
5584 the arguments are the same as the type of the reduction variable.
5585 For "regular" reductions we can therefore use the same vector type
5586 (and also the same tree-code) when generating the epilog code and
5587 when generating the code inside the loop. */
5588
5589 if (orig_stmt)
5590 {
5591 /* This is a reduction pattern: get the vectype from the type of the
5592 reduction variable, and get the tree-code from orig_stmt. */
5593 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5594 == TREE_CODE_REDUCTION);
5595 orig_code = gimple_assign_rhs_code (orig_stmt);
5596 gcc_assert (vectype_out);
5597 vec_mode = TYPE_MODE (vectype_out);
5598 }
5599 else
5600 {
5601 /* Regular reduction: the same vectype and tree-code used for the
5602 vector code inside the loop can also be used for the epilog code. */
5603 orig_code = code;
5604
5605 if (code == MINUS_EXPR)
5606 orig_code = PLUS_EXPR;
5607
5608 /* For simple condition reductions, replace with the actual expression
5609 we want to base our reduction around. */
5610 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5611 == INTEGER_INDUC_COND_REDUCTION)
5612 orig_code = MAX_EXPR;
5613 }
5614
5615 if (nested_cycle)
5616 {
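/* Check for a double reduction: the value flowing into this nested-cycle
   phi through the inner-loop preheader is itself defined by an outer-loop
   phi that the analysis classified as vect_double_reduction_def.  */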
5617 def_bb = gimple_bb (reduc_def_stmt);
5618 def_stmt_loop = def_bb->loop_father;
5619 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5620 loop_preheader_edge (def_stmt_loop));
5621 if (TREE_CODE (def_arg) == SSA_NAME
5622 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5623 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5624 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5625 && vinfo_for_stmt (def_arg_stmt)
5626 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5627 == vect_double_reduction_def)
5628 double_reduc = true;
5629 }
5630
5631 epilog_reduc_code = ERROR_MARK;
5632
5633 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == TREE_CODE_REDUCTION
5634 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5635 == INTEGER_INDUC_COND_REDUCTION)
5636 {
5637 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5638 {
5639 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5640 optab_default);
5641 if (!reduc_optab)
5642 {
5643 if (dump_enabled_p ())
5644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5645 "no optab for reduction.\n");
5646
5647 epilog_reduc_code = ERROR_MARK;
5648 }
5649 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5650 {
5651 optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5652 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5653 {
5654 if (dump_enabled_p ())
5655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5656 "reduc op not supported by target.\n");
5657
5658 epilog_reduc_code = ERROR_MARK;
5659 }
5660 }
5661
5662 /* When epilog_reduc_code is ERROR_MARK then a reduction will be
5663 generated in the epilog using multiple expressions. This does not
5664 work for condition reductions. */
5665 if (epilog_reduc_code == ERROR_MARK
5666 && STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5667 == INTEGER_INDUC_COND_REDUCTION)
5668 {
5669 if (dump_enabled_p ())
5670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5671 "no reduc code for scalar code.\n");
5672 return false;
5673 }
5674 }
5675 else
5676 {
5677 if (!nested_cycle || double_reduc)
5678 {
5679 if (dump_enabled_p ())
5680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5681 "no reduc code for scalar code.\n");
5682
5683 return false;
5684 }
5685 }
5686 }
5687 else
5688 {
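/* Condition reductions reduce a vector of loop indexes in the epilogue
   (see the comment at the head of this function), so we need an unsigned
   index type with as many elements as the result vector, and the target
   has to support REDUC_MAX_EXPR on that vector type.  */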
5689 int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
5690 cr_index_scalar_type = make_unsigned_type (scalar_precision);
5691 cr_index_vector_type = build_vector_type
5692 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
5693
5694 epilog_reduc_code = REDUC_MAX_EXPR;
5695 optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
5696 optab_default);
5697 if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
5698 == CODE_FOR_nothing)
5699 {
5700 if (dump_enabled_p ())
5701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5702 "reduc max op not supported by target.\n");
5703 return false;
5704 }
5705 }
5706
5707 if ((double_reduc
5708 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5709 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5710 == INTEGER_INDUC_COND_REDUCTION)
5711 && ncopies > 1)
5712 {
5713 if (dump_enabled_p ())
5714 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5715 "multiple types in double reduction or condition "
5716 "reduction.\n");
5717 return false;
5718 }
5719
5720 /* In case of widening multiplication by a constant, we update the type
5721 of the constant to be the type of the other operand. We check that the
5722 constant fits the type in the pattern recognition pass. */
5723 if (code == DOT_PROD_EXPR
5724 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5725 {
5726 if (TREE_CODE (ops[0]) == INTEGER_CST)
5727 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5728 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5729 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5730 else
5731 {
5732 if (dump_enabled_p ())
5733 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5734 "invalid types in dot-prod\n");
5735
5736 return false;
5737 }
5738 }
5739
5740 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
5741 {
5742 widest_int ni;
5743
5744 if (! max_loop_iterations (loop, &ni))
5745 {
5746 if (dump_enabled_p ())
5747 dump_printf_loc (MSG_NOTE, vect_location,
5748 "loop count not known, cannot create cond "
5749 "reduction.\n");
5750 return false;
5751 }
5752 /* Convert backedges to iterations. */
5753 ni += 1;
5754
5755 /* The additional index will be the same type as the condition. Check
5756 that the loop iteration count fits into this type less one (the zero
5757 slot is reserved for the case where there are no matches). */
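/* For instance (hypothetical numbers): with an unsigned 8-bit index type
   MAX_INDEX is 255, so a loop of at most 254 iterations is fine, while
   one that may run 255 or more iterations is rejected here.  */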
5758 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
5759 if (wi::geu_p (ni, wi::to_widest (max_index)))
5760 {
5761 if (dump_enabled_p ())
5762 dump_printf_loc (MSG_NOTE, vect_location,
5763 "loop size is greater than data size.\n");
5764 return false;
5765 }
5766 }
5767
5768 if (!vec_stmt) /* transformation not required. */
5769 {
5770 if (first_p
5771 && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5772 reduc_index))
5773 return false;
5774 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5775 return true;
5776 }
5777
5778 /** Transform. **/
5779
5780 if (dump_enabled_p ())
5781 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5782
5783 /* FORNOW: Multiple types are not supported for condition. */
5784 if (code == COND_EXPR)
5785 gcc_assert (ncopies == 1);
5786
5787 /* Create the destination vector */
5788 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5789
5790 /* In case the vectorization factor (VF) is bigger than the number
5791 of elements that we can fit in a vectype (nunits), we have to generate
5792 more than one vector stmt - i.e., we need to "unroll" the
5793 vector stmt by a factor VF/nunits. For more details see documentation
5794 in vectorizable_operation. */
5795
5796 /* If the reduction is used in an outer loop we need to generate
5797 VF intermediate results, like so (e.g. for ncopies=2):
5798 r0 = phi (init, r0)
5799 r1 = phi (init, r1)
5800 r0 = x0 + r0;
5801 r1 = x1 + r1;
5802 (i.e. we generate VF results in 2 registers).
5803 In this case we have a separate def-use cycle for each copy, and therefore
5804 for each copy we get the vector def for the reduction variable from the
5805 respective phi node created for this copy.
5806
5807 Otherwise (the reduction is unused in the loop nest), we can combine
5808 together intermediate results, like so (e.g. for ncopies=2):
5809 r = phi (init, r)
5810 r = x0 + r;
5811 r = x1 + r;
5812 (i.e. we generate VF/2 results in a single register).
5813 In this case for each copy we get the vector def for the reduction variable
5814 from the vectorized reduction operation generated in the previous iteration.
5815 */
5816
5817 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5818 {
5819 single_defuse_cycle = true;
5820 epilog_copies = 1;
5821 }
5822 else
5823 epilog_copies = ncopies;
5824
5825 prev_stmt_info = NULL;
5826 prev_phi_info = NULL;
5827 if (slp_node)
5828 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5829 else
5830 {
5831 vec_num = 1;
5832 vec_oprnds0.create (1);
5833 if (op_type == ternary_op)
5834 vec_oprnds1.create (1);
5835 }
5836
5837 phis.create (vec_num);
5838 vect_defs.create (vec_num);
5839 if (!slp_node)
5840 vect_defs.quick_push (NULL_TREE);
5841
5842 for (j = 0; j < ncopies; j++)
5843 {
5844 if (j == 0 || !single_defuse_cycle)
5845 {
5846 for (i = 0; i < vec_num; i++)
5847 {
5848 /* Create the reduction-phi that defines the reduction
5849 operand. */
5850 new_phi = create_phi_node (vec_dest, loop->header);
5851 set_vinfo_for_stmt (new_phi,
5852 new_stmt_vec_info (new_phi, loop_vinfo));
5853 if (j == 0 || slp_node)
5854 phis.quick_push (new_phi);
5855 }
5856 }
5857
5858 if (code == COND_EXPR)
5859 {
5860 gcc_assert (!slp_node);
5861 vectorizable_condition (stmt, gsi, vec_stmt,
5862 PHI_RESULT (phis[0]),
5863 reduc_index, NULL);
5864 /* Multiple types are not supported for condition. */
5865 break;
5866 }
5867
5868 /* Handle uses. */
5869 if (j == 0)
5870 {
5871 op0 = ops[!reduc_index];
5872 if (op_type == ternary_op)
5873 {
5874 if (reduc_index == 0)
5875 op1 = ops[2];
5876 else
5877 op1 = ops[1];
5878 }
5879
5880 if (slp_node)
5881 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5882 slp_node, -1);
5883 else
5884 {
5885 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5886 stmt);
5887 vec_oprnds0.quick_push (loop_vec_def0);
5888 if (op_type == ternary_op)
5889 {
5890 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt);
5891 vec_oprnds1.quick_push (loop_vec_def1);
5892 }
5893 }
5894 }
5895 else
5896 {
5897 if (!slp_node)
5898 {
5899 enum vect_def_type dt;
5900 gimple *dummy_stmt;
5901
5902 vect_is_simple_use (ops[!reduc_index], loop_vinfo,
5903 &dummy_stmt, &dt);
5904 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5905 loop_vec_def0);
5906 vec_oprnds0[0] = loop_vec_def0;
5907 if (op_type == ternary_op)
5908 {
5909 vect_is_simple_use (op1, loop_vinfo, &dummy_stmt, &dt);
5910 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5911 loop_vec_def1);
5912 vec_oprnds1[0] = loop_vec_def1;
5913 }
5914 }
5915
5916 if (single_defuse_cycle)
5917 reduc_def = gimple_assign_lhs (new_stmt);
5918
5919 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5920 }
5921
5922 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5923 {
5924 if (slp_node)
5925 reduc_def = PHI_RESULT (phis[i]);
5926 else
5927 {
5928 if (!single_defuse_cycle || j == 0)
5929 reduc_def = PHI_RESULT (new_phi);
5930 }
5931
5932 def1 = ((op_type == ternary_op)
5933 ? vec_oprnds1[i] : NULL);
5934 if (op_type == binary_op)
5935 {
5936 if (reduc_index == 0)
5937 expr = build2 (code, vectype_out, reduc_def, def0);
5938 else
5939 expr = build2 (code, vectype_out, def0, reduc_def);
5940 }
5941 else
5942 {
5943 if (reduc_index == 0)
5944 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5945 else
5946 {
5947 if (reduc_index == 1)
5948 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5949 else
5950 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5951 }
5952 }
5953
5954 new_stmt = gimple_build_assign (vec_dest, expr);
5955 new_temp = make_ssa_name (vec_dest, new_stmt);
5956 gimple_assign_set_lhs (new_stmt, new_temp);
5957 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5958
5959 if (slp_node)
5960 {
5961 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5962 vect_defs.quick_push (new_temp);
5963 }
5964 else
5965 vect_defs[0] = new_temp;
5966 }
5967
5968 if (slp_node)
5969 continue;
5970
5971 if (j == 0)
5972 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5973 else
5974 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5975
5976 prev_stmt_info = vinfo_for_stmt (new_stmt);
5977 prev_phi_info = vinfo_for_stmt (new_phi);
5978 }
5979
5980 tree indx_before_incr, indx_after_incr, cond_name = NULL;
5981
5982 /* Finalize the reduction-phi (set its arguments) and create the
5983 epilog reduction code. */
5984 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5985 {
5986 new_temp = gimple_assign_lhs (*vec_stmt);
5987 vect_defs[0] = new_temp;
5988
5989 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5990 which is updated with the current index of the loop for every match of
5991 the original loop's cond_expr (VEC_STMT). This results in a vector
5992 containing the last time the condition passed for that vector lane.
5993 The first match will be a 1 to allow 0 to be used for non-matching
5994 indexes. If there are no matches at all then the vector will be all
5995 zeroes. */
5996 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
5997 {
5998 int nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
5999 int k;
6000
6001 gcc_assert (gimple_assign_rhs_code (*vec_stmt) == VEC_COND_EXPR);
6002
6003 /* First we create a simple vector induction variable which starts
6004 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6005 vector size (STEP). */
6006
6007 /* Create a {1,2,3,...} vector. */
6008 tree *vtemp = XALLOCAVEC (tree, nunits_out);
6009 for (k = 0; k < nunits_out; ++k)
6010 vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1);
6011 tree series_vect = build_vector (cr_index_vector_type, vtemp);
6012
6013 /* Create a vector of the step value. */
6014 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6015 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6016
6017 /* Create an induction variable. */
6018 gimple_stmt_iterator incr_gsi;
6019 bool insert_after;
6020 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6021 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
6022 insert_after, &indx_before_incr, &indx_after_incr);
6023
6024 /* Next create a new vector phi node (NEW_PHI_TREE) which starts out
6025 filled with zeros (VEC_ZERO). */
6026
6027 /* Create a vector of 0s. */
6028 tree zero = build_zero_cst (cr_index_scalar_type);
6029 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6030
6031 /* Create a vector phi node. */
6032 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6033 new_phi = create_phi_node (new_phi_tree, loop->header);
6034 set_vinfo_for_stmt (new_phi,
6035 new_stmt_vec_info (new_phi, loop_vinfo));
6036 add_phi_arg (new_phi, vec_zero, loop_preheader_edge (loop),
6037 UNKNOWN_LOCATION);
6038
6039 /* Now take the condition from the loop's original cond_expr
6040 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
6041 every match uses values from the induction variable
6042 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6043 (NEW_PHI_TREE).
6044 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6045 the new cond_expr (INDEX_COND_EXPR). */
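/* A rough 4-lane sketch of what is generated below (names illustrative):
     ivtmp_before = PHI <{1,2,3,4}(preheader), ivtmp_after(latch)>
     new_phi      = PHI <{0,0,0,0}(preheader), index_cond(latch)>
     index_cond   = ccompare ? ivtmp_before : new_phi
     ivtmp_after  = ivtmp_before + {4,4,4,4}
   so each lane of index_cond ends up holding the 1-based index of the
   last scalar iteration for which the condition held in that lane, or 0
   if it never held.  */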
6046
6047 /* Turn the condition from vec_stmt into an ssa name. */
6048 gimple_stmt_iterator vec_stmt_gsi = gsi_for_stmt (*vec_stmt);
6049 tree ccompare = gimple_assign_rhs1 (*vec_stmt);
6050 tree ccompare_name = make_ssa_name (TREE_TYPE (ccompare));
6051 gimple *ccompare_stmt = gimple_build_assign (ccompare_name,
6052 ccompare);
6053 gsi_insert_before (&vec_stmt_gsi, ccompare_stmt, GSI_SAME_STMT);
6054 gimple_assign_set_rhs1 (*vec_stmt, ccompare_name);
6055 update_stmt (*vec_stmt);
6056
6057 /* Create a conditional where the condition is taken from vec_stmt
6058 (CCOMPARE_NAME), the "then" value is the induction index
6059 (INDEX_BEFORE_INCR) and the "else" value is the phi (NEW_PHI_TREE). */
6060 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
6061 ccompare_name, indx_before_incr,
6062 new_phi_tree);
6063 cond_name = make_ssa_name (cr_index_vector_type);
6064 gimple *index_condition = gimple_build_assign (cond_name,
6065 index_cond_expr);
6066 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
6067 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
6068 loop_vinfo);
6069 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
6070 set_vinfo_for_stmt (index_condition, index_vec_info);
6071
6072 /* Update the phi with the vec cond. */
6073 add_phi_arg (new_phi, cond_name, loop_latch_edge (loop),
6074 UNKNOWN_LOCATION);
6075 }
6076 }
6077
6078 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
6079 epilog_reduc_code, phis, reduc_index,
6080 double_reduc, slp_node, cond_name);
6081
6082 return true;
6083 }
6084
6085 /* Function vect_min_worthwhile_factor.
6086
6087 For a loop where we could vectorize the operation indicated by CODE,
6088 return the minimum vectorization factor that makes it worthwhile
6089 to use generic vectors. */
6090 int
6091 vect_min_worthwhile_factor (enum tree_code code)
6092 {
6093 switch (code)
6094 {
6095 case PLUS_EXPR:
6096 case MINUS_EXPR:
6097 case NEGATE_EXPR:
6098 return 4;
6099
6100 case BIT_AND_EXPR:
6101 case BIT_IOR_EXPR:
6102 case BIT_XOR_EXPR:
6103 case BIT_NOT_EXPR:
6104 return 2;
6105
6106 default:
6107 return INT_MAX;
6108 }
6109 }
6110
6111
6112 /* Function vectorizable_induction
6113
6114 Check if PHI performs an induction computation that can be vectorized.
6115 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6116 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6117 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6118
6119 bool
6120 vectorizable_induction (gimple *phi,
6121 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6122 gimple **vec_stmt)
6123 {
6124 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6125 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6126 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6127 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6128 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6129 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6130 tree vec_def;
6131
6132 gcc_assert (ncopies >= 1);
6133 /* FORNOW. These restrictions should be relaxed. */
6134 if (nested_in_vect_loop_p (loop, phi))
6135 {
6136 imm_use_iterator imm_iter;
6137 use_operand_p use_p;
6138 gimple *exit_phi;
6139 edge latch_e;
6140 tree loop_arg;
6141
6142 if (ncopies > 1)
6143 {
6144 if (dump_enabled_p ())
6145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6146 "multiple types in nested loop.\n");
6147 return false;
6148 }
6149
6150 exit_phi = NULL;
6151 latch_e = loop_latch_edge (loop->inner);
6152 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6153 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6154 {
6155 gimple *use_stmt = USE_STMT (use_p);
6156 if (is_gimple_debug (use_stmt))
6157 continue;
6158
6159 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6160 {
6161 exit_phi = use_stmt;
6162 break;
6163 }
6164 }
6165 if (exit_phi)
6166 {
6167 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
6168 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6169 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6170 {
6171 if (dump_enabled_p ())
6172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6173 "inner-loop induction only used outside "
6174 "of the outer vectorized loop.\n");
6175 return false;
6176 }
6177 }
6178 }
6179
6180 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6181 return false;
6182
6183 /* FORNOW: SLP not supported. */
6184 if (STMT_SLP_TYPE (stmt_info))
6185 return false;
6186
6187 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
6188
6189 if (gimple_code (phi) != GIMPLE_PHI)
6190 return false;
6191
6192 if (!vec_stmt) /* transformation not required. */
6193 {
6194 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6195 if (dump_enabled_p ())
6196 dump_printf_loc (MSG_NOTE, vect_location,
6197 "=== vectorizable_induction ===\n");
6198 vect_model_induction_cost (stmt_info, ncopies);
6199 return true;
6200 }
6201
6202 /** Transform. **/
6203
6204 if (dump_enabled_p ())
6205 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6206
6207 vec_def = get_initial_def_for_induction (phi);
6208 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
6209 return true;
6210 }
6211
6212 /* Function vectorizable_live_operation.
6213
6214 STMT computes a value that is used outside the loop. Check if
6215 it can be supported. */
6216
6217 bool
6218 vectorizable_live_operation (gimple *stmt,
6219 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6220 gimple **vec_stmt)
6221 {
6222 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6223 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6224 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6225 tree op;
6226 gimple *def_stmt;
6227 ssa_op_iter iter;
6228
6229 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6230
6231 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6232 return false;
6233
6234 if (!is_gimple_assign (stmt))
6235 {
6236 if (gimple_call_internal_p (stmt)
6237 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
6238 && gimple_call_lhs (stmt)
6239 && loop->simduid
6240 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
6241 && loop->simduid
6242 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
6243 {
6244 edge e = single_exit (loop);
6245 basic_block merge_bb = e->dest;
6246 imm_use_iterator imm_iter;
6247 use_operand_p use_p;
6248 tree lhs = gimple_call_lhs (stmt);
6249
6250 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
6251 {
6252 gimple *use_stmt = USE_STMT (use_p);
6253 if (gimple_code (use_stmt) == GIMPLE_PHI
6254 && gimple_bb (use_stmt) == merge_bb)
6255 {
6256 if (vec_stmt)
6257 {
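/* The simd lane value live after the loop corresponds, in effect, to
   the last lane of the vectorized body, i.e. the vectorization factor
   minus one.  */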
6258 tree vfm1
6259 = build_int_cst (unsigned_type_node,
6260 loop_vinfo->vectorization_factor - 1);
6261 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
6262 }
6263 return true;
6264 }
6265 }
6266 }
6267
6268 return false;
6269 }
6270
6271 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6272 return false;
6273
6274 /* FORNOW. CHECKME. */
6275 if (nested_in_vect_loop_p (loop, stmt))
6276 return false;
6277
6278 /* FORNOW: support only if all uses are invariant. This means
6279 that the scalar operations can remain in place, unvectorized.
6280 The original last scalar value that they compute will be used. */
6281 FOR_EACH_SSA_TREE_OPERAND (op, stmt, iter, SSA_OP_USE)
6282 {
6283 enum vect_def_type dt = vect_uninitialized_def;
6284
6285 if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &dt))
6286 {
6287 if (dump_enabled_p ())
6288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6289 "use not simple.\n");
6290 return false;
6291 }
6292
6293 if (dt != vect_external_def && dt != vect_constant_def)
6294 return false;
6295 }
6296
6297 /* No transformation is required for the cases we currently support. */
6298 return true;
6299 }
6300
6301 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
6302
6303 static void
6304 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
6305 {
6306 ssa_op_iter op_iter;
6307 imm_use_iterator imm_iter;
6308 def_operand_p def_p;
6309 gimple *ustmt;
6310
6311 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
6312 {
6313 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
6314 {
6315 basic_block bb;
6316
6317 if (!is_gimple_debug (ustmt))
6318 continue;
6319
6320 bb = gimple_bb (ustmt);
6321
6322 if (!flow_bb_inside_loop_p (loop, bb))
6323 {
6324 if (gimple_debug_bind_p (ustmt))
6325 {
6326 if (dump_enabled_p ())
6327 dump_printf_loc (MSG_NOTE, vect_location,
6328 "killing debug use\n");
6329
6330 gimple_debug_bind_reset_value (ustmt);
6331 update_stmt (ustmt);
6332 }
6333 else
6334 gcc_unreachable ();
6335 }
6336 }
6337 }
6338 }
6339
6340
6341 /* This function builds ni_name = number of iterations. Statements
6342 are emitted on the loop preheader edge. */
6343
6344 static tree
6345 vect_build_loop_niters (loop_vec_info loop_vinfo)
6346 {
6347 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6348 if (TREE_CODE (ni) == INTEGER_CST)
6349 return ni;
6350 else
6351 {
6352 tree ni_name, var;
6353 gimple_seq stmts = NULL;
6354 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6355
6356 var = create_tmp_var (TREE_TYPE (ni), "niters");
6357 ni_name = force_gimple_operand (ni, &stmts, false, var);
6358 if (stmts)
6359 gsi_insert_seq_on_edge_immediate (pe, stmts);
6360
6361 return ni_name;
6362 }
6363 }
6364
6365
6366 /* This function generates the following statements:
6367
6368 ni_name = number of iterations loop executes
6369 ratio = ni_name / vf
6370 ratio_mult_vf_name = ratio * vf
6371
6372 and places them on the loop preheader edge. */
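/* For example (hypothetical numbers): with ni_name = 103 and vf = 8 this
   yields ratio = 12 and ratio_mult_vf_name = 96, so the vectorized loop
   covers 96 scalar iterations and the scalar epilogue handles the
   remaining 7 (assuming no peeling for gaps).  */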
6373
6374 static void
6375 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6376 tree ni_name,
6377 tree *ratio_mult_vf_name_ptr,
6378 tree *ratio_name_ptr)
6379 {
6380 tree ni_minus_gap_name;
6381 tree var;
6382 tree ratio_name;
6383 tree ratio_mult_vf_name;
6384 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6385 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6386 tree log_vf;
6387
6388 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
6389
6390 /* If an epilogue loop is required because of data accesses with gaps, we
6391 subtract one iteration from the total number of iterations here for
6392 correct calculation of RATIO. */
6393 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6394 {
6395 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6396 ni_name,
6397 build_one_cst (TREE_TYPE (ni_name)));
6398 if (!is_gimple_val (ni_minus_gap_name))
6399 {
6400 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
6401 gimple *stmts = NULL;
6402 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
6403 true, var);
6404 gsi_insert_seq_on_edge_immediate (pe, stmts);
6405 }
6406 }
6407 else
6408 ni_minus_gap_name = ni_name;
6409
6410 /* Create: ratio = ni >> log2(vf) */
6411 /* ??? As we have ni == number of latch executions + 1, ni could
6412 have overflowed to zero. So avoid computing ratio based on ni
6413 but compute it using the fact that we know ratio will be at least
6414 one, thus via (ni - vf) >> log2(vf) + 1. */
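/* Illustration, assuming a 32-bit unsigned NI_NAME and vf = 4: if the
   latch executes 0xffffffff times, ni wraps to 0 and 0 >> 2 would give a
   ratio of 0, whereas (0 - 4) >> 2 + 1 = 0x3fffffff + 1 = 0x40000000,
   the correct number of vector iterations.  */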
6415 ratio_name
6416 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
6417 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
6418 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6419 ni_minus_gap_name,
6420 build_int_cst
6421 (TREE_TYPE (ni_name), vf)),
6422 log_vf),
6423 build_int_cst (TREE_TYPE (ni_name), 1));
6424 if (!is_gimple_val (ratio_name))
6425 {
6426 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
6427 gimple *stmts = NULL;
6428 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
6429 gsi_insert_seq_on_edge_immediate (pe, stmts);
6430 }
6431 *ratio_name_ptr = ratio_name;
6432
6433 /* Create: ratio_mult_vf = ratio << log2 (vf). */
6434
6435 if (ratio_mult_vf_name_ptr)
6436 {
6437 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6438 ratio_name, log_vf);
6439 if (!is_gimple_val (ratio_mult_vf_name))
6440 {
6441 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
6442 gimple *stmts = NULL;
6443 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
6444 true, var);
6445 gsi_insert_seq_on_edge_immediate (pe, stmts);
6446 }
6447 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6448 }
6449
6450 return;
6451 }
6452
6453
6454 /* Function vect_transform_loop.
6455
6456 The analysis phase has determined that the loop is vectorizable.
6457 Vectorize the loop - create vectorized stmts to replace the scalar
6458 stmts in the loop, and update the loop exit condition. */
6459
6460 void
6461 vect_transform_loop (loop_vec_info loop_vinfo)
6462 {
6463 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6464 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
6465 int nbbs = loop->num_nodes;
6466 int i;
6467 tree ratio = NULL;
6468 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6469 bool grouped_store;
6470 bool slp_scheduled = false;
6471 gimple *stmt, *pattern_stmt;
6472 gimple_seq pattern_def_seq = NULL;
6473 gimple_stmt_iterator pattern_def_si = gsi_none ();
6474 bool transform_pattern_stmt = false;
6475 bool check_profitability = false;
6476 int th;
6477 /* Record number of iterations before we started tampering with the profile. */
6478 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
6479
6480 if (dump_enabled_p ())
6481 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
6482
6483 /* If the profile is imprecise, we have a chance to fix it up. */
6484 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6485 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
6486
6487 /* Use the more conservative vectorization threshold. If the number
6488 of iterations is constant assume the cost check has been performed
6489 by our caller. If the threshold makes all loops profitable that
6490 run at least the vectorization factor number of times, checking
6491 is pointless, too. */
6492 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
6493 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
6494 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6495 {
6496 if (dump_enabled_p ())
6497 dump_printf_loc (MSG_NOTE, vect_location,
6498 "Profitability threshold is %d loop iterations.\n",
6499 th);
6500 check_profitability = true;
6501 }
6502
6503 /* Version the loop first, if required, so the profitability check
6504 comes first. */
6505
6506 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
6507 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
6508 {
6509 vect_loop_versioning (loop_vinfo, th, check_profitability);
6510 check_profitability = false;
6511 }
6512
6513 tree ni_name = vect_build_loop_niters (loop_vinfo);
6514 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
6515
6516 /* Peel the loop if there are data refs with unknown alignment.
6517 Only one data ref with unknown store is allowed. */
6518
6519 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
6520 {
6521 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
6522 th, check_profitability);
6523 check_profitability = false;
6524 /* The above adjusts LOOP_VINFO_NITERS, so force ni_name to
6525 be re-computed. */
6526 ni_name = NULL_TREE;
6527 }
6528
6529 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6530 compile time constant), or it is a constant that doesn't divide by the
6531 vectorization factor, then an epilog loop needs to be created.
6532 We therefore duplicate the loop: the original loop will be vectorized,
6533 and will compute the first (n/VF) iterations. The second copy of the loop
6534 will remain scalar and will compute the remaining (n%VF) iterations.
6535 (VF is the vectorization factor). */
6536
6537 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
6538 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6539 {
6540 tree ratio_mult_vf;
6541 if (!ni_name)
6542 ni_name = vect_build_loop_niters (loop_vinfo);
6543 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6544 &ratio);
6545 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6546 th, check_profitability);
6547 }
6548 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6549 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6550 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6551 else
6552 {
6553 if (!ni_name)
6554 ni_name = vect_build_loop_niters (loop_vinfo);
6555 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6556 }
6557
6558 /* 1) Make sure the loop header has exactly two entries
6559 2) Make sure we have a preheader basic block. */
6560
6561 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6562
6563 split_edge (loop_preheader_edge (loop));
6564
6565 /* FORNOW: the vectorizer supports only loops whose body consists
6566 of one basic block (header + empty latch). When the vectorizer
6567 supports more involved loop forms, the order in which the BBs are
6568 traversed needs to be reconsidered. */
6569
6570 for (i = 0; i < nbbs; i++)
6571 {
6572 basic_block bb = bbs[i];
6573 stmt_vec_info stmt_info;
6574
6575 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6576 gsi_next (&si))
6577 {
6578 gphi *phi = si.phi ();
6579 if (dump_enabled_p ())
6580 {
6581 dump_printf_loc (MSG_NOTE, vect_location,
6582 "------>vectorizing phi: ");
6583 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6584 dump_printf (MSG_NOTE, "\n");
6585 }
6586 stmt_info = vinfo_for_stmt (phi);
6587 if (!stmt_info)
6588 continue;
6589
6590 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6591 vect_loop_kill_debug_uses (loop, phi);
6592
6593 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6594 && !STMT_VINFO_LIVE_P (stmt_info))
6595 continue;
6596
6597 if (STMT_VINFO_VECTYPE (stmt_info)
6598 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6599 != (unsigned HOST_WIDE_INT) vectorization_factor)
6600 && dump_enabled_p ())
6601 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6602
6603 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6604 {
6605 if (dump_enabled_p ())
6606 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6607 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6608 }
6609 }
6610
6611 pattern_stmt = NULL;
6612 for (gimple_stmt_iterator si = gsi_start_bb (bb);
6613 !gsi_end_p (si) || transform_pattern_stmt;)
6614 {
6615 bool is_store;
6616
6617 if (transform_pattern_stmt)
6618 stmt = pattern_stmt;
6619 else
6620 {
6621 stmt = gsi_stmt (si);
6622 /* During vectorization remove existing clobber stmts. */
6623 if (gimple_clobber_p (stmt))
6624 {
6625 unlink_stmt_vdef (stmt);
6626 gsi_remove (&si, true);
6627 release_defs (stmt);
6628 continue;
6629 }
6630 }
6631
6632 if (dump_enabled_p ())
6633 {
6634 dump_printf_loc (MSG_NOTE, vect_location,
6635 "------>vectorizing statement: ");
6636 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6637 dump_printf (MSG_NOTE, "\n");
6638 }
6639
6640 stmt_info = vinfo_for_stmt (stmt);
6641
6642 /* vector stmts created in the outer-loop during vectorization of
6643 stmts in an inner-loop may not have a stmt_info, and do not
6644 need to be vectorized. */
6645 if (!stmt_info)
6646 {
6647 gsi_next (&si);
6648 continue;
6649 }
6650
6651 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6652 vect_loop_kill_debug_uses (loop, stmt);
6653
6654 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6655 && !STMT_VINFO_LIVE_P (stmt_info))
6656 {
6657 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6658 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6659 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6660 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6661 {
6662 stmt = pattern_stmt;
6663 stmt_info = vinfo_for_stmt (stmt);
6664 }
6665 else
6666 {
6667 gsi_next (&si);
6668 continue;
6669 }
6670 }
6671 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6672 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6673 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6674 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6675 transform_pattern_stmt = true;
6676
6677 /* If pattern statement has def stmts, vectorize them too. */
6678 if (is_pattern_stmt_p (stmt_info))
6679 {
6680 if (pattern_def_seq == NULL)
6681 {
6682 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6683 pattern_def_si = gsi_start (pattern_def_seq);
6684 }
6685 else if (!gsi_end_p (pattern_def_si))
6686 gsi_next (&pattern_def_si);
6687 if (pattern_def_seq != NULL)
6688 {
6689 gimple *pattern_def_stmt = NULL;
6690 stmt_vec_info pattern_def_stmt_info = NULL;
6691
6692 while (!gsi_end_p (pattern_def_si))
6693 {
6694 pattern_def_stmt = gsi_stmt (pattern_def_si);
6695 pattern_def_stmt_info
6696 = vinfo_for_stmt (pattern_def_stmt);
6697 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6698 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6699 break;
6700 gsi_next (&pattern_def_si);
6701 }
6702
6703 if (!gsi_end_p (pattern_def_si))
6704 {
6705 if (dump_enabled_p ())
6706 {
6707 dump_printf_loc (MSG_NOTE, vect_location,
6708 "==> vectorizing pattern def "
6709 "stmt: ");
6710 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6711 pattern_def_stmt, 0);
6712 dump_printf (MSG_NOTE, "\n");
6713 }
6714
6715 stmt = pattern_def_stmt;
6716 stmt_info = pattern_def_stmt_info;
6717 }
6718 else
6719 {
6720 pattern_def_si = gsi_none ();
6721 transform_pattern_stmt = false;
6722 }
6723 }
6724 else
6725 transform_pattern_stmt = false;
6726 }
6727
6728 if (STMT_VINFO_VECTYPE (stmt_info))
6729 {
6730 unsigned int nunits
6731 = (unsigned int)
6732 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6733 if (!STMT_SLP_TYPE (stmt_info)
6734 && nunits != (unsigned int) vectorization_factor
6735 && dump_enabled_p ())
6736 /* For SLP VF is set according to unrolling factor, and not
6737 to vector size, hence for SLP this print is not valid. */
6738 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6739 }
6740
6741 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6742 reached. */
6743 if (STMT_SLP_TYPE (stmt_info))
6744 {
6745 if (!slp_scheduled)
6746 {
6747 slp_scheduled = true;
6748
6749 if (dump_enabled_p ())
6750 dump_printf_loc (MSG_NOTE, vect_location,
6751 "=== scheduling SLP instances ===\n");
6752
6753 vect_schedule_slp (loop_vinfo);
6754 }
6755
6756 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6757 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6758 {
6759 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6760 {
6761 pattern_def_seq = NULL;
6762 gsi_next (&si);
6763 }
6764 continue;
6765 }
6766 }
6767
6768 /* -------- vectorize statement ------------ */
6769 if (dump_enabled_p ())
6770 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6771
6772 grouped_store = false;
6773 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6774 if (is_store)
6775 {
6776 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6777 {
6778 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6779 interleaving chain was completed - free all the stores in
6780 the chain. */
6781 gsi_next (&si);
6782 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6783 }
6784 else
6785 {
6786 /* Free the attached stmt_vec_info and remove the stmt. */
6787 gimple *store = gsi_stmt (si);
6788 free_stmt_vec_info (store);
6789 unlink_stmt_vdef (store);
6790 gsi_remove (&si, true);
6791 release_defs (store);
6792 }
6793
6794 /* Stores can only appear at the end of pattern statements. */
6795 gcc_assert (!transform_pattern_stmt);
6796 pattern_def_seq = NULL;
6797 }
6798 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6799 {
6800 pattern_def_seq = NULL;
6801 gsi_next (&si);
6802 }
6803 } /* stmts in BB */
6804 } /* BBs in loop */
6805
6806 slpeel_make_loop_iterate_ntimes (loop, ratio);
6807
6808 /* Reduce loop iterations by the vectorization factor. */
6809 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6810 expected_iterations / vectorization_factor);
6811 loop->nb_iterations_upper_bound
6812 = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6813 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6814 && loop->nb_iterations_upper_bound != 0)
6815 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6816 if (loop->any_estimate)
6817 {
6818 loop->nb_iterations_estimate
6819 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6820 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6821 && loop->nb_iterations_estimate != 0)
6822 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6823 }
6824
6825 if (dump_enabled_p ())
6826 {
6827 dump_printf_loc (MSG_NOTE, vect_location,
6828 "LOOP VECTORIZED\n");
6829 if (loop->inner)
6830 dump_printf_loc (MSG_NOTE, vect_location,
6831 "OUTER LOOP VECTORIZED\n");
6832 dump_printf (MSG_NOTE, "\n");
6833 }
6834 }