invoke.texi: Document -ftree-loop-distribution.
[gcc.git] / gcc / tree-vectorizer.c
1 /* Loop Vectorization
2 Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
10 version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 /* Loop Vectorization Pass.
22
23 This pass tries to vectorize loops. This first implementation focuses on
24 simple inner-most loops, with no conditional control flow, and a set of
25 simple operations which vector form can be expressed using existing
26 tree codes (PLUS, MULT etc).
27
28 For example, the vectorizer transforms the following simple loop:
29
30 short a[N]; short b[N]; short c[N]; int i;
31
32 for (i=0; i<N; i++){
33 a[i] = b[i] + c[i];
34 }
35
36 as if it was manually vectorized by rewriting the source code into:
37
38 typedef int __attribute__((mode(V8HI))) v8hi;
39 short a[N]; short b[N]; short c[N]; int i;
40 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
41 v8hi va, vb, vc;
42
43 for (i=0; i<N/8; i++){
44 vb = pb[i];
45 vc = pc[i];
46 va = vb + vc;
47 pa[i] = va;
48 }
49
50 The main entry to this pass is vectorize_loops(), in which
51 the vectorizer applies a set of analyses on a given set of loops,
52 followed by the actual vectorization transformation for the loops that
53 had successfully passed the analysis phase.
54
55 Throughout this pass we make a distinction between two types of
56 data: scalars (which are represented by SSA_NAMES), and memory references
57 ("data-refs"). These two types of data require different handling both
58 during analysis and transformation. The types of data-refs that the
59 vectorizer currently supports are ARRAY_REFS which base is an array DECL
60 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
61 accesses are required to have a simple (consecutive) access pattern.
62
63 Analysis phase:
64 ===============
65 The driver for the analysis phase is vect_analyze_loop_nest().
66 It applies a set of analyses, some of which rely on the scalar evolution
67 analyzer (scev) developed by Sebastian Pop.
68
69 During the analysis phase the vectorizer records some information
70 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
71 loop, as well as general information about the loop as a whole, which is
72 recorded in a "loop_vec_info" struct attached to each loop.
73
74 Transformation phase:
75 =====================
76 The loop transformation phase scans all the stmts in the loop, and
77 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
78     the loop that needs to be vectorized. It inserts the vector code sequence
79 just before the scalar stmt S, and records a pointer to the vector code
80 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
81 attached to S). This pointer will be used for the vectorization of following
82 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
83 otherwise, we rely on dead code elimination for removing it.
84
85 For example, say stmt S1 was vectorized into stmt VS1:
86
87 VS1: vb = px[i];
88 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
89 S2: a = b;
90
91 To vectorize stmt S2, the vectorizer first finds the stmt that defines
92 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
93 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
94 resulting sequence would be:
95
96 VS1: vb = px[i];
97 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
98 VS2: va = vb;
99 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
100
101 Operands that are not SSA_NAMEs, are data-refs that appear in
102 load/store operations (like 'x[i]' in S1), and are handled differently.
103
104 Target modeling:
105 =================
106 Currently the only target specific information that is used is the
107 size of the vector (in bytes) - "UNITS_PER_SIMD_WORD". Targets that can
108 support different sizes of vectors, for now will need to specify one value
109 for "UNITS_PER_SIMD_WORD". More flexibility will be added in the future.
110
111 Since we only vectorize operations which vector form can be
112 expressed using existing tree codes, to verify that an operation is
113 supported, the vectorizer checks the relevant optab at the relevant
114 machine_mode (e.g, optab_handler (add_optab, V8HImode)->insn_code). If
115 the value found is CODE_FOR_nothing, then there's no target support, and
116 we can't vectorize the stmt.
117
118 For additional information on this project see:
119 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
120 */
121
122 #include "config.h"
123 #include "system.h"
124 #include "coretypes.h"
125 #include "tm.h"
126 #include "ggc.h"
127 #include "tree.h"
128 #include "target.h"
129 #include "rtl.h"
130 #include "basic-block.h"
131 #include "diagnostic.h"
132 #include "tree-flow.h"
133 #include "tree-dump.h"
134 #include "timevar.h"
135 #include "cfgloop.h"
136 #include "cfglayout.h"
137 #include "expr.h"
138 #include "recog.h"
139 #include "optabs.h"
140 #include "params.h"
141 #include "toplev.h"
142 #include "tree-chrec.h"
143 #include "tree-data-ref.h"
144 #include "tree-scalar-evolution.h"
145 #include "input.h"
146 #include "tree-vectorizer.h"
147 #include "tree-pass.h"
148
149 /*************************************************************************
150 General Vectorization Utilities
151 *************************************************************************/
152
/* Dump stream for vectorizer diagnostics: set to dump_file when one
   exists, otherwise to stderr.  */
FILE *vect_dump;

/* Verbosity level for vectorizer diagnostics.  Initialized to the
   invalid value MAX_VERBOSITY_LEVEL to mark that it's uninitialized.  */
enum verbosity_levels vect_verbosity_level = MAX_VERBOSITY_LEVEL;

/* Source location of the loop currently being processed (used for
   diagnostics).  */
static LOC vect_loop_location;

/* Bitmap of virtual variables (memory symbols) to be renamed.  Filled
   during peeling, consumed when SSA form is updated later.  */
bitmap vect_memsyms_to_rename;
165 \f
166 /*************************************************************************
167 Simple Loop Peeling Utilities
168
169 Utilities to support loop peeling for vectorization purposes.
170 *************************************************************************/
171
172
173 /* Renames the use *OP_P. */
174
175 static void
176 rename_use_op (use_operand_p op_p)
177 {
178 tree new_name;
179
180 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
181 return;
182
183 new_name = get_current_def (USE_FROM_PTR (op_p));
184
185 /* Something defined outside of the loop. */
186 if (!new_name)
187 return;
188
189 /* An ordinary ssa name defined in the loop. */
190
191 SET_USE (op_p, new_name);
192 }
193
194
195 /* Renames the variables in basic block BB. */
196
197 static void
198 rename_variables_in_bb (basic_block bb)
199 {
200 tree phi;
201 block_stmt_iterator bsi;
202 tree stmt;
203 use_operand_p use_p;
204 ssa_op_iter iter;
205 edge e;
206 edge_iterator ei;
207 struct loop *loop = bb->loop_father;
208
209 for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
210 {
211 stmt = bsi_stmt (bsi);
212 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
213 rename_use_op (use_p);
214 }
215
216 FOR_EACH_EDGE (e, ei, bb->succs)
217 {
218 if (!flow_bb_inside_loop_p (loop, e->dest))
219 continue;
220 for (phi = phi_nodes (e->dest); phi; phi = PHI_CHAIN (phi))
221 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (phi, e));
222 }
223 }
224
225
226 /* Renames variables in new generated LOOP. */
227
228 void
229 rename_variables_in_loop (struct loop *loop)
230 {
231 unsigned i;
232 basic_block *bbs;
233
234 bbs = get_loop_body (loop);
235
236 for (i = 0; i < loop->num_nodes; i++)
237 rename_variables_in_bb (bbs[i]);
238
239 free (bbs);
240 }
241
242
243 /* Update the PHI nodes of NEW_LOOP.
244
245 NEW_LOOP is a duplicate of ORIG_LOOP.
246 AFTER indicates whether NEW_LOOP executes before or after ORIG_LOOP:
247 AFTER is true if NEW_LOOP executes after ORIG_LOOP, and false if it
248 executes before it. */
249
static void
slpeel_update_phis_for_duplicate_loop (struct loop *orig_loop,
				       struct loop *new_loop, bool after)
{
  tree new_ssa_name;
  tree phi_new, phi_orig;
  tree def;
  edge orig_loop_latch = loop_latch_edge (orig_loop);
  edge orig_entry_e = loop_preheader_edge (orig_loop);
  edge new_loop_exit_e = single_exit (new_loop);
  edge new_loop_entry_e = loop_preheader_edge (new_loop);
  /* The edge of ORIG_LOOP that carries the values reaching NEW_LOOP's
     entry: the latch if NEW_LOOP runs after ORIG_LOOP, the preheader
     otherwise.  */
  edge entry_arg_e = (after ? orig_loop_latch : orig_entry_e);

  /*
     step 1. For each loop-header-phi:
             Add the first phi argument for the phi in NEW_LOOP
            (the one associated with the entry of NEW_LOOP)

     step 2. For each loop-header-phi:
             Add the second phi argument for the phi in NEW_LOOP
            (the one associated with the latch of NEW_LOOP)

     step 3. Update the phis in the successor block of NEW_LOOP.

        case 1: NEW_LOOP was placed before ORIG_LOOP:
                The successor block of NEW_LOOP is the header of ORIG_LOOP.
                Updating the phis in the successor block can therefore be done
                along with the scanning of the loop header phis, because the
                header blocks of ORIG_LOOP and NEW_LOOP have exactly the same
                phi nodes, organized in the same order.

        case 2: NEW_LOOP was placed after ORIG_LOOP:
                The successor block of NEW_LOOP is the original exit block of
                ORIG_LOOP - the phis to be updated are the loop-closed-ssa phis.
                We postpone updating these phis to a later stage (when
                loop guards are added).
   */


  /* Scan the phis in the headers of the old and new loops
     (they are organized in exactly the same order, so they can be
     walked in lockstep).  */

  for (phi_new = phi_nodes (new_loop->header),
       phi_orig = phi_nodes (orig_loop->header);
       phi_new && phi_orig;
       phi_new = PHI_CHAIN (phi_new), phi_orig = PHI_CHAIN (phi_orig))
    {
      /* step 1.  */
      def = PHI_ARG_DEF_FROM_EDGE (phi_orig, entry_arg_e);
      add_phi_arg (phi_new, def, new_loop_entry_e);

      /* step 2.  Constants need no renaming, and steps 2/3 only make
	 sense for SSA names.  */
      def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_loop_latch);
      if (TREE_CODE (def) != SSA_NAME)
        continue;

      new_ssa_name = get_current_def (def);
      if (!new_ssa_name)
        {
          /* This only happens if there are no definitions
             inside the loop.  Use the phi_result in this case.  */
          new_ssa_name = PHI_RESULT (phi_new);
        }

      /* An ordinary ssa name defined in the loop.  */
      add_phi_arg (phi_new, new_ssa_name, loop_latch_edge (new_loop));

      /* step 3 (case 1).  When NEW_LOOP runs first, its exit edge is
	 ORIG_LOOP's entry edge, and ORIG_LOOP's header phis must now
	 receive the value computed by NEW_LOOP.  */
      if (!after)
        {
          gcc_assert (new_loop_exit_e == orig_entry_e);
          SET_PHI_ARG_DEF (phi_orig,
                           new_loop_exit_e->dest_idx,
                           new_ssa_name);
        }
    }
}
327
328
329 /* Update PHI nodes for a guard of the LOOP.
330
331 Input:
332 - LOOP, GUARD_EDGE: LOOP is a loop for which we added guard code that
333 controls whether LOOP is to be executed. GUARD_EDGE is the edge that
334 originates from the guard-bb, skips LOOP and reaches the (unique) exit
335 bb of LOOP. This loop-exit-bb is an empty bb with one successor.
336 We denote this bb NEW_MERGE_BB because before the guard code was added
337 it had a single predecessor (the LOOP header), and now it became a merge
338 point of two paths - the path that ends with the LOOP exit-edge, and
339 the path that ends with GUARD_EDGE.
340 - NEW_EXIT_BB: New basic block that is added by this function between LOOP
341 and NEW_MERGE_BB. It is used to place loop-closed-ssa-form exit-phis.
342
343 ===> The CFG before the guard-code was added:
344 LOOP_header_bb:
345 loop_body
346 if (exit_loop) goto update_bb
347 else goto LOOP_header_bb
348 update_bb:
349
350 ==> The CFG after the guard-code was added:
351 guard_bb:
352 if (LOOP_guard_condition) goto new_merge_bb
353 else goto LOOP_header_bb
354 LOOP_header_bb:
355 loop_body
356 if (exit_loop_condition) goto new_merge_bb
357 else goto LOOP_header_bb
358 new_merge_bb:
359 goto update_bb
360 update_bb:
361
362 ==> The CFG after this function:
363 guard_bb:
364 if (LOOP_guard_condition) goto new_merge_bb
365 else goto LOOP_header_bb
366 LOOP_header_bb:
367 loop_body
368 if (exit_loop_condition) goto new_exit_bb
369 else goto LOOP_header_bb
370 new_exit_bb:
371 new_merge_bb:
372 goto update_bb
373 update_bb:
374
375 This function:
376 1. creates and updates the relevant phi nodes to account for the new
377 incoming edge (GUARD_EDGE) into NEW_MERGE_BB. This involves:
378 1.1. Create phi nodes at NEW_MERGE_BB.
379 1.2. Update the phi nodes at the successor of NEW_MERGE_BB (denoted
380 UPDATE_BB). UPDATE_BB was the exit-bb of LOOP before NEW_MERGE_BB
381 2. preserves loop-closed-ssa-form by creating the required phi nodes
382 at the exit of LOOP (i.e, in NEW_EXIT_BB).
383
384 There are two flavors to this function:
385
386 slpeel_update_phi_nodes_for_guard1:
387 Here the guard controls whether we enter or skip LOOP, where LOOP is a
388 prolog_loop (loop1 below), and the new phis created in NEW_MERGE_BB are
389 for variables that have phis in the loop header.
390
391 slpeel_update_phi_nodes_for_guard2:
392 Here the guard controls whether we enter or skip LOOP, where LOOP is an
393 epilog_loop (loop2 below), and the new phis created in NEW_MERGE_BB are
394 for variables that have phis in the loop exit.
395
396 I.E., the overall structure is:
397
398 loop1_preheader_bb:
399 guard1 (goto loop1/merg1_bb)
400 loop1
401 loop1_exit_bb:
402 guard2 (goto merge1_bb/merge2_bb)
403 merge1_bb
404 loop2
405 loop2_exit_bb
406 merge2_bb
407 next_bb
408
409 slpeel_update_phi_nodes_for_guard1 takes care of creating phis in
410 loop1_exit_bb and merge1_bb. These are entry phis (phis for the vars
411 that have phis in loop1->header).
412
413 slpeel_update_phi_nodes_for_guard2 takes care of creating phis in
414 loop2_exit_bb and merge2_bb. These are exit phis (phis for the vars
415 that have phis in next_bb). It also adds some of these phis to
416 loop1_exit_bb.
417
418 slpeel_update_phi_nodes_for_guard1 is always called before
419 slpeel_update_phi_nodes_for_guard2. They are both needed in order
420 to create correct data-flow and loop-closed-ssa-form.
421
422 Generally slpeel_update_phi_nodes_for_guard1 creates phis for variables
423 that change between iterations of a loop (and therefore have a phi-node
424 at the loop entry), whereas slpeel_update_phi_nodes_for_guard2 creates
425 phis for variables that are used out of the loop (and therefore have
426 loop-closed exit phis). Some variables may be both updated between
427 iterations and used after the loop. This is why in loop1_exit_bb we
428 may need both entry_phis (created by slpeel_update_phi_nodes_for_guard1)
429 and exit phis (created by slpeel_update_phi_nodes_for_guard2).
430
431 - IS_NEW_LOOP: if IS_NEW_LOOP is true, then LOOP is a newly created copy of
432 an original loop. i.e., we have:
433
434 orig_loop
435 guard_bb (goto LOOP/new_merge)
436 new_loop <-- LOOP
437 new_exit
438 new_merge
439 next_bb
440
441 If IS_NEW_LOOP is false, then LOOP is an original loop, in which case we
442 have:
443
444 new_loop
445 guard_bb (goto LOOP/new_merge)
446 orig_loop <-- LOOP
447 new_exit
448 new_merge
449 next_bb
450
451    The SSA names defined in the original loop have a current
452    reaching definition that records the corresponding new
453    ssa-name used in the new duplicated loop copy.
454 */
455
456 /* Function slpeel_update_phi_nodes_for_guard1
457
458 Input:
459 - GUARD_EDGE, LOOP, IS_NEW_LOOP, NEW_EXIT_BB - as explained above.
460 - DEFS - a bitmap of ssa names to mark new names for which we recorded
461 information.
462
463 In the context of the overall structure, we have:
464
465 loop1_preheader_bb:
466 guard1 (goto loop1/merg1_bb)
467 LOOP-> loop1
468 loop1_exit_bb:
469 guard2 (goto merge1_bb/merge2_bb)
470 merge1_bb
471 loop2
472 loop2_exit_bb
473 merge2_bb
474 next_bb
475
476 For each name updated between loop iterations (i.e - for each name that has
477 an entry (loop-header) phi in LOOP) we create a new phi in:
478 1. merge1_bb (to account for the edge from guard1)
479 2. loop1_exit_bb (an exit-phi to keep LOOP in loop-closed form)
480 */
481
static void
slpeel_update_phi_nodes_for_guard1 (edge guard_edge, struct loop *loop,
                                    bool is_new_loop, basic_block *new_exit_bb,
                                    bitmap *defs)
{
  tree orig_phi, new_phi;
  tree update_phi, update_phi2;
  tree guard_arg, loop_arg;
  basic_block new_merge_bb = guard_edge->dest;
  edge e = EDGE_SUCC (new_merge_bb, 0);
  basic_block update_bb = e->dest;
  basic_block orig_bb = loop->header;
  edge new_exit_e;
  tree current_new_name;
  tree name;

  /* Create new bb between loop and new_merge_bb.  */
  *new_exit_bb = split_edge (single_exit (loop));

  new_exit_e = EDGE_SUCC (*new_exit_bb, 0);

  /* The header phis of LOOP and the phis of UPDATE_BB are organized in
     exactly the same order, so walk them in lockstep.  */
  for (orig_phi = phi_nodes (orig_bb), update_phi = phi_nodes (update_bb);
       orig_phi && update_phi;
       orig_phi = PHI_CHAIN (orig_phi), update_phi = PHI_CHAIN (update_phi))
    {
      /* Virtual phi; Mark it for renaming.  We actually want to call
	 mark_sym_for_renaming, but since all ssa renaming datastructures
	 are going to be freed before we get to call ssa_update, we just
	 record this name for now in a bitmap, and will mark it for
	 renaming later.  */
      name = PHI_RESULT (orig_phi);
      if (!is_gimple_reg (SSA_NAME_VAR (name)))
        bitmap_set_bit (vect_memsyms_to_rename, DECL_UID (SSA_NAME_VAR (name)));

      /** 1. Handle new-merge-point phis  **/

      /* 1.1. Generate new phi node in NEW_MERGE_BB:  */
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 new_merge_bb);

      /* 1.2. NEW_MERGE_BB has two incoming edges: GUARD_EDGE and the exit-edge
            of LOOP. Set the two phi args in NEW_PHI for these edges:  */
      loop_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, EDGE_SUCC (loop->latch, 0));
      guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, loop_preheader_edge (loop));

      add_phi_arg (new_phi, loop_arg, new_exit_e);
      add_phi_arg (new_phi, guard_arg, guard_edge);

      /* 1.3. Update phi in successor block.  The old argument (which came
	 either around or through the loop) is replaced by the merge phi.  */
      gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi, e) == loop_arg
                  || PHI_ARG_DEF_FROM_EDGE (update_phi, e) == guard_arg);
      SET_PHI_ARG_DEF (update_phi, e->dest_idx, PHI_RESULT (new_phi));
      update_phi2 = new_phi;


      /** 2. Handle loop-closed-ssa-form phis  **/

      /* Virtual names were already recorded for renaming above; only real
	 (gimple reg) names need a loop-closed exit phi.  */
      if (!is_gimple_reg (PHI_RESULT (orig_phi)))
	continue;

      /* 2.1. Generate new phi node in NEW_EXIT_BB:  */
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 *new_exit_bb);

      /* 2.2. NEW_EXIT_BB has one incoming edge: the exit-edge of the loop.  */
      add_phi_arg (new_phi, loop_arg, single_exit (loop));

      /* 2.3. Update phi in successor of NEW_EXIT_BB:  */
      gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, new_exit_e) == loop_arg);
      SET_PHI_ARG_DEF (update_phi2, new_exit_e->dest_idx, PHI_RESULT (new_phi));

      /* 2.4. Record the newly created name with set_current_def.
         We want to find a name such that
                name = get_current_def (orig_loop_name)
         and to set its current definition as follows:
                set_current_def (name, new_phi_name)

         If LOOP is a new loop then loop_arg is already the name we're
         looking for. If LOOP is the original loop, then loop_arg is
         the orig_loop_name and the relevant name is recorded in its
         current reaching definition.  */
      if (is_new_loop)
        current_new_name = loop_arg;
      else
        {
          current_new_name = get_current_def (loop_arg);
	  /* current_def is not available only if the variable does not
	     change inside the loop, in which case we also don't care
	     about recording a current_def for it because we won't be
	     trying to create loop-exit-phis for it.  */
	  if (!current_new_name)
	    continue;
        }
      gcc_assert (get_current_def (current_new_name) == NULL_TREE);

      set_current_def (current_new_name, PHI_RESULT (new_phi));
      bitmap_set_bit (*defs, SSA_NAME_VERSION (current_new_name));
    }

  /* create_phi_node prepends; restore source order of the merge phis so
     they keep matching the header phis positionally.  */
  set_phi_nodes (new_merge_bb, phi_reverse (phi_nodes (new_merge_bb)));
}
583
584
585 /* Function slpeel_update_phi_nodes_for_guard2
586
587 Input:
588 - GUARD_EDGE, LOOP, IS_NEW_LOOP, NEW_EXIT_BB - as explained above.
589
590 In the context of the overall structure, we have:
591
592 loop1_preheader_bb:
593 guard1 (goto loop1/merg1_bb)
594 loop1
595 loop1_exit_bb:
596 guard2 (goto merge1_bb/merge2_bb)
597 merge1_bb
598 LOOP-> loop2
599 loop2_exit_bb
600 merge2_bb
601 next_bb
602
603    For each name used outside the loop (i.e - for each name that has an exit
604    phi in next_bb) we create a new phi in:
605    1. merge2_bb (to account for the edge from guard_bb)
606    2. loop2_exit_bb (an exit-phi to keep LOOP in loop-closed form)
607    3. guard2 bb (an exit phi to keep the preceding loop in loop-closed form),
608    if needed (if it wasn't handled by slpeel_update_phi_nodes_for_guard1).
609 */
610
static void
slpeel_update_phi_nodes_for_guard2 (edge guard_edge, struct loop *loop,
                                    bool is_new_loop, basic_block *new_exit_bb)
{
  tree orig_phi, new_phi;
  tree update_phi, update_phi2;
  tree guard_arg, loop_arg;
  basic_block new_merge_bb = guard_edge->dest;
  edge e = EDGE_SUCC (new_merge_bb, 0);
  basic_block update_bb = e->dest;
  edge new_exit_e;
  tree orig_def, orig_def_new_name;
  tree new_name, new_name2;
  tree arg;

  /* Create new bb between loop and new_merge_bb.  */
  *new_exit_bb = split_edge (single_exit (loop));

  new_exit_e = EDGE_SUCC (*new_exit_bb, 0);

  /* Walk the loop-closed exit phis in UPDATE_BB; each one stands for a
     name used after the loop.  */
  for (update_phi = phi_nodes (update_bb); update_phi;
       update_phi = PHI_CHAIN (update_phi))
    {
      orig_phi = update_phi;
      orig_def = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
      /* This loop-closed-phi actually doesn't represent a use
         out of the loop - the phi arg is a constant.  */
      if (TREE_CODE (orig_def) != SSA_NAME)
        continue;
      orig_def_new_name = get_current_def (orig_def);
      arg = NULL_TREE;

      /** 1. Handle new-merge-point phis  **/

      /* 1.1. Generate new phi node in NEW_MERGE_BB:  */
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 new_merge_bb);

      /* 1.2. NEW_MERGE_BB has two incoming edges: GUARD_EDGE and the exit-edge
            of LOOP. Set the two PHI args in NEW_PHI for these edges:  */
      new_name = orig_def;
      new_name2 = NULL_TREE;
      if (orig_def_new_name)
        {
          new_name = orig_def_new_name;
	  /* Some variables have both loop-entry-phis and loop-exit-phis.
	     Such variables were given yet newer names by phis placed in
	     guard_bb by slpeel_update_phi_nodes_for_guard1.  I.e:
	     new_name2 = get_current_def (get_current_def (orig_name)).  */
          new_name2 = get_current_def (new_name);
        }
  
      /* Depending on which loop is the copy, ORIG_DEF belongs to either
	 the guarded loop or the loop that was skipped.  */
      if (is_new_loop)
        {
          guard_arg = orig_def;
          loop_arg = new_name;
        }
      else
        {
          guard_arg = new_name;
          loop_arg = orig_def;
        }
      if (new_name2)
        guard_arg = new_name2;
  
      add_phi_arg (new_phi, loop_arg, new_exit_e);
      add_phi_arg (new_phi, guard_arg, guard_edge);

      /* 1.3. Update phi in successor block.  */
      gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi, e) == orig_def);
      SET_PHI_ARG_DEF (update_phi, e->dest_idx, PHI_RESULT (new_phi));
      update_phi2 = new_phi;


      /** 2. Handle loop-closed-ssa-form phis  **/

      /* 2.1. Generate new phi node in NEW_EXIT_BB:  */
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 *new_exit_bb);

      /* 2.2. NEW_EXIT_BB has one incoming edge: the exit-edge of the loop.  */
      add_phi_arg (new_phi, loop_arg, single_exit (loop));

      /* 2.3. Update phi in successor of NEW_EXIT_BB:  */
      gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, new_exit_e) == loop_arg);
      SET_PHI_ARG_DEF (update_phi2, new_exit_e->dest_idx, PHI_RESULT (new_phi));


      /** 3. Handle loop-closed-ssa-form phis for first loop  **/

      /* 3.1. Find the relevant names that need an exit-phi in
	 GUARD_BB, i.e. names for which
	 slpeel_update_phi_nodes_for_guard1 had not already created a
	 phi node. This is the case for names that are used outside
	 the loop (and therefore need an exit phi) but are not updated
	 across loop iterations (and therefore don't have a
	 loop-header-phi).

	 slpeel_update_phi_nodes_for_guard1 is responsible for
	 creating loop-exit phis in GUARD_BB for names that have a
	 loop-header-phi.  When such a phi is created we also record
	 the new name in its current definition.  If this new name
	 exists, then guard_arg was set to this new name (see 1.2
	 above).  Therefore, if guard_arg is not this new name, this
	 is an indication that an exit-phi in GUARD_BB was not yet
	 created, so we take care of it here.  */
      if (guard_arg == new_name2)
	continue;
      arg = guard_arg;

      /* 3.2. Generate new phi node in GUARD_BB:  */
      new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
                                 guard_edge->src);

      /* 3.3. GUARD_BB has one incoming edge:  */
      gcc_assert (EDGE_COUNT (guard_edge->src->preds) == 1);
      add_phi_arg (new_phi, arg, EDGE_PRED (guard_edge->src, 0));

      /* 3.4. Update phi in successor of GUARD_BB:  */
      gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, guard_edge)
                                                                == guard_arg);
      SET_PHI_ARG_DEF (update_phi2, guard_edge->dest_idx, PHI_RESULT (new_phi));
    }

  /* create_phi_node prepends; restore source order of the merge phis.  */
  set_phi_nodes (new_merge_bb, phi_reverse (phi_nodes (new_merge_bb)));
}
737
738
739 /* Make the LOOP iterate NITERS times. This is done by adding a new IV
740 that starts at zero, increases by one and its limit is NITERS.
741
742 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
743
744 void
745 slpeel_make_loop_iterate_ntimes (struct loop *loop, tree niters)
746 {
747 tree indx_before_incr, indx_after_incr, cond_stmt, cond;
748 tree orig_cond;
749 edge exit_edge = single_exit (loop);
750 block_stmt_iterator loop_cond_bsi;
751 block_stmt_iterator incr_bsi;
752 bool insert_after;
753 tree init = build_int_cst (TREE_TYPE (niters), 0);
754 tree step = build_int_cst (TREE_TYPE (niters), 1);
755 LOC loop_loc;
756
757 orig_cond = get_loop_exit_condition (loop);
758 gcc_assert (orig_cond);
759 loop_cond_bsi = bsi_for_stmt (orig_cond);
760
761 standard_iv_increment_position (loop, &incr_bsi, &insert_after);
762 create_iv (init, step, NULL_TREE, loop,
763 &incr_bsi, insert_after, &indx_before_incr, &indx_after_incr);
764
765 if (exit_edge->flags & EDGE_TRUE_VALUE) /* 'then' edge exits the loop. */
766 cond = build2 (GE_EXPR, boolean_type_node, indx_after_incr, niters);
767 else /* 'then' edge loops back. */
768 cond = build2 (LT_EXPR, boolean_type_node, indx_after_incr, niters);
769
770 cond_stmt = build3 (COND_EXPR, TREE_TYPE (orig_cond), cond,
771 NULL_TREE, NULL_TREE);
772 bsi_insert_before (&loop_cond_bsi, cond_stmt, BSI_SAME_STMT);
773
774 /* Remove old loop exit test: */
775 bsi_remove (&loop_cond_bsi, true);
776
777 loop_loc = find_loop_location (loop);
778 if (dump_file && (dump_flags & TDF_DETAILS))
779 {
780 if (loop_loc != UNKNOWN_LOC)
781 fprintf (dump_file, "\nloop at %s:%d: ",
782 LOC_FILE (loop_loc), LOC_LINE (loop_loc));
783 print_generic_expr (dump_file, cond_stmt, TDF_SLIM);
784 }
785
786 loop->nb_iterations = niters;
787 }
788
789
790 /* Given LOOP this function generates a new copy of it and puts it
791 on E which is either the entry or exit of LOOP. */
792
struct loop *
slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e)
{
  struct loop *new_loop;
  basic_block *new_bbs, *bbs;
  bool at_exit;
  bool was_imm_dom;
  basic_block exit_dest;
  tree phi, phi_arg;
  edge exit, new_exit;

  /* E must be either the (single) exit edge or the preheader edge of
     LOOP; anything else is rejected.  */
  at_exit = (e == single_exit (loop));
  if (!at_exit && e != loop_preheader_edge (loop))
    return NULL;

  bbs = get_loop_body (loop);

  /* Check whether duplication is possible.  */
  if (!can_copy_bbs_p (bbs, loop->num_nodes))
    {
      free (bbs);
      return NULL;
    }

  /* Generate new loop structure.  */
  new_loop = duplicate_loop (loop, loop_outer (loop));
  if (!new_loop)
    {
      free (bbs);
      return NULL;
    }

  exit_dest = single_exit (loop)->dest;
  /* Remember whether LOOP's header immediately dominated the exit
     destination, so that dominance can be transferred to the copy's
     header after redirection (see the at_exit branch below).  */
  was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
					  exit_dest) == loop->header ?
		 true : false);

  new_bbs = XNEWVEC (basic_block, loop->num_nodes);

  exit = single_exit (loop);
  copy_bbs (bbs, loop->num_nodes, new_bbs,
	    &exit, 1, &new_exit, NULL,
	    e->src);

  /* Duplicating phi args at exit bbs as coming
     also from exit of duplicated loop.  */
  for (phi = phi_nodes (exit_dest); phi; phi = PHI_CHAIN (phi))
    {
      phi_arg = PHI_ARG_DEF_FROM_EDGE (phi, single_exit (loop));
      if (phi_arg)
	{
	  edge new_loop_exit_edge;

	  /* The copied header has two successors; the one that does not
	     go to the latch is the exit edge of the copy.  */
	  if (EDGE_SUCC (new_loop->header, 0)->dest == new_loop->latch)
	    new_loop_exit_edge = EDGE_SUCC (new_loop->header, 1);
	  else
	    new_loop_exit_edge = EDGE_SUCC (new_loop->header, 0);

	  add_phi_arg (phi, phi_arg, new_loop_exit_edge);
	}
    }

  if (at_exit) /* Add the loop copy at exit.  */
    {
      redirect_edge_and_branch_force (e, new_loop->header);
      PENDING_STMT (e) = NULL;
      set_immediate_dominator (CDI_DOMINATORS, new_loop->header, e->src);
      if (was_imm_dom)
	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_loop->header);
    }
  else /* Add the copy at entry.  */
    {
      edge new_exit_e;
      edge entry_e = loop_preheader_edge (loop);
      basic_block preheader = entry_e->src;

      /* Find the exit edge of the copy (the header successor that leaves
	 the copied loop).  */
      if (!flow_bb_inside_loop_p (new_loop,
				  EDGE_SUCC (new_loop->header, 0)->dest))
	new_exit_e = EDGE_SUCC (new_loop->header, 0);
      else
	new_exit_e = EDGE_SUCC (new_loop->header, 1);

      redirect_edge_and_branch_force (new_exit_e, loop->header);
      PENDING_STMT (new_exit_e) = NULL;
      set_immediate_dominator (CDI_DOMINATORS, loop->header,
			       new_exit_e->src);

      /* We have to add phi args to the loop->header here as coming
	 from new_exit_e edge.  */
      for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
	{
	  phi_arg = PHI_ARG_DEF_FROM_EDGE (phi, entry_e);
	  if (phi_arg)
	    add_phi_arg (phi, phi_arg, new_exit_e);
	}

      redirect_edge_and_branch_force (entry_e, new_loop->header);
      PENDING_STMT (entry_e) = NULL;
      set_immediate_dominator (CDI_DOMINATORS, new_loop->header, preheader);
    }

  free (new_bbs);
  free (bbs);

  return new_loop;
}
899
900
/* Given the condition statement COND, put it as the last statement
   of GUARD_BB; EXIT_BB is the basic block to skip the loop;
   DOM_BB is the immediate dominator to assign to EXIT_BB.
   Assumes that this is the single exit of the guarded loop.
   Returns the skip edge (the new GUARD_BB -> EXIT_BB edge).  */

static edge
slpeel_add_loop_guard (basic_block guard_bb, tree cond, basic_block exit_bb,
		       basic_block dom_bb)
{
  block_stmt_iterator bsi;
  edge new_e, enter_e;
  tree cond_stmt;
  tree gimplify_stmt_list;

  /* The existing fallthrough edge out of GUARD_BB becomes the "false"
     (enter-the-loop) edge of the new COND_EXPR.  */
  enter_e = EDGE_SUCC (guard_bb, 0);
  enter_e->flags &= ~EDGE_FALLTHRU;
  enter_e->flags |= EDGE_FALSE_VALUE;
  bsi = bsi_last (guard_bb);

  /* COND may not be in GIMPLE form yet; gimplify it and keep any
     statements this produces so they can be emitted before the COND_EXPR.  */
  cond =
    force_gimple_operand (cond, &gimplify_stmt_list, true,
                          NULL_TREE);
  cond_stmt = build3 (COND_EXPR, void_type_node, cond,
		      NULL_TREE, NULL_TREE);
  if (gimplify_stmt_list)
    bsi_insert_after (&bsi, gimplify_stmt_list, BSI_NEW_STMT);

  /* Re-fetch the iterator: the insertion above may have moved the last
     statement of GUARD_BB.  The COND_EXPR must be the very last stmt.  */
  bsi = bsi_last (guard_bb);
  bsi_insert_after (&bsi, cond_stmt, BSI_NEW_STMT);

  /* Add new edge to connect guard block to the merge/loop-exit block.  */
  new_e = make_edge (guard_bb, exit_bb, EDGE_TRUE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, exit_bb, dom_bb);
  return new_e;
}
936
937
938 /* This function verifies that the following restrictions apply to LOOP:
939 (1) it is innermost
940 (2) it consists of exactly 2 basic blocks - header, and an empty latch.
941 (3) it is single entry, single exit
942 (4) its exit condition is the last stmt in the header
943 (5) E is the entry/exit edge of LOOP.
944 */
945
946 bool
947 slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
948 {
949 edge exit_e = single_exit (loop);
950 edge entry_e = loop_preheader_edge (loop);
951 tree orig_cond = get_loop_exit_condition (loop);
952 block_stmt_iterator loop_exit_bsi = bsi_last (exit_e->src);
953
954 if (need_ssa_update_p ())
955 return false;
956
957 if (loop->inner
958 /* All loops have an outer scope; the only case loop->outer is NULL is for
959 the function itself. */
960 || !loop_outer (loop)
961 || loop->num_nodes != 2
962 || !empty_block_p (loop->latch)
963 || !single_exit (loop)
964 /* Verify that new loop exit condition can be trivially modified. */
965 || (!orig_cond || orig_cond != bsi_stmt (loop_exit_bsi))
966 || (e != exit_e && e != entry_e))
967 return false;
968
969 return true;
970 }
971
#ifdef ENABLE_CHECKING
/* Sanity-check the CFG produced by slpeel_tree_peel_loop_to_edge:
   FIRST_LOOP and SECOND_LOOP are the two loops left after peeling.  */
void
slpeel_verify_cfg_after_peeling (struct loop *first_loop,
                                 struct loop *second_loop)
{
  basic_block loop1_exit_bb = single_exit (first_loop)->dest;
  basic_block loop2_entry_bb = loop_preheader_edge (second_loop)->src;
  basic_block loop1_entry_bb = loop_preheader_edge (first_loop)->src;

  /* A guard that controls whether the second_loop is to be executed or skipped
     is placed in first_loop->exit.  first_loop->exit therefore has two
     successors - one is the preheader of second_loop, and the other is a bb
     after second_loop.
   */
  gcc_assert (EDGE_COUNT (loop1_exit_bb->succs) == 2);

  /* 1. Verify that one of the successors of first_loop->exit is the preheader
        of second_loop.  */

  /* The preheader of new_loop is expected to have two predecessors:
     first_loop->exit and the block that precedes first_loop.  */

  gcc_assert (EDGE_COUNT (loop2_entry_bb->preds) == 2
              && ((EDGE_PRED (loop2_entry_bb, 0)->src == loop1_exit_bb
                   && EDGE_PRED (loop2_entry_bb, 1)->src == loop1_entry_bb)
                  || (EDGE_PRED (loop2_entry_bb, 1)->src == loop1_exit_bb
                      && EDGE_PRED (loop2_entry_bb, 0)->src == loop1_entry_bb)));

  /* Verify that the other successor of first_loop->exit is after the
     second_loop.  */
  /* TODO */
}
#endif
1005
/* If the run time cost model check determines that vectorization is
   not profitable and hence scalar loop should be generated then set
   FIRST_NITERS to prologue peeled iterations.  This will allow all the
   iterations to be executed in the prologue peeled scalar loop.

   Builds, before BB_BEFORE_FIRST_LOOP:

     if (scalar_loop_iterations <= TH)    <-- cond_bb
       tmp = scalar_loop_iterations       <-- then_bb
     phi (tmp, FIRST_NITERS)              <-- in bb_before_first_loop

   NOTE(review): the final "first_niters = PHI_RESULT (newphi)" only
   updates this function's by-value copy of the parameter; the caller's
   FIRST_NITERS tree is unchanged.  Fixing this would require passing
   FIRST_NITERS by address (tree *), which changes the interface —
   confirm against callers before relying on the PHI result here.  */

void
set_prologue_iterations (basic_block bb_before_first_loop,
			 tree first_niters,
			 struct loop *loop,
			 unsigned int th)
{
  edge e;
  basic_block cond_bb, then_bb;
  tree var, prologue_after_cost_adjust_name, stmt;
  block_stmt_iterator bsi;
  tree newphi;
  edge e_true, e_false, e_fallthru;
  tree cond_stmt;
  tree gimplify_stmt_list;
  tree cost_pre_condition = NULL_TREE;
  /* Use an unshared copy of the original (pre-peeling) iteration count.  */
  tree scalar_loop_iters =
    unshare_expr (LOOP_VINFO_NITERS_UNCHANGED (loop_vec_info_for_loop (loop)));

  /* Split the single predecessor edge twice: the first split block holds
     the cost-model condition, the second becomes the "then" block.  */
  e = single_pred_edge (bb_before_first_loop);
  cond_bb = split_edge(e);

  e = single_pred_edge (bb_before_first_loop);
  then_bb = split_edge(e);
  set_immediate_dominator (CDI_DOMINATORS, then_bb, cond_bb);

  /* False edge: condition failed, keep FIRST_NITERS as-is.  */
  e_false = make_single_succ_edge (cond_bb, bb_before_first_loop,
				   EDGE_FALSE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, bb_before_first_loop, cond_bb);

  /* Turn the former fallthrough edge into the "true" edge of the guard.  */
  e_true = EDGE_PRED (then_bb, 0);
  e_true->flags &= ~EDGE_FALLTHRU;
  e_true->flags |= EDGE_TRUE_VALUE;

  e_fallthru = EDGE_SUCC (then_bb, 0);

  /* cost_pre_condition: scalar_loop_iters <= TH.  */
  cost_pre_condition =
    build2 (LE_EXPR, boolean_type_node, scalar_loop_iters,
	    build_int_cst (TREE_TYPE (scalar_loop_iters), th));
  cost_pre_condition =
    force_gimple_operand (cost_pre_condition, &gimplify_stmt_list,
			  true, NULL_TREE);
  cond_stmt = build3 (COND_EXPR, void_type_node, cost_pre_condition,
		      NULL_TREE, NULL_TREE);

  /* Emit any gimplification statements, then the COND_EXPR as the last
     statement of cond_bb (re-fetch the iterator after each insertion).  */
  bsi = bsi_last (cond_bb);
  if (gimplify_stmt_list)
    bsi_insert_after (&bsi, gimplify_stmt_list, BSI_NEW_STMT);

  bsi = bsi_last (cond_bb);
  bsi_insert_after (&bsi, cond_stmt, BSI_NEW_STMT);

  /* In then_bb, copy the unchanged scalar iteration count into a temp.  */
  var = create_tmp_var (TREE_TYPE (scalar_loop_iters),
			"prologue_after_cost_adjust");
  add_referenced_var (var);
  prologue_after_cost_adjust_name =
    force_gimple_operand (scalar_loop_iters, &stmt, false, var);

  bsi = bsi_last (then_bb);
  if (stmt)
    bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);

  /* Merge both values with a PHI in bb_before_first_loop.  */
  newphi = create_phi_node (var, bb_before_first_loop);
  add_phi_arg (newphi, prologue_after_cost_adjust_name, e_fallthru);
  add_phi_arg (newphi, first_niters, e_false);

  /* See NOTE(review) above: this assignment is local to this function.  */
  first_niters = PHI_RESULT (newphi);
}
1078
1079
/* Function slpeel_tree_peel_loop_to_edge.

   Peel the first (last) iterations of LOOP into a new prolog (epilog) loop
   that is placed on the entry (exit) edge E of LOOP. After this transformation
   we have two loops one after the other - first-loop iterates FIRST_NITERS
   times, and second-loop iterates the remainder NITERS - FIRST_NITERS times.
   If the cost model indicates that it is profitable to emit a scalar
   loop instead of the vector one, then the prolog (epilog) loop will iterate
   for the entire unchanged scalar iterations of the loop.

   Input:
   - LOOP: the loop to be peeled.
   - E: the exit or entry edge of LOOP.
        If it is the entry edge, we peel the first iterations of LOOP. In this
        case first-loop is LOOP, and second-loop is the newly created loop.
        If it is the exit edge, we peel the last iterations of LOOP. In this
        case, first-loop is the newly created loop, and second-loop is LOOP.
   - NITERS: the number of iterations that LOOP iterates.
   - FIRST_NITERS: the number of iterations that the first-loop should iterate.
   - UPDATE_FIRST_LOOP_COUNT:  specifies whether this function is responsible
        for updating the loop bound of the first-loop to FIRST_NITERS.  If it
        is false, the caller of this function may want to take care of this
        (this can be useful if we don't want new stmts added to first-loop).
   - TH: cost model profitability threshold of iterations for vectorization.
   - CHECK_PROFITABILITY: specify whether cost model check has not occurred
                          during versioning and hence needs to occur during
			  prologue generation or whether cost model check
			  has not occurred during prologue generation and hence
			  needs to occur during epilogue generation.


   Output:
   The function returns a pointer to the new loop-copy, or NULL if it failed
   to perform the transformation.

   The function generates two if-then-else guards: one before the first loop,
   and the other before the second loop:
   The first guard is:
     if (FIRST_NITERS == 0) then skip the first loop,
     and go directly to the second loop.
   The second guard is:
     if (FIRST_NITERS == NITERS) then skip the second loop.

   FORNOW only simple loops are supported (see slpeel_can_duplicate_loop_p).
   FORNOW the resulting code will not be in loop-closed-ssa form.
*/

struct loop*
slpeel_tree_peel_loop_to_edge (struct loop *loop,
			       edge e, tree first_niters,
			       tree niters, bool update_first_loop_count,
			       unsigned int th, bool check_profitability)
{
  struct loop *new_loop = NULL, *first_loop, *second_loop;
  edge skip_e;
  tree pre_condition = NULL_TREE;
  bitmap definitions;
  basic_block bb_before_second_loop, bb_after_second_loop;
  basic_block bb_before_first_loop;
  basic_block bb_between_loops;
  basic_block new_exit_bb;
  edge exit_e = single_exit (loop);
  LOC loop_loc;
  tree cost_pre_condition = NULL_TREE;

  if (!slpeel_can_duplicate_loop_p (loop, e))
    return NULL;

  /* We have to initialize cfg_hooks. Then, when calling
   cfg_hooks->split_edge, the function tree_split_edge
   is actually called and, when calling cfg_hooks->duplicate_block,
   the function tree_duplicate_bb is called.  */
  tree_register_cfg_hooks ();


  /* 1. Generate a copy of LOOP and put it on E (E is the entry/exit of LOOP).
        Resulting CFG would be:

        first_loop:
        do {
        } while ...

        second_loop:
        do {
        } while ...

        orig_exit_bb:
   */

  if (!(new_loop = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e)))
    {
      loop_loc = find_loop_location (loop);
      if (dump_file && (dump_flags & TDF_DETAILS))
        {
          if (loop_loc != UNKNOWN_LOC)
            fprintf (dump_file, "\n%s:%d: note: ",
                     LOC_FILE (loop_loc), LOC_LINE (loop_loc));
          fprintf (dump_file, "tree_duplicate_loop_to_edge_cfg failed.\n");
        }
      return NULL;
    }

  if (e == exit_e)
    {
      /* NEW_LOOP was placed after LOOP.  */
      first_loop = loop;
      second_loop = new_loop;
    }
  else
    {
      /* NEW_LOOP was placed before LOOP.  */
      first_loop = new_loop;
      second_loop = loop;
    }

  definitions = ssa_names_to_replace ();
  slpeel_update_phis_for_duplicate_loop (loop, new_loop, e == exit_e);
  rename_variables_in_loop (new_loop);


  /* 2.  Add the guard code in one of the following ways:

     2.a Add the guard that controls whether the first loop is executed.
         This occurs when this function is invoked for prologue or epilogue
	 generation and when the cost model check can be done at compile time.

         Resulting CFG would be:

         bb_before_first_loop:
         if (FIRST_NITERS == 0) GOTO bb_before_second_loop
                                GOTO first-loop

         first_loop:
         do {
         } while ...

         bb_before_second_loop:

         second_loop:
         do {
         } while ...

         orig_exit_bb:

     2.b Add the cost model check that allows the prologue
         to iterate for the entire unchanged scalar
         iterations of the loop in the event that the cost
         model indicates that the scalar loop is more
         profitable than the vector one. This occurs when
	 this function is invoked for prologue generation
	 and the cost model check needs to be done at run
	 time.

         Resulting CFG after prologue peeling would be:

         if (scalar_loop_iterations <= th)
           FIRST_NITERS = scalar_loop_iterations

         bb_before_first_loop:
         if (FIRST_NITERS == 0) GOTO bb_before_second_loop
                                GOTO first-loop

         first_loop:
         do {
         } while ...

         bb_before_second_loop:

         second_loop:
         do {
         } while ...

         orig_exit_bb:

     2.c Add the cost model check that allows the epilogue
         to iterate for the entire unchanged scalar
         iterations of the loop in the event that the cost
         model indicates that the scalar loop is more
         profitable than the vector one. This occurs when
	 this function is invoked for epilogue generation
	 and the cost model check needs to be done at run
	 time.

         Resulting CFG after epilogue peeling would be:

         bb_before_first_loop:
         if ((scalar_loop_iterations <= th)
             ||
             FIRST_NITERS == 0) GOTO bb_before_second_loop
                                GOTO first-loop

         first_loop:
         do {
         } while ...

         bb_before_second_loop:

         second_loop:
         do {
         } while ...

         orig_exit_bb:
  */

  bb_before_first_loop = split_edge (loop_preheader_edge (first_loop));
  bb_before_second_loop = split_edge (single_exit (first_loop));

  /* Epilogue peeling.  */
  if (!update_first_loop_count)
    {
      pre_condition =
	fold_build2 (LE_EXPR, boolean_type_node, first_niters,
		     build_int_cst (TREE_TYPE (first_niters), 0));
      if (check_profitability)
	{
	  tree scalar_loop_iters
	    = unshare_expr (LOOP_VINFO_NITERS_UNCHANGED
			    (loop_vec_info_for_loop (loop)));
	  /* Case 2.c: fold the run-time cost check into the guard.  */
	  cost_pre_condition =
	    build2 (LE_EXPR, boolean_type_node, scalar_loop_iters,
		    build_int_cst (TREE_TYPE (scalar_loop_iters), th));

	  pre_condition = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
				       cost_pre_condition, pre_condition);
	}
    }

  /* Prologue peeling.  */
  else
    {
      if (check_profitability)
	/* Case 2.b: emit the run-time cost check before the guard.  */
	set_prologue_iterations (bb_before_first_loop, first_niters,
				 loop, th);

      pre_condition =
	fold_build2 (LE_EXPR, boolean_type_node, first_niters,
		     build_int_cst (TREE_TYPE (first_niters), 0));
    }

  skip_e = slpeel_add_loop_guard (bb_before_first_loop, pre_condition,
                                  bb_before_second_loop, bb_before_first_loop);
  slpeel_update_phi_nodes_for_guard1 (skip_e, first_loop,
				      first_loop == new_loop,
				      &new_exit_bb, &definitions);


  /* 3. Add the guard that controls whether the second loop is executed.
        Resulting CFG would be:

        bb_before_first_loop:
        if (FIRST_NITERS == 0) GOTO bb_before_second_loop (skip first loop)
                               GOTO first-loop

        first_loop:
        do {
        } while ...

        bb_between_loops:
        if (FIRST_NITERS == NITERS) GOTO bb_after_second_loop (skip second loop)
                                    GOTO bb_before_second_loop

        bb_before_second_loop:

        second_loop:
        do {
        } while ...

        bb_after_second_loop:

        orig_exit_bb:
   */

  bb_between_loops = new_exit_bb;
  bb_after_second_loop = split_edge (single_exit (second_loop));

  pre_condition =
	fold_build2 (EQ_EXPR, boolean_type_node, first_niters, niters);
  skip_e = slpeel_add_loop_guard (bb_between_loops, pre_condition,
                                  bb_after_second_loop, bb_before_first_loop);
  slpeel_update_phi_nodes_for_guard2 (skip_e, second_loop,
                                     second_loop == new_loop, &new_exit_bb);

  /* 4. Make first-loop iterate FIRST_NITERS times, if requested.
   */
  if (update_first_loop_count)
    slpeel_make_loop_iterate_ntimes (first_loop, first_niters);

  BITMAP_FREE (definitions);
  delete_update_ssa ();

  return new_loop;
}
1372
1373 /* Function vect_get_loop_location.
1374
1375 Extract the location of the loop in the source code.
1376 If the loop is not well formed for vectorization, an estimated
1377 location is calculated.
1378 Return the loop location if succeed and NULL if not. */
1379
1380 LOC
1381 find_loop_location (struct loop *loop)
1382 {
1383 tree node = NULL_TREE;
1384 basic_block bb;
1385 block_stmt_iterator si;
1386
1387 if (!loop)
1388 return UNKNOWN_LOC;
1389
1390 node = get_loop_exit_condition (loop);
1391
1392 if (node && CAN_HAVE_LOCATION_P (node) && EXPR_HAS_LOCATION (node)
1393 && EXPR_FILENAME (node) && EXPR_LINENO (node))
1394 return EXPR_LOC (node);
1395
1396 /* If we got here the loop is probably not "well formed",
1397 try to estimate the loop location */
1398
1399 if (!loop->header)
1400 return UNKNOWN_LOC;
1401
1402 bb = loop->header;
1403
1404 for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
1405 {
1406 node = bsi_stmt (si);
1407 if (node && CAN_HAVE_LOCATION_P (node) && EXPR_HAS_LOCATION (node))
1408 return EXPR_LOC (node);
1409 }
1410
1411 return UNKNOWN_LOC;
1412 }
1413
1414
1415 /*************************************************************************
1416 Vectorization Debug Information.
1417 *************************************************************************/
1418
1419 /* Function vect_set_verbosity_level.
1420
1421 Called from toplev.c upon detection of the
1422 -ftree-vectorizer-verbose=N option. */
1423
1424 void
1425 vect_set_verbosity_level (const char *val)
1426 {
1427 unsigned int vl;
1428
1429 vl = atoi (val);
1430 if (vl < MAX_VERBOSITY_LEVEL)
1431 vect_verbosity_level = vl;
1432 else
1433 vect_verbosity_level = MAX_VERBOSITY_LEVEL - 1;
1434 }
1435
1436
1437 /* Function vect_set_dump_settings.
1438
1439 Fix the verbosity level of the vectorizer if the
1440 requested level was not set explicitly using the flag
1441 -ftree-vectorizer-verbose=N.
1442 Decide where to print the debugging information (dump_file/stderr).
1443 If the user defined the verbosity level, but there is no dump file,
1444 print to stderr, otherwise print to the dump file. */
1445
1446 static void
1447 vect_set_dump_settings (void)
1448 {
1449 vect_dump = dump_file;
1450
1451 /* Check if the verbosity level was defined by the user: */
1452 if (vect_verbosity_level != MAX_VERBOSITY_LEVEL)
1453 {
1454 /* If there is no dump file, print to stderr. */
1455 if (!dump_file)
1456 vect_dump = stderr;
1457 return;
1458 }
1459
1460 /* User didn't specify verbosity level: */
1461 if (dump_file && (dump_flags & TDF_DETAILS))
1462 vect_verbosity_level = REPORT_DETAILS;
1463 else if (dump_file && (dump_flags & TDF_STATS))
1464 vect_verbosity_level = REPORT_UNVECTORIZED_LOOPS;
1465 else
1466 vect_verbosity_level = REPORT_NONE;
1467
1468 gcc_assert (dump_file || vect_verbosity_level == REPORT_NONE);
1469 }
1470
1471
/* Function vect_print_dump_info.

   For vectorization debug dumps.  Returns true (after emitting a
   "file:line: note: " prefix to VECT_DUMP) if messages of verbosity
   level VL should be printed; false otherwise.  The location used is
   vect_loop_location when known, else the current function's decl.  */

bool
vect_print_dump_info (enum verbosity_levels vl)
{
  if (vl > vect_verbosity_level)
    return false;

  if (!current_function_decl || !vect_dump)
    return false;

  if (vect_loop_location == UNKNOWN_LOC)
    fprintf (vect_dump, "\n%s:%d: note: ",
	     DECL_SOURCE_FILE (current_function_decl),
	     DECL_SOURCE_LINE (current_function_decl));
  else
    fprintf (vect_dump, "\n%s:%d: note: ",
	     LOC_FILE (vect_loop_location), LOC_LINE (vect_loop_location));

  return true;
}
1495
1496
1497 /*************************************************************************
1498 Vectorization Utilities.
1499 *************************************************************************/
1500
1501 /* Function new_stmt_vec_info.
1502
1503 Create and initialize a new stmt_vec_info struct for STMT. */
1504
1505 stmt_vec_info
1506 new_stmt_vec_info (tree stmt, loop_vec_info loop_vinfo)
1507 {
1508 stmt_vec_info res;
1509 res = (stmt_vec_info) xcalloc (1, sizeof (struct _stmt_vec_info));
1510
1511 STMT_VINFO_TYPE (res) = undef_vec_info_type;
1512 STMT_VINFO_STMT (res) = stmt;
1513 STMT_VINFO_LOOP_VINFO (res) = loop_vinfo;
1514 STMT_VINFO_RELEVANT (res) = 0;
1515 STMT_VINFO_LIVE_P (res) = false;
1516 STMT_VINFO_VECTYPE (res) = NULL;
1517 STMT_VINFO_VEC_STMT (res) = NULL;
1518 STMT_VINFO_IN_PATTERN_P (res) = false;
1519 STMT_VINFO_RELATED_STMT (res) = NULL;
1520 STMT_VINFO_DATA_REF (res) = NULL;
1521
1522 STMT_VINFO_DR_BASE_ADDRESS (res) = NULL;
1523 STMT_VINFO_DR_OFFSET (res) = NULL;
1524 STMT_VINFO_DR_INIT (res) = NULL;
1525 STMT_VINFO_DR_STEP (res) = NULL;
1526 STMT_VINFO_DR_ALIGNED_TO (res) = NULL;
1527
1528 if (TREE_CODE (stmt) == PHI_NODE && is_loop_header_bb_p (bb_for_stmt (stmt)))
1529 STMT_VINFO_DEF_TYPE (res) = vect_unknown_def_type;
1530 else
1531 STMT_VINFO_DEF_TYPE (res) = vect_loop_def;
1532 STMT_VINFO_SAME_ALIGN_REFS (res) = VEC_alloc (dr_p, heap, 5);
1533 STMT_VINFO_INSIDE_OF_LOOP_COST (res) = 0;
1534 STMT_VINFO_OUTSIDE_OF_LOOP_COST (res) = 0;
1535 STMT_SLP_TYPE (res) = 0;
1536 DR_GROUP_FIRST_DR (res) = NULL_TREE;
1537 DR_GROUP_NEXT_DR (res) = NULL_TREE;
1538 DR_GROUP_SIZE (res) = 0;
1539 DR_GROUP_STORE_COUNT (res) = 0;
1540 DR_GROUP_GAP (res) = 0;
1541 DR_GROUP_SAME_DR_STMT (res) = NULL_TREE;
1542 DR_GROUP_READ_WRITE_DEPENDENCE (res) = false;
1543
1544 return res;
1545 }
1546
1547
1548 /* Function bb_in_loop_p
1549
1550 Used as predicate for dfs order traversal of the loop bbs. */
1551
1552 static bool
1553 bb_in_loop_p (const_basic_block bb, const void *data)
1554 {
1555 const struct loop *const loop = (const struct loop *)data;
1556 if (flow_bb_inside_loop_p (loop, bb))
1557 return true;
1558 return false;
1559 }
1560
1561
/* Function new_loop_vec_info.

   Create and initialize a new loop_vec_info struct for LOOP, as well as
   stmt_vec_info structs for all the stmts in LOOP.  Stmts belonging to
   an already-analyzed inner loop keep their existing stmt_vec_info but
   are re-pointed at the new loop_vec_info.  */

loop_vec_info
new_loop_vec_info (struct loop *loop)
{
  loop_vec_info res;
  basic_block *bbs;
  block_stmt_iterator si;
  unsigned int i, nbbs;

  res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
  LOOP_VINFO_LOOP (res) = loop;

  bbs = get_loop_body (loop);

  /* Create/Update stmt_info for all stmts in the loop.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];
      tree phi;

      /* BBs in a nested inner-loop will have been already processed (because
	 we will have called vect_analyze_loop_form for any nested inner-loop).
	 Therefore, for stmts in an inner-loop we just want to update the
	 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
	 loop_info of the outer-loop we are currently considering to vectorize
	 (instead of the loop_info of the inner-loop).
	 For stmts in other BBs we need to create a stmt_info from scratch.  */
      if (bb->loop_father != loop)
	{
	  /* Inner-loop bb.  */
	  gcc_assert (loop->inner && bb->loop_father == loop->inner);
	  for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
	    {
	      stmt_vec_info stmt_info = vinfo_for_stmt (phi);
	      loop_vec_info inner_loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
	      /* The existing info must belong to the inner loop.  */
	      gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
	      STMT_VINFO_LOOP_VINFO (stmt_info) = res;
	    }
	  for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
	   {
	      tree stmt = bsi_stmt (si);
	      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
	      loop_vec_info inner_loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
	      gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
	      STMT_VINFO_LOOP_VINFO (stmt_info) = res;
	   }
	}
      else
	{
	  /* bb in current nest.  */
	  for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
	    {
	      stmt_ann_t ann = get_stmt_ann (phi);
	      set_stmt_info (ann, new_stmt_vec_info (phi, res));
	    }

	  for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
	    {
	      tree stmt = bsi_stmt (si);
	      stmt_ann_t ann = stmt_ann (stmt);
	      set_stmt_info (ann, new_stmt_vec_info (stmt, res));
	    }
	}
    }

  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would the same
     as reversed postorder traversal, so we are safe.  */

  /* Replace the get_loop_body ordering with a dfs enumeration.  */
  free (bbs);
  bbs = XCNEWVEC (basic_block, loop->num_nodes);
  nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
			     bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  /* Initialize the remaining loop_vec_info fields (xcalloc already
     zeroed the struct; these make the defaults explicit).  */
  LOOP_VINFO_BBS (res) = bbs;
  LOOP_VINFO_NITERS (res) = NULL;
  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
  LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
  LOOP_VINFO_VECTORIZABLE_P (res) = 0;
  LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
  LOOP_VINFO_VECT_FACTOR (res) = 0;
  LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
  LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
  LOOP_VINFO_UNALIGNED_DR (res) = NULL;
  LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
    VEC_alloc (tree, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
  LOOP_VINFO_MAY_ALIAS_DDRS (res) =
    VEC_alloc (ddr_p, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
  LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (tree, heap, 10);
  LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;

  return res;
}
1662
1663
1664 /* Function destroy_loop_vec_info.
1665
1666 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1667 stmts in the loop. */
1668
1669 void
1670 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1671 {
1672 struct loop *loop;
1673 basic_block *bbs;
1674 int nbbs;
1675 block_stmt_iterator si;
1676 int j;
1677 VEC (slp_instance, heap) *slp_instances;
1678 slp_instance instance;
1679
1680 if (!loop_vinfo)
1681 return;
1682
1683 loop = LOOP_VINFO_LOOP (loop_vinfo);
1684
1685 bbs = LOOP_VINFO_BBS (loop_vinfo);
1686 nbbs = loop->num_nodes;
1687
1688 if (!clean_stmts)
1689 {
1690 free (LOOP_VINFO_BBS (loop_vinfo));
1691 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
1692 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1693 VEC_free (tree, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1694
1695 free (loop_vinfo);
1696 loop->aux = NULL;
1697 return;
1698 }
1699
1700 for (j = 0; j < nbbs; j++)
1701 {
1702 basic_block bb = bbs[j];
1703 tree phi;
1704 stmt_vec_info stmt_info;
1705
1706 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
1707 {
1708 stmt_ann_t ann = stmt_ann (phi);
1709
1710 stmt_info = vinfo_for_stmt (phi);
1711 free (stmt_info);
1712 set_stmt_info (ann, NULL);
1713 }
1714
1715 for (si = bsi_start (bb); !bsi_end_p (si); )
1716 {
1717 tree stmt = bsi_stmt (si);
1718 stmt_ann_t ann = stmt_ann (stmt);
1719 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1720
1721 if (stmt_info)
1722 {
1723 /* Check if this is a "pattern stmt" (introduced by the
1724 vectorizer during the pattern recognition pass). */
1725 bool remove_stmt_p = false;
1726 tree orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1727 if (orig_stmt)
1728 {
1729 stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
1730 if (orig_stmt_info
1731 && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
1732 remove_stmt_p = true;
1733 }
1734
1735 /* Free stmt_vec_info. */
1736 VEC_free (dr_p, heap, STMT_VINFO_SAME_ALIGN_REFS (stmt_info));
1737 free (stmt_info);
1738 set_stmt_info (ann, NULL);
1739
1740 /* Remove dead "pattern stmts". */
1741 if (remove_stmt_p)
1742 bsi_remove (&si, true);
1743 }
1744 bsi_next (&si);
1745 }
1746 }
1747
1748 free (LOOP_VINFO_BBS (loop_vinfo));
1749 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
1750 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1751 VEC_free (tree, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1752 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
1753 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1754 for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
1755 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
1756 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
1757
1758 free (loop_vinfo);
1759 loop->aux = NULL;
1760 }
1761
1762
1763 /* Function vect_force_dr_alignment_p.
1764
1765 Returns whether the alignment of a DECL can be forced to be aligned
1766 on ALIGNMENT bit boundary. */
1767
1768 bool
1769 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
1770 {
1771 if (TREE_CODE (decl) != VAR_DECL)
1772 return false;
1773
1774 if (DECL_EXTERNAL (decl))
1775 return false;
1776
1777 if (TREE_ASM_WRITTEN (decl))
1778 return false;
1779
1780 if (TREE_STATIC (decl))
1781 return (alignment <= MAX_OFILE_ALIGNMENT);
1782 else
1783 /* This used to be PREFERRED_STACK_BOUNDARY, however, that is not 100%
1784 correct until someone implements forced stack alignment. */
1785 return (alignment <= STACK_BOUNDARY);
1786 }
1787
1788
1789 /* Function get_vectype_for_scalar_type.
1790
1791 Returns the vector type corresponding to SCALAR_TYPE as supported
1792 by the target. */
1793
1794 tree
1795 get_vectype_for_scalar_type (tree scalar_type)
1796 {
1797 enum machine_mode inner_mode = TYPE_MODE (scalar_type);
1798 int nbytes = GET_MODE_SIZE (inner_mode);
1799 int nunits;
1800 tree vectype;
1801
1802 if (nbytes == 0 || nbytes >= UNITS_PER_SIMD_WORD)
1803 return NULL_TREE;
1804
1805 /* FORNOW: Only a single vector size per target (UNITS_PER_SIMD_WORD)
1806 is expected. */
1807 nunits = UNITS_PER_SIMD_WORD / nbytes;
1808
1809 vectype = build_vector_type (scalar_type, nunits);
1810 if (vect_print_dump_info (REPORT_DETAILS))
1811 {
1812 fprintf (vect_dump, "get vectype with %d units of type ", nunits);
1813 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
1814 }
1815
1816 if (!vectype)
1817 return NULL_TREE;
1818
1819 if (vect_print_dump_info (REPORT_DETAILS))
1820 {
1821 fprintf (vect_dump, "vectype: ");
1822 print_generic_expr (vect_dump, vectype, TDF_SLIM);
1823 }
1824
1825 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
1826 && !INTEGRAL_MODE_P (TYPE_MODE (vectype)))
1827 {
1828 if (vect_print_dump_info (REPORT_DETAILS))
1829 fprintf (vect_dump, "mode not supported by target.");
1830 return NULL_TREE;
1831 }
1832
1833 return vectype;
1834 }
1835
1836
/* Function vect_supportable_dr_alignment

   Return whether the data reference DR is supported with respect to its
   alignment: dr_aligned, one of the realignment schemes, plain unaligned
   support, or dr_unaligned_unsupported.  */

enum dr_alignment_support
vect_supportable_dr_alignment (struct data_reference *dr)
{
  tree stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  enum machine_mode mode = (int) TYPE_MODE (vectype);
  struct loop *vect_loop = LOOP_VINFO_LOOP (STMT_VINFO_LOOP_VINFO (stmt_info));
  bool nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
  bool invariant_in_outerloop = false;

  if (aligned_access_p (dr))
    return dr_aligned;

  if (nested_in_vect_loop)
    {
      tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
      /* A zero step w.r.t. the outer loop means the address is
	 invariant there.  */
      invariant_in_outerloop =
	(tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD). There are two variants to the explicit
     realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that is will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
          }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
          for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                           // that the misalignment of the initial address is
                           // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
    } */

  if (DR_IS_READ (dr))
    {
      /* Prefer explicit realignment when the target implements
	 vec_realign_load and (if it defines one) provides a mask.  */
      if (optab_handler (vec_realign_load_optab, mode)->insn_code !=
						   	     CODE_FOR_nothing
	  && (!targetm.vectorize.builtin_mask_for_load
	      || targetm.vectorize.builtin_mask_for_load ()))
	{
	  /* In the nested case the optimized scheme is only valid when the
	     inner-loop step equals the vector size (see comment above).  */
	  if (nested_in_vect_loop
	      && TREE_INT_CST_LOW (DR_STEP (dr)) != UNITS_PER_SIMD_WORD)
	    return dr_explicit_realign;
	  else
	    return dr_explicit_realign_optimized;
	}

      if (optab_handler (movmisalign_optab, mode)->insn_code != 
					     CODE_FOR_nothing)
	/* Can't software pipeline the loads, but can at least do them.  */
	return dr_unaligned_supported;
    }

  /* Unsupported.  */
  return dr_unaligned_unsupported;
}
1949
1950
1951 /* Function vect_is_simple_use.
1952
1953 Input:
1954 LOOP - the loop that is being vectorized.
1955 OPERAND - operand of a stmt in LOOP.
1956 DEF - the defining stmt in case OPERAND is an SSA_NAME.
1957
1958 Returns whether a stmt with OPERAND can be vectorized.
1959 Supportable operands are constants, loop invariants, and operands that are
1960 defined by the current iteration of the loop. Unsupportable operands are
1961 those that are defined by a previous iteration of the loop (as is the case
1962 in reduction/induction computations). */
1963
1964 bool
1965 vect_is_simple_use (tree operand, loop_vec_info loop_vinfo, tree *def_stmt,
1966 tree *def, enum vect_def_type *dt)
1967 {
1968 basic_block bb;
1969 stmt_vec_info stmt_vinfo;
1970 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1971
1972 *def_stmt = NULL_TREE;
1973 *def = NULL_TREE;
1974
1975 if (vect_print_dump_info (REPORT_DETAILS))
1976 {
1977 fprintf (vect_dump, "vect_is_simple_use: operand ");
1978 print_generic_expr (vect_dump, operand, TDF_SLIM);
1979 }
1980
1981 if (TREE_CODE (operand) == INTEGER_CST || TREE_CODE (operand) == REAL_CST)
1982 {
1983 *dt = vect_constant_def;
1984 return true;
1985 }
1986 if (is_gimple_min_invariant (operand))
1987 {
1988 *def = operand;
1989 *dt = vect_invariant_def;
1990 return true;
1991 }
1992
1993 if (TREE_CODE (operand) == PAREN_EXPR)
1994 {
1995 if (vect_print_dump_info (REPORT_DETAILS))
1996 fprintf (vect_dump, "non-associatable copy.");
1997 operand = TREE_OPERAND (operand, 0);
1998 }
1999 if (TREE_CODE (operand) != SSA_NAME)
2000 {
2001 if (vect_print_dump_info (REPORT_DETAILS))
2002 fprintf (vect_dump, "not ssa-name.");
2003 return false;
2004 }
2005
2006 *def_stmt = SSA_NAME_DEF_STMT (operand);
2007 if (*def_stmt == NULL_TREE )
2008 {
2009 if (vect_print_dump_info (REPORT_DETAILS))
2010 fprintf (vect_dump, "no def_stmt.");
2011 return false;
2012 }
2013
2014 if (vect_print_dump_info (REPORT_DETAILS))
2015 {
2016 fprintf (vect_dump, "def_stmt: ");
2017 print_generic_expr (vect_dump, *def_stmt, TDF_SLIM);
2018 }
2019
2020 /* empty stmt is expected only in case of a function argument.
2021 (Otherwise - we expect a phi_node or a GIMPLE_MODIFY_STMT). */
2022 if (IS_EMPTY_STMT (*def_stmt))
2023 {
2024 tree arg = TREE_OPERAND (*def_stmt, 0);
2025 if (is_gimple_min_invariant (arg))
2026 {
2027 *def = operand;
2028 *dt = vect_invariant_def;
2029 return true;
2030 }
2031
2032 if (vect_print_dump_info (REPORT_DETAILS))
2033 fprintf (vect_dump, "Unexpected empty stmt.");
2034 return false;
2035 }
2036
2037 bb = bb_for_stmt (*def_stmt);
2038 if (!flow_bb_inside_loop_p (loop, bb))
2039 *dt = vect_invariant_def;
2040 else
2041 {
2042 stmt_vinfo = vinfo_for_stmt (*def_stmt);
2043 *dt = STMT_VINFO_DEF_TYPE (stmt_vinfo);
2044 }
2045
2046 if (*dt == vect_unknown_def_type)
2047 {
2048 if (vect_print_dump_info (REPORT_DETAILS))
2049 fprintf (vect_dump, "Unsupported pattern.");
2050 return false;
2051 }
2052
2053 if (vect_print_dump_info (REPORT_DETAILS))
2054 fprintf (vect_dump, "type of def: %d.",*dt);
2055
2056 switch (TREE_CODE (*def_stmt))
2057 {
2058 case PHI_NODE:
2059 *def = PHI_RESULT (*def_stmt);
2060 break;
2061
2062 case GIMPLE_MODIFY_STMT:
2063 *def = GIMPLE_STMT_OPERAND (*def_stmt, 0);
2064 break;
2065
2066 default:
2067 if (vect_print_dump_info (REPORT_DETAILS))
2068 fprintf (vect_dump, "unsupported defining stmt: ");
2069 return false;
2070 }
2071
2072 return true;
2073 }
2074
2075
/* Function supportable_widening_operation

   Check whether an operation represented by the code CODE is a
   widening operation that is supported by the target platform in
   vector form (i.e., when operating on arguments of type VECTYPE).

   Widening operations we currently support are NOP (CONVERT), FLOAT
   and WIDEN_MULT. This function checks if these operations are supported
   by the target platform either directly (via vector tree-codes), or via
   target builtins.

   Output:
   - CODE1 and CODE2 are codes of vector operations to be used when
   vectorizing the operation, if available.
   - DECL1 and DECL2 are decls of target builtin functions to be used
   when vectorizing the operation, if available. In this case,
   CODE1 and CODE2 are CALL_EXPR. */

bool
supportable_widening_operation (enum tree_code code, tree stmt, tree vectype,
                                tree *decl1, tree *decl2,
                                enum tree_code *code1, enum tree_code *code2)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
  bool ordered_p;
  enum machine_mode vec_mode;
  enum insn_code icode1, icode2;
  optab optab1, optab2;
  tree expr = GIMPLE_STMT_OPERAND (stmt, 1);
  tree type = TREE_TYPE (expr);
  tree wide_vectype = get_vectype_for_scalar_type (type);
  enum tree_code c1, c2;

  /* The result of a vectorized widening operation usually requires two vectors
     (because the widened results do not fit in one vector). The generated
     vector results would normally be expected to be generated in the same
     order as in the original scalar computation. i.e. if 8 results are
     generated in each vector iteration, they are to be organized as follows:
     vect1: [res1,res2,res3,res4], vect2: [res5,res6,res7,res8].

     However, in the special case that the result of the widening operation is
     used in a reduction computation only, the order doesn't matter (because
     when vectorizing a reduction we change the order of the computation).
     Some targets can take advantage of this and generate more efficient code.
     For example, targets like Altivec, that support widen_mult using a sequence
     of {mult_even,mult_odd} generate the following vectors:
     vect1: [res1,res3,res5,res7], vect2: [res2,res4,res6,res8].

     When vectorizing outer-loops, we execute the inner-loop sequentially
     (each vectorized inner-loop iteration contributes to VF outer-loop
     iterations in parallel). We therefore don't allow to change the order
     of the computation in the inner-loop during outer-loop vectorization. */

  if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_by_reduction
      && !nested_in_vect_loop_p (vect_loop, stmt))
    ordered_p = false;
  else
    ordered_p = true;

  /* Unordered case: prefer the target's even/odd widening-multiply
     builtins when both exist for this vector type.  */
  if (!ordered_p
      && code == WIDEN_MULT_EXPR
      && targetm.vectorize.builtin_mul_widen_even
      && targetm.vectorize.builtin_mul_widen_even (vectype)
      && targetm.vectorize.builtin_mul_widen_odd
      && targetm.vectorize.builtin_mul_widen_odd (vectype))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Unordered widening operation detected.");

      *code1 = *code2 = CALL_EXPR;
      *decl1 = targetm.vectorize.builtin_mul_widen_even (vectype);
      *decl2 = targetm.vectorize.builtin_mul_widen_odd (vectype);
      return true;
    }

  /* Pick the HI/LO tree-code pair; which of them produces the first
     half of the results depends on the target's endianness.  */
  switch (code)
    {
    case WIDEN_MULT_EXPR:
      if (BYTES_BIG_ENDIAN)
        {
          c1 = VEC_WIDEN_MULT_HI_EXPR;
          c2 = VEC_WIDEN_MULT_LO_EXPR;
        }
      else
        {
          c2 = VEC_WIDEN_MULT_HI_EXPR;
          c1 = VEC_WIDEN_MULT_LO_EXPR;
        }
      break;

    case NOP_EXPR:
    case CONVERT_EXPR:
      if (BYTES_BIG_ENDIAN)
        {
          c1 = VEC_UNPACK_HI_EXPR;
          c2 = VEC_UNPACK_LO_EXPR;
        }
      else
        {
          c2 = VEC_UNPACK_HI_EXPR;
          c1 = VEC_UNPACK_LO_EXPR;
        }
      break;

    case FLOAT_EXPR:
      if (BYTES_BIG_ENDIAN)
        {
          c1 = VEC_UNPACK_FLOAT_HI_EXPR;
          c2 = VEC_UNPACK_FLOAT_LO_EXPR;
        }
      else
        {
          c2 = VEC_UNPACK_FLOAT_HI_EXPR;
          c1 = VEC_UNPACK_FLOAT_LO_EXPR;
        }
      break;

    case FIX_TRUNC_EXPR:
      /* ??? Not yet implemented due to missing VEC_UNPACK_FIX_TRUNC_HI_EXPR/
         VEC_UNPACK_FIX_TRUNC_LO_EXPR tree codes and optabs used for
         computing the operation. */
      return false;

    default:
      gcc_unreachable ();
    }

  /* NOTE(review): this branch is currently unreachable - FIX_TRUNC_EXPR
     returns false in the switch above.  Presumably kept for when the
     VEC_UNPACK_FIX_TRUNC_* codes are implemented.  */
  if (code == FIX_TRUNC_EXPR)
    {
      /* The signedness is determined from output operand. */
      optab1 = optab_for_tree_code (c1, type);
      optab2 = optab_for_tree_code (c2, type);
    }
  else
    {
      optab1 = optab_for_tree_code (c1, vectype);
      optab2 = optab_for_tree_code (c2, vectype);
    }

  if (!optab1 || !optab2)
    return false;

  /* Both halves must be supported and must produce the wide vector mode.  */
  vec_mode = TYPE_MODE (vectype);
  if ((icode1 = optab_handler (optab1, vec_mode)->insn_code) == CODE_FOR_nothing
      || insn_data[icode1].operand[0].mode != TYPE_MODE (wide_vectype)
      || (icode2 = optab_handler (optab2, vec_mode)->insn_code)
                                                       == CODE_FOR_nothing
      || insn_data[icode2].operand[0].mode != TYPE_MODE (wide_vectype))
    return false;

  *code1 = c1;
  *code2 = c2;
  return true;
}
2232
2233
2234 /* Function supportable_narrowing_operation
2235
2236 Check whether an operation represented by the code CODE is a
2237 narrowing operation that is supported by the target platform in
2238 vector form (i.e., when operating on arguments of type VECTYPE).
2239
2240 Narrowing operations we currently support are NOP (CONVERT) and
2241 FIX_TRUNC. This function checks if these operations are supported by
2242 the target platform directly via vector tree-codes.
2243
2244 Output:
2245 - CODE1 is the code of a vector operation to be used when
2246 vectorizing the operation, if available. */
2247
2248 bool
2249 supportable_narrowing_operation (enum tree_code code,
2250 const_tree stmt, const_tree vectype,
2251 enum tree_code *code1)
2252 {
2253 enum machine_mode vec_mode;
2254 enum insn_code icode1;
2255 optab optab1;
2256 tree expr = GIMPLE_STMT_OPERAND (stmt, 1);
2257 tree type = TREE_TYPE (expr);
2258 tree narrow_vectype = get_vectype_for_scalar_type (type);
2259 enum tree_code c1;
2260
2261 switch (code)
2262 {
2263 case NOP_EXPR:
2264 case CONVERT_EXPR:
2265 c1 = VEC_PACK_TRUNC_EXPR;
2266 break;
2267
2268 case FIX_TRUNC_EXPR:
2269 c1 = VEC_PACK_FIX_TRUNC_EXPR;
2270 break;
2271
2272 case FLOAT_EXPR:
2273 /* ??? Not yet implemented due to missing VEC_PACK_FLOAT_EXPR
2274 tree code and optabs used for computing the operation. */
2275 return false;
2276
2277 default:
2278 gcc_unreachable ();
2279 }
2280
2281 if (code == FIX_TRUNC_EXPR)
2282 /* The signedness is determined from output operand. */
2283 optab1 = optab_for_tree_code (c1, type);
2284 else
2285 optab1 = optab_for_tree_code (c1, vectype);
2286
2287 if (!optab1)
2288 return false;
2289
2290 vec_mode = TYPE_MODE (vectype);
2291 if ((icode1 = optab_handler (optab1, vec_mode)->insn_code) == CODE_FOR_nothing
2292 || insn_data[icode1].operand[0].mode != TYPE_MODE (narrow_vectype))
2293 return false;
2294
2295 *code1 = c1;
2296 return true;
2297 }
2298
2299
2300 /* Function reduction_code_for_scalar_code
2301
2302 Input:
2303 CODE - tree_code of a reduction operations.
2304
2305 Output:
2306 REDUC_CODE - the corresponding tree-code to be used to reduce the
2307 vector of partial results into a single scalar result (which
2308 will also reside in a vector).
2309
2310 Return TRUE if a corresponding REDUC_CODE was found, FALSE otherwise. */
2311
2312 bool
2313 reduction_code_for_scalar_code (enum tree_code code,
2314 enum tree_code *reduc_code)
2315 {
2316 switch (code)
2317 {
2318 case MAX_EXPR:
2319 *reduc_code = REDUC_MAX_EXPR;
2320 return true;
2321
2322 case MIN_EXPR:
2323 *reduc_code = REDUC_MIN_EXPR;
2324 return true;
2325
2326 case PLUS_EXPR:
2327 *reduc_code = REDUC_PLUS_EXPR;
2328 return true;
2329
2330 default:
2331 return false;
2332 }
2333 }
2334
2335
/* Function vect_is_simple_reduction

   Detect a cross-iteration def-use cycle that represents a simple
   reduction computation. We look for the following pattern:

   loop_header:
     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   such that:
   1. operation is commutative and associative and it is safe to
      change the order of the computation.
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation.

   Condition 1 is tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   Returns the stmt that performs the reduction operation (with its
   operands possibly swapped so the reduction variable is second), or
   NULL_TREE if PHI does not head a supported reduction cycle.  */

tree
vect_is_simple_reduction (loop_vec_info loop_info, tree phi)
{
  struct loop *loop = (bb_for_stmt (phi))->loop_father;
  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
  edge latch_e = loop_latch_edge (loop);
  tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
  tree def_stmt, def1, def2;
  enum tree_code code;
  int op_type;
  tree operation, op1, op2;
  tree type;
  int nloop_uses;
  tree name;
  imm_use_iterator imm_iter;
  use_operand_p use_p;

  gcc_assert (loop == vect_loop || flow_loop_nested_p (vect_loop, loop));

  /* Reject if the phi result (a1) has more than one relevant use inside
     the loop (it may only feed the reduction stmt itself).  */
  name = PHI_RESULT (phi);
  nloop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
    {
      tree use_stmt = USE_STMT (use_p);
      if (flow_bb_inside_loop_p (loop, bb_for_stmt (use_stmt))
	  && vinfo_for_stmt (use_stmt)
	  && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
	nloop_uses++;
      if (nloop_uses > 1)
	{
	  if (vect_print_dump_info (REPORT_DETAILS))
	    fprintf (vect_dump, "reduction used in loop.");
	  return NULL_TREE;
	}
    }

  if (TREE_CODE (loop_arg) != SSA_NAME)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: not ssa_name: ");
	  print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
	}
      return NULL_TREE;
    }

  def_stmt = SSA_NAME_DEF_STMT (loop_arg);
  if (!def_stmt)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	fprintf (vect_dump, "reduction: no def_stmt.");
      return NULL_TREE;
    }

  if (TREE_CODE (def_stmt) != GIMPLE_MODIFY_STMT)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
      return NULL_TREE;
    }

  /* Likewise for the value defined by the candidate reduction stmt (a2):
     at most one relevant use inside the loop.  */
  name = GIMPLE_STMT_OPERAND (def_stmt, 0);
  nloop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
    {
      tree use_stmt = USE_STMT (use_p);
      if (flow_bb_inside_loop_p (loop, bb_for_stmt (use_stmt))
	  && vinfo_for_stmt (use_stmt)
	  && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
	nloop_uses++;
      if (nloop_uses > 1)
	{
	  if (vect_print_dump_info (REPORT_DETAILS))
	    fprintf (vect_dump, "reduction used in loop.");
	  return NULL_TREE;
	}
    }

  /* Condition 1: the operation must be commutative and associative.  */
  operation = GIMPLE_STMT_OPERAND (def_stmt, 1);
  code = TREE_CODE (operation);
  if (!commutative_tree_code (code) || !associative_tree_code (code))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: not commutative/associative: ");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return NULL_TREE;
    }

  op_type = TREE_OPERAND_LENGTH (operation);
  if (op_type != binary_op)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: not binary operation: ");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return NULL_TREE;
    }

  op1 = TREE_OPERAND (operation, 0);
  op2 = TREE_OPERAND (operation, 1);
  if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: uses not ssa_names: ");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return NULL_TREE;
    }

  /* Check that it's ok to change the order of the computation.
     Mixed-type reductions are not handled.  */
  type = TREE_TYPE (operation);
  if (TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op1))
      || TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op2)))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: multiple types: operation type: ");
	  print_generic_expr (vect_dump, type, TDF_SLIM);
	  fprintf (vect_dump, ", operands types: ");
	  print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
	  fprintf (vect_dump, ",");
	  print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
	}
      return NULL_TREE;
    }

  /* Generally, when vectorizing a reduction we change the order of the
     computation.  This may change the behavior of the program in some
     cases, so we need to check that this is ok.  One exception is when
     vectorizing an outer-loop: the inner-loop is executed sequentially,
     and therefore vectorizing reductions in the inner-loop during
     outer-loop vectorization is safe.  */

  /* CHECKME: check for !flag_finite_math_only too?  */
  if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
      && !nested_in_vect_loop_p (vect_loop, def_stmt))
    {
      /* Changing the order of operations changes the semantics.  */
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: unsafe fp math optimization: ");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return NULL_TREE;
    }
  else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
	   && !nested_in_vect_loop_p (vect_loop, def_stmt))
    {
      /* Changing the order of operations changes the semantics.  */
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: unsafe int math optimization: ");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return NULL_TREE;
    }
  else if (SAT_FIXED_POINT_TYPE_P (type))
    {
      /* Changing the order of operations changes the semantics.  */
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: unsafe fixed-point math optimization: ");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return NULL_TREE;
    }

  /* reduction is safe. we're dealing with one of the following:
     1) integer arithmetic and no trapv
     2) floating point arithmetic, and special flags permit this optimization.
   */
  def1 = SSA_NAME_DEF_STMT (op1);
  def2 = SSA_NAME_DEF_STMT (op2);
  if (!def1 || !def2 || IS_EMPTY_STMT (def1) || IS_EMPTY_STMT (def2))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: no defs for operands: ");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return NULL_TREE;
    }


  /* Check that one def is the reduction def, defined by PHI,
     the other def is either defined in the loop ("vect_loop_def"),
     or it's an induction (defined by a loop-header phi-node).  */

  if (def2 == phi
      && flow_bb_inside_loop_p (loop, bb_for_stmt (def1))
      && (TREE_CODE (def1) == GIMPLE_MODIFY_STMT
	  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_induction_def
	  || (TREE_CODE (def1) == PHI_NODE
	      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_loop_def
	      && !is_loop_header_bb_p (bb_for_stmt (def1)))))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "detected reduction:");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return def_stmt;
    }
  else if (def1 == phi
	   && flow_bb_inside_loop_p (loop, bb_for_stmt (def2))
	   && (TREE_CODE (def2) == GIMPLE_MODIFY_STMT
	       || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_induction_def
	       || (TREE_CODE (def2) == PHI_NODE
		   && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_loop_def
		   && !is_loop_header_bb_p (bb_for_stmt (def2)))))
    {
      /* Swap operands (just for simplicity - so that the rest of the code
	 can assume that the reduction variable is always the last (second)
	 argument).  */
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "detected reduction: need to swap operands:");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      swap_tree_operands (def_stmt, &TREE_OPERAND (operation, 0),
			  &TREE_OPERAND (operation, 1));
      return def_stmt;
    }
  else
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: unknown pattern.");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return NULL_TREE;
    }
}
2592
2593
2594 /* Function vect_is_simple_iv_evolution.
2595
2596 FORNOW: A simple evolution of an induction variables in the loop is
2597 considered a polynomial evolution with constant step. */
2598
2599 bool
2600 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
2601 tree * step)
2602 {
2603 tree init_expr;
2604 tree step_expr;
2605 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
2606
2607 /* When there is no evolution in this loop, the evolution function
2608 is not "simple". */
2609 if (evolution_part == NULL_TREE)
2610 return false;
2611
2612 /* When the evolution is a polynomial of degree >= 2
2613 the evolution function is not "simple". */
2614 if (tree_is_chrec (evolution_part))
2615 return false;
2616
2617 step_expr = evolution_part;
2618 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
2619
2620 if (vect_print_dump_info (REPORT_DETAILS))
2621 {
2622 fprintf (vect_dump, "step: ");
2623 print_generic_expr (vect_dump, step_expr, TDF_SLIM);
2624 fprintf (vect_dump, ", init: ");
2625 print_generic_expr (vect_dump, init_expr, TDF_SLIM);
2626 }
2627
2628 *init = init_expr;
2629 *step = step_expr;
2630
2631 if (TREE_CODE (step_expr) != INTEGER_CST)
2632 {
2633 if (vect_print_dump_info (REPORT_DETAILS))
2634 fprintf (vect_dump, "step unknown.");
2635 return false;
2636 }
2637
2638 return true;
2639 }
2640
2641
/* Function vectorize_loops.

   Entry Point to loop vectorization phase.  Analyzes every original
   loop and transforms the vectorizable ones.  Returns TODO_cleanup_cfg
   if at least one loop was vectorized, 0 otherwise.  */

unsigned
vectorize_loops (void)
{
  unsigned int i;
  unsigned int num_vectorized_loops = 0;
  unsigned int vect_loops_num;
  loop_iterator li;
  struct loop *loop;

  vect_loops_num = number_of_loops ();

  /* Bail out if there are no loops.  */
  if (vect_loops_num <= 1)
    return 0;

  /* Fix the verbosity level if not defined explicitly by the user.  */
  vect_set_dump_settings ();

  /* Allocate the bitmap that records which virtual variables need
     to be renamed.  */
  vect_memsyms_to_rename = BITMAP_ALLOC (NULL);

  /* ----------- Analyze loops. ----------- */

  /* If some loop was duplicated, it gets bigger number
     than all previously defined loops. This fact allows us to run
     only over initial loops skipping newly generated ones.  */
  FOR_EACH_LOOP (li, loop, 0)
    {
      loop_vec_info loop_vinfo;

      vect_loop_location = find_loop_location (loop);
      loop_vinfo = vect_analyze_loop (loop);
      /* Stash the analysis result on the loop so it can be freed in the
	 finalization pass below.  */
      loop->aux = loop_vinfo;

      if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
	continue;

      vect_transform_loop (loop_vinfo);
      num_vectorized_loops++;
    }
  vect_loop_location = UNKNOWN_LOC;

  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)
      || (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
	  && num_vectorized_loops > 0))
    fprintf (vect_dump, "vectorized %u loops in function.\n",
	     num_vectorized_loops);

  /* ----------- Finalize. ----------- */

  BITMAP_FREE (vect_memsyms_to_rename);

  /* Free the loop_vec_infos stashed in loop->aux above.  Iteration
     starts at 1, matching the <= 1 bail-out above (index 0 is
     presumably the root of the loop tree - confirm against the
     number_of_loops convention).  */
  for (i = 1; i < vect_loops_num; i++)
    {
      loop_vec_info loop_vinfo;

      loop = get_loop (i);
      if (!loop)
	continue;
      loop_vinfo = loop->aux;
      destroy_loop_vec_info (loop_vinfo, true);
      loop->aux = NULL;
    }

  return num_vectorized_loops > 0 ? TODO_cleanup_cfg : 0;
}
2713
2714 /* Increase alignment of global arrays to improve vectorization potential.
2715 TODO:
2716 - Consider also structs that have an array field.
2717 - Use ipa analysis to prune arrays that can't be vectorized?
2718 This should involve global alignment analysis and in the future also
2719 array padding. */
2720
2721 static unsigned int
2722 increase_alignment (void)
2723 {
2724 struct varpool_node *vnode;
2725
2726 /* Increase the alignment of all global arrays for vectorization. */
2727 for (vnode = varpool_nodes_queue;
2728 vnode;
2729 vnode = vnode->next_needed)
2730 {
2731 tree vectype, decl = vnode->decl;
2732 unsigned int alignment;
2733
2734 if (TREE_CODE (TREE_TYPE (decl)) != ARRAY_TYPE)
2735 continue;
2736 vectype = get_vectype_for_scalar_type (TREE_TYPE (TREE_TYPE (decl)));
2737 if (!vectype)
2738 continue;
2739 alignment = TYPE_ALIGN (vectype);
2740 if (DECL_ALIGN (decl) >= alignment)
2741 continue;
2742
2743 if (vect_can_force_dr_alignment_p (decl, alignment))
2744 {
2745 DECL_ALIGN (decl) = TYPE_ALIGN (vectype);
2746 DECL_USER_ALIGN (decl) = 1;
2747 if (dump_file)
2748 {
2749 fprintf (dump_file, "Increasing alignment of decl: ");
2750 print_generic_expr (dump_file, decl, TDF_SLIM);
2751 }
2752 }
2753 }
2754 return 0;
2755 }
2756
2757 static bool
2758 gate_increase_alignment (void)
2759 {
2760 return flag_section_anchors && flag_tree_vectorize;
2761 }
2762
/* Pass descriptor for the simple IPA pass that raises the alignment of
   vectorizable global arrays (see increase_alignment above).  */
struct tree_opt_pass pass_ipa_increase_alignment =
{
  "increase_alignment",			/* name */
  gate_increase_alignment,		/* gate */
  increase_alignment,			/* execute */
  NULL,					/* sub */
  NULL,					/* next */
  0,					/* static_pass_number */
  0,					/* tv_id */
  0,					/* properties_required */
  0,					/* properties_provided */
  0,					/* properties_destroyed */
  0,					/* todo_flags_start */
  0,					/* todo_flags_finish */
  0					/* letter */
};