tree-ssa-reassoc.c (fini_reassoc): Use the statistics infrastructure.
[gcc.git] / gcc / tree-vectorizer.c
1 /* Loop Vectorization
2 Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
10 version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 /* Loop Vectorization Pass.
22
23 This pass tries to vectorize loops. This first implementation focuses on
24 simple inner-most loops, with no conditional control flow, and a set of
25 simple operations which vector form can be expressed using existing
26 tree codes (PLUS, MULT etc).
27
28 For example, the vectorizer transforms the following simple loop:
29
30 short a[N]; short b[N]; short c[N]; int i;
31
32 for (i=0; i<N; i++){
33 a[i] = b[i] + c[i];
34 }
35
36 as if it was manually vectorized by rewriting the source code into:
37
38 typedef int __attribute__((mode(V8HI))) v8hi;
39 short a[N]; short b[N]; short c[N]; int i;
40 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
41 v8hi va, vb, vc;
42
43 for (i=0; i<N/8; i++){
44 vb = pb[i];
45 vc = pc[i];
46 va = vb + vc;
47 pa[i] = va;
48 }
49
50 The main entry to this pass is vectorize_loops(), in which
51 the vectorizer applies a set of analyses on a given set of loops,
52 followed by the actual vectorization transformation for the loops that
53 had successfully passed the analysis phase.
54
55 Throughout this pass we make a distinction between two types of
56 data: scalars (which are represented by SSA_NAMES), and memory references
57 ("data-refs"). These two types of data require different handling both
58 during analysis and transformation. The types of data-refs that the
59 vectorizer currently supports are ARRAY_REFS which base is an array DECL
60 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
61 accesses are required to have a simple (consecutive) access pattern.
62
63 Analysis phase:
64 ===============
65 The driver for the analysis phase is vect_analyze_loop_nest().
66 It applies a set of analyses, some of which rely on the scalar evolution
67 analyzer (scev) developed by Sebastian Pop.
68
69 During the analysis phase the vectorizer records some information
70 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
71 loop, as well as general information about the loop as a whole, which is
72 recorded in a "loop_vec_info" struct attached to each loop.
73
74 Transformation phase:
75 =====================
76 The loop transformation phase scans all the stmts in the loop, and
77 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 78        the loop that needs to be vectorized.  It inserts the vector code sequence
79 just before the scalar stmt S, and records a pointer to the vector code
80 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
81 attached to S). This pointer will be used for the vectorization of following
82 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
83 otherwise, we rely on dead code elimination for removing it.
84
85 For example, say stmt S1 was vectorized into stmt VS1:
86
87 VS1: vb = px[i];
88 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
89 S2: a = b;
90
91 To vectorize stmt S2, the vectorizer first finds the stmt that defines
92 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
93 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
94 resulting sequence would be:
95
96 VS1: vb = px[i];
97 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
98 VS2: va = vb;
99 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
100
101 Operands that are not SSA_NAMEs, are data-refs that appear in
102 load/store operations (like 'x[i]' in S1), and are handled differently.
103
104 Target modeling:
105 =================
106 Currently the only target specific information that is used is the
107 size of the vector (in bytes) - "UNITS_PER_SIMD_WORD". Targets that can
108 support different sizes of vectors, for now will need to specify one value
109 for "UNITS_PER_SIMD_WORD". More flexibility will be added in the future.
110
111 Since we only vectorize operations which vector form can be
112 expressed using existing tree codes, to verify that an operation is
113 supported, the vectorizer checks the relevant optab at the relevant
114 machine_mode (e.g, optab_handler (add_optab, V8HImode)->insn_code). If
115 the value found is CODE_FOR_nothing, then there's no target support, and
116 we can't vectorize the stmt.
117
118 For additional information on this project see:
119 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
120 */
121
122 #include "config.h"
123 #include "system.h"
124 #include "coretypes.h"
125 #include "tm.h"
126 #include "ggc.h"
127 #include "tree.h"
128 #include "target.h"
129 #include "rtl.h"
130 #include "basic-block.h"
131 #include "diagnostic.h"
132 #include "tree-flow.h"
133 #include "tree-dump.h"
134 #include "timevar.h"
135 #include "cfgloop.h"
136 #include "cfglayout.h"
137 #include "expr.h"
138 #include "recog.h"
139 #include "optabs.h"
140 #include "params.h"
141 #include "toplev.h"
142 #include "tree-chrec.h"
143 #include "tree-data-ref.h"
144 #include "tree-scalar-evolution.h"
145 #include "input.h"
146 #include "tree-vectorizer.h"
147 #include "tree-pass.h"
148
149 /*************************************************************************
150 General Vectorization Utilities
151 *************************************************************************/
152
153 /* vect_dump will be set to stderr or dump_file if exist. */
154 FILE *vect_dump;
155
156 /* vect_verbosity_level set to an invalid value
157 to mark that it's uninitialized. */
158 enum verbosity_levels vect_verbosity_level = MAX_VERBOSITY_LEVEL;
159
160 /* Loop location. */
161 static LOC vect_loop_location;
162
163 /* Bitmap of virtual variables to be renamed. */
164 bitmap vect_memsyms_to_rename;
165 \f
166 /*************************************************************************
167 Simple Loop Peeling Utilities
168
169 Utilities to support loop peeling for vectorization purposes.
170 *************************************************************************/
171
172
173 /* Renames the use *OP_P. */
174
175 static void
176 rename_use_op (use_operand_p op_p)
177 {
178 tree new_name;
179
180 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
181 return;
182
183 new_name = get_current_def (USE_FROM_PTR (op_p));
184
185 /* Something defined outside of the loop. */
186 if (!new_name)
187 return;
188
189 /* An ordinary ssa name defined in the loop. */
190
191 SET_USE (op_p, new_name);
192 }
193
194
195 /* Renames the variables in basic block BB. */
196
197 static void
198 rename_variables_in_bb (basic_block bb)
199 {
200 tree phi;
201 block_stmt_iterator bsi;
202 tree stmt;
203 use_operand_p use_p;
204 ssa_op_iter iter;
205 edge e;
206 edge_iterator ei;
207 struct loop *loop = bb->loop_father;
208
209 for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
210 {
211 stmt = bsi_stmt (bsi);
212 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
213 rename_use_op (use_p);
214 }
215
216 FOR_EACH_EDGE (e, ei, bb->succs)
217 {
218 if (!flow_bb_inside_loop_p (loop, e->dest))
219 continue;
220 for (phi = phi_nodes (e->dest); phi; phi = PHI_CHAIN (phi))
221 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (phi, e));
222 }
223 }
224
225
226 /* Renames variables in new generated LOOP. */
227
228 void
229 rename_variables_in_loop (struct loop *loop)
230 {
231 unsigned i;
232 basic_block *bbs;
233
234 bbs = get_loop_body (loop);
235
236 for (i = 0; i < loop->num_nodes; i++)
237 rename_variables_in_bb (bbs[i]);
238
239 free (bbs);
240 }
241
242
243 /* Update the PHI nodes of NEW_LOOP.
244
245 NEW_LOOP is a duplicate of ORIG_LOOP.
246 AFTER indicates whether NEW_LOOP executes before or after ORIG_LOOP:
247 AFTER is true if NEW_LOOP executes after ORIG_LOOP, and false if it
248 executes before it. */
249
250 static void
251 slpeel_update_phis_for_duplicate_loop (struct loop *orig_loop,
252 struct loop *new_loop, bool after)
253 {
254 tree new_ssa_name;
255 tree phi_new, phi_orig;
256 tree def;
257 edge orig_loop_latch = loop_latch_edge (orig_loop);
258 edge orig_entry_e = loop_preheader_edge (orig_loop);
259 edge new_loop_exit_e = single_exit (new_loop);
260 edge new_loop_entry_e = loop_preheader_edge (new_loop);
261 edge entry_arg_e = (after ? orig_loop_latch : orig_entry_e);
262
263 /*
264 step 1. For each loop-header-phi:
265 Add the first phi argument for the phi in NEW_LOOP
266 (the one associated with the entry of NEW_LOOP)
267
268 step 2. For each loop-header-phi:
269 Add the second phi argument for the phi in NEW_LOOP
270 (the one associated with the latch of NEW_LOOP)
271
272 step 3. Update the phis in the successor block of NEW_LOOP.
273
274 case 1: NEW_LOOP was placed before ORIG_LOOP:
275 The successor block of NEW_LOOP is the header of ORIG_LOOP.
276 Updating the phis in the successor block can therefore be done
277 along with the scanning of the loop header phis, because the
278 header blocks of ORIG_LOOP and NEW_LOOP have exactly the same
279 phi nodes, organized in the same order.
280
281 case 2: NEW_LOOP was placed after ORIG_LOOP:
282 The successor block of NEW_LOOP is the original exit block of
283 ORIG_LOOP - the phis to be updated are the loop-closed-ssa phis.
284 We postpone updating these phis to a later stage (when
285 loop guards are added).
286 */
287
288
289 /* Scan the phis in the headers of the old and new loops
290 (they are organized in exactly the same order). */
291
292 for (phi_new = phi_nodes (new_loop->header),
293 phi_orig = phi_nodes (orig_loop->header);
294 phi_new && phi_orig;
295 phi_new = PHI_CHAIN (phi_new), phi_orig = PHI_CHAIN (phi_orig))
296 {
297 /* step 1. */
298 def = PHI_ARG_DEF_FROM_EDGE (phi_orig, entry_arg_e);
299 add_phi_arg (phi_new, def, new_loop_entry_e);
300
301 /* step 2. */
302 def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_loop_latch);
303 if (TREE_CODE (def) != SSA_NAME)
304 continue;
305
306 new_ssa_name = get_current_def (def);
307 if (!new_ssa_name)
308 {
309 /* This only happens if there are no definitions
310 inside the loop. use the phi_result in this case. */
311 new_ssa_name = PHI_RESULT (phi_new);
312 }
313
314 /* An ordinary ssa name defined in the loop. */
315 add_phi_arg (phi_new, new_ssa_name, loop_latch_edge (new_loop));
316
317 /* step 3 (case 1). */
318 if (!after)
319 {
320 gcc_assert (new_loop_exit_e == orig_entry_e);
321 SET_PHI_ARG_DEF (phi_orig,
322 new_loop_exit_e->dest_idx,
323 new_ssa_name);
324 }
325 }
326 }
327
328
329 /* Update PHI nodes for a guard of the LOOP.
330
331 Input:
332 - LOOP, GUARD_EDGE: LOOP is a loop for which we added guard code that
333 controls whether LOOP is to be executed. GUARD_EDGE is the edge that
334 originates from the guard-bb, skips LOOP and reaches the (unique) exit
335 bb of LOOP. This loop-exit-bb is an empty bb with one successor.
336 We denote this bb NEW_MERGE_BB because before the guard code was added
337 it had a single predecessor (the LOOP header), and now it became a merge
338 point of two paths - the path that ends with the LOOP exit-edge, and
339 the path that ends with GUARD_EDGE.
340 - NEW_EXIT_BB: New basic block that is added by this function between LOOP
341 and NEW_MERGE_BB. It is used to place loop-closed-ssa-form exit-phis.
342
343 ===> The CFG before the guard-code was added:
344 LOOP_header_bb:
345 loop_body
346 if (exit_loop) goto update_bb
347 else goto LOOP_header_bb
348 update_bb:
349
350 ==> The CFG after the guard-code was added:
351 guard_bb:
352 if (LOOP_guard_condition) goto new_merge_bb
353 else goto LOOP_header_bb
354 LOOP_header_bb:
355 loop_body
356 if (exit_loop_condition) goto new_merge_bb
357 else goto LOOP_header_bb
358 new_merge_bb:
359 goto update_bb
360 update_bb:
361
362 ==> The CFG after this function:
363 guard_bb:
364 if (LOOP_guard_condition) goto new_merge_bb
365 else goto LOOP_header_bb
366 LOOP_header_bb:
367 loop_body
368 if (exit_loop_condition) goto new_exit_bb
369 else goto LOOP_header_bb
370 new_exit_bb:
371 new_merge_bb:
372 goto update_bb
373 update_bb:
374
375 This function:
376 1. creates and updates the relevant phi nodes to account for the new
377 incoming edge (GUARD_EDGE) into NEW_MERGE_BB. This involves:
378 1.1. Create phi nodes at NEW_MERGE_BB.
379 1.2. Update the phi nodes at the successor of NEW_MERGE_BB (denoted
380 UPDATE_BB). UPDATE_BB was the exit-bb of LOOP before NEW_MERGE_BB
381 2. preserves loop-closed-ssa-form by creating the required phi nodes
382 at the exit of LOOP (i.e, in NEW_EXIT_BB).
383
384 There are two flavors to this function:
385
386 slpeel_update_phi_nodes_for_guard1:
387 Here the guard controls whether we enter or skip LOOP, where LOOP is a
388 prolog_loop (loop1 below), and the new phis created in NEW_MERGE_BB are
389 for variables that have phis in the loop header.
390
391 slpeel_update_phi_nodes_for_guard2:
392 Here the guard controls whether we enter or skip LOOP, where LOOP is an
393 epilog_loop (loop2 below), and the new phis created in NEW_MERGE_BB are
394 for variables that have phis in the loop exit.
395
396 I.E., the overall structure is:
397
398 loop1_preheader_bb:
399 guard1 (goto loop1/merg1_bb)
400 loop1
401 loop1_exit_bb:
402 guard2 (goto merge1_bb/merge2_bb)
403 merge1_bb
404 loop2
405 loop2_exit_bb
406 merge2_bb
407 next_bb
408
409 slpeel_update_phi_nodes_for_guard1 takes care of creating phis in
410 loop1_exit_bb and merge1_bb. These are entry phis (phis for the vars
411 that have phis in loop1->header).
412
413 slpeel_update_phi_nodes_for_guard2 takes care of creating phis in
414 loop2_exit_bb and merge2_bb. These are exit phis (phis for the vars
415 that have phis in next_bb). It also adds some of these phis to
416 loop1_exit_bb.
417
418 slpeel_update_phi_nodes_for_guard1 is always called before
419 slpeel_update_phi_nodes_for_guard2. They are both needed in order
420 to create correct data-flow and loop-closed-ssa-form.
421
422 Generally slpeel_update_phi_nodes_for_guard1 creates phis for variables
423 that change between iterations of a loop (and therefore have a phi-node
424 at the loop entry), whereas slpeel_update_phi_nodes_for_guard2 creates
425 phis for variables that are used out of the loop (and therefore have
426 loop-closed exit phis). Some variables may be both updated between
427 iterations and used after the loop. This is why in loop1_exit_bb we
428 may need both entry_phis (created by slpeel_update_phi_nodes_for_guard1)
429 and exit phis (created by slpeel_update_phi_nodes_for_guard2).
430
431 - IS_NEW_LOOP: if IS_NEW_LOOP is true, then LOOP is a newly created copy of
432 an original loop. i.e., we have:
433
434 orig_loop
435 guard_bb (goto LOOP/new_merge)
436 new_loop <-- LOOP
437 new_exit
438 new_merge
439 next_bb
440
441 If IS_NEW_LOOP is false, then LOOP is an original loop, in which case we
442 have:
443
444 new_loop
445 guard_bb (goto LOOP/new_merge)
446 orig_loop <-- LOOP
447 new_exit
448 new_merge
449 next_bb
450
451 The SSA names defined in the original loop have a current
 452    reaching definition that records the corresponding new
453 ssa-name used in the new duplicated loop copy.
454 */
455
456 /* Function slpeel_update_phi_nodes_for_guard1
457
458 Input:
459 - GUARD_EDGE, LOOP, IS_NEW_LOOP, NEW_EXIT_BB - as explained above.
460 - DEFS - a bitmap of ssa names to mark new names for which we recorded
461 information.
462
463 In the context of the overall structure, we have:
464
465 loop1_preheader_bb:
466 guard1 (goto loop1/merg1_bb)
467 LOOP-> loop1
468 loop1_exit_bb:
469 guard2 (goto merge1_bb/merge2_bb)
470 merge1_bb
471 loop2
472 loop2_exit_bb
473 merge2_bb
474 next_bb
475
476 For each name updated between loop iterations (i.e - for each name that has
477 an entry (loop-header) phi in LOOP) we create a new phi in:
478 1. merge1_bb (to account for the edge from guard1)
479 2. loop1_exit_bb (an exit-phi to keep LOOP in loop-closed form)
480 */
481
482 static void
483 slpeel_update_phi_nodes_for_guard1 (edge guard_edge, struct loop *loop,
484 bool is_new_loop, basic_block *new_exit_bb,
485 bitmap *defs)
486 {
487 tree orig_phi, new_phi;
488 tree update_phi, update_phi2;
489 tree guard_arg, loop_arg;
490 basic_block new_merge_bb = guard_edge->dest;
491 edge e = EDGE_SUCC (new_merge_bb, 0);
492 basic_block update_bb = e->dest;
493 basic_block orig_bb = loop->header;
494 edge new_exit_e;
495 tree current_new_name;
496 tree name;
497
498 /* Create new bb between loop and new_merge_bb. */
499 *new_exit_bb = split_edge (single_exit (loop));
500
501 new_exit_e = EDGE_SUCC (*new_exit_bb, 0);
502
503 for (orig_phi = phi_nodes (orig_bb), update_phi = phi_nodes (update_bb);
504 orig_phi && update_phi;
505 orig_phi = PHI_CHAIN (orig_phi), update_phi = PHI_CHAIN (update_phi))
506 {
507 /* Virtual phi; Mark it for renaming. We actually want to call
508 mar_sym_for_renaming, but since all ssa renaming datastructures
509 are going to be freed before we get to call ssa_upate, we just
510 record this name for now in a bitmap, and will mark it for
511 renaming later. */
512 name = PHI_RESULT (orig_phi);
513 if (!is_gimple_reg (SSA_NAME_VAR (name)))
514 bitmap_set_bit (vect_memsyms_to_rename, DECL_UID (SSA_NAME_VAR (name)));
515
516 /** 1. Handle new-merge-point phis **/
517
518 /* 1.1. Generate new phi node in NEW_MERGE_BB: */
519 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
520 new_merge_bb);
521
522 /* 1.2. NEW_MERGE_BB has two incoming edges: GUARD_EDGE and the exit-edge
523 of LOOP. Set the two phi args in NEW_PHI for these edges: */
524 loop_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, EDGE_SUCC (loop->latch, 0));
525 guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, loop_preheader_edge (loop));
526
527 add_phi_arg (new_phi, loop_arg, new_exit_e);
528 add_phi_arg (new_phi, guard_arg, guard_edge);
529
530 /* 1.3. Update phi in successor block. */
531 gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi, e) == loop_arg
532 || PHI_ARG_DEF_FROM_EDGE (update_phi, e) == guard_arg);
533 SET_PHI_ARG_DEF (update_phi, e->dest_idx, PHI_RESULT (new_phi));
534 update_phi2 = new_phi;
535
536
537 /** 2. Handle loop-closed-ssa-form phis **/
538
539 if (!is_gimple_reg (PHI_RESULT (orig_phi)))
540 continue;
541
542 /* 2.1. Generate new phi node in NEW_EXIT_BB: */
543 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
544 *new_exit_bb);
545
546 /* 2.2. NEW_EXIT_BB has one incoming edge: the exit-edge of the loop. */
547 add_phi_arg (new_phi, loop_arg, single_exit (loop));
548
549 /* 2.3. Update phi in successor of NEW_EXIT_BB: */
550 gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, new_exit_e) == loop_arg);
551 SET_PHI_ARG_DEF (update_phi2, new_exit_e->dest_idx, PHI_RESULT (new_phi));
552
553 /* 2.4. Record the newly created name with set_current_def.
554 We want to find a name such that
555 name = get_current_def (orig_loop_name)
556 and to set its current definition as follows:
557 set_current_def (name, new_phi_name)
558
559 If LOOP is a new loop then loop_arg is already the name we're
560 looking for. If LOOP is the original loop, then loop_arg is
561 the orig_loop_name and the relevant name is recorded in its
562 current reaching definition. */
563 if (is_new_loop)
564 current_new_name = loop_arg;
565 else
566 {
567 current_new_name = get_current_def (loop_arg);
568 /* current_def is not available only if the variable does not
569 change inside the loop, in which case we also don't care
570 about recording a current_def for it because we won't be
571 trying to create loop-exit-phis for it. */
572 if (!current_new_name)
573 continue;
574 }
575 gcc_assert (get_current_def (current_new_name) == NULL_TREE);
576
577 set_current_def (current_new_name, PHI_RESULT (new_phi));
578 bitmap_set_bit (*defs, SSA_NAME_VERSION (current_new_name));
579 }
580
581 set_phi_nodes (new_merge_bb, phi_reverse (phi_nodes (new_merge_bb)));
582 }
583
584
585 /* Function slpeel_update_phi_nodes_for_guard2
586
587 Input:
588 - GUARD_EDGE, LOOP, IS_NEW_LOOP, NEW_EXIT_BB - as explained above.
589
590 In the context of the overall structure, we have:
591
592 loop1_preheader_bb:
593 guard1 (goto loop1/merg1_bb)
594 loop1
595 loop1_exit_bb:
596 guard2 (goto merge1_bb/merge2_bb)
597 merge1_bb
598 LOOP-> loop2
599 loop2_exit_bb
600 merge2_bb
601 next_bb
602
603 For each name used out side the loop (i.e - for each name that has an exit
604 phi in next_bb) we create a new phi in:
605 1. merge2_bb (to account for the edge from guard_bb)
606 2. loop2_exit_bb (an exit-phi to keep LOOP in loop-closed form)
607 3. guard2 bb (an exit phi to keep the preceding loop in loop-closed form),
608 if needed (if it wasn't handled by slpeel_update_phis_nodes_for_phi1).
609 */
610
611 static void
612 slpeel_update_phi_nodes_for_guard2 (edge guard_edge, struct loop *loop,
613 bool is_new_loop, basic_block *new_exit_bb)
614 {
615 tree orig_phi, new_phi;
616 tree update_phi, update_phi2;
617 tree guard_arg, loop_arg;
618 basic_block new_merge_bb = guard_edge->dest;
619 edge e = EDGE_SUCC (new_merge_bb, 0);
620 basic_block update_bb = e->dest;
621 edge new_exit_e;
622 tree orig_def, orig_def_new_name;
623 tree new_name, new_name2;
624 tree arg;
625
626 /* Create new bb between loop and new_merge_bb. */
627 *new_exit_bb = split_edge (single_exit (loop));
628
629 new_exit_e = EDGE_SUCC (*new_exit_bb, 0);
630
631 for (update_phi = phi_nodes (update_bb); update_phi;
632 update_phi = PHI_CHAIN (update_phi))
633 {
634 orig_phi = update_phi;
635 orig_def = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
636 /* This loop-closed-phi actually doesn't represent a use
637 out of the loop - the phi arg is a constant. */
638 if (TREE_CODE (orig_def) != SSA_NAME)
639 continue;
640 orig_def_new_name = get_current_def (orig_def);
641 arg = NULL_TREE;
642
643 /** 1. Handle new-merge-point phis **/
644
645 /* 1.1. Generate new phi node in NEW_MERGE_BB: */
646 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
647 new_merge_bb);
648
649 /* 1.2. NEW_MERGE_BB has two incoming edges: GUARD_EDGE and the exit-edge
650 of LOOP. Set the two PHI args in NEW_PHI for these edges: */
651 new_name = orig_def;
652 new_name2 = NULL_TREE;
653 if (orig_def_new_name)
654 {
655 new_name = orig_def_new_name;
656 /* Some variables have both loop-entry-phis and loop-exit-phis.
657 Such variables were given yet newer names by phis placed in
658 guard_bb by slpeel_update_phi_nodes_for_guard1. I.e:
659 new_name2 = get_current_def (get_current_def (orig_name)). */
660 new_name2 = get_current_def (new_name);
661 }
662
663 if (is_new_loop)
664 {
665 guard_arg = orig_def;
666 loop_arg = new_name;
667 }
668 else
669 {
670 guard_arg = new_name;
671 loop_arg = orig_def;
672 }
673 if (new_name2)
674 guard_arg = new_name2;
675
676 add_phi_arg (new_phi, loop_arg, new_exit_e);
677 add_phi_arg (new_phi, guard_arg, guard_edge);
678
679 /* 1.3. Update phi in successor block. */
680 gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi, e) == orig_def);
681 SET_PHI_ARG_DEF (update_phi, e->dest_idx, PHI_RESULT (new_phi));
682 update_phi2 = new_phi;
683
684
685 /** 2. Handle loop-closed-ssa-form phis **/
686
687 /* 2.1. Generate new phi node in NEW_EXIT_BB: */
688 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
689 *new_exit_bb);
690
691 /* 2.2. NEW_EXIT_BB has one incoming edge: the exit-edge of the loop. */
692 add_phi_arg (new_phi, loop_arg, single_exit (loop));
693
694 /* 2.3. Update phi in successor of NEW_EXIT_BB: */
695 gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, new_exit_e) == loop_arg);
696 SET_PHI_ARG_DEF (update_phi2, new_exit_e->dest_idx, PHI_RESULT (new_phi));
697
698
699 /** 3. Handle loop-closed-ssa-form phis for first loop **/
700
701 /* 3.1. Find the relevant names that need an exit-phi in
702 GUARD_BB, i.e. names for which
703 slpeel_update_phi_nodes_for_guard1 had not already created a
704 phi node. This is the case for names that are used outside
705 the loop (and therefore need an exit phi) but are not updated
706 across loop iterations (and therefore don't have a
707 loop-header-phi).
708
709 slpeel_update_phi_nodes_for_guard1 is responsible for
710 creating loop-exit phis in GUARD_BB for names that have a
711 loop-header-phi. When such a phi is created we also record
712 the new name in its current definition. If this new name
713 exists, then guard_arg was set to this new name (see 1.2
714 above). Therefore, if guard_arg is not this new name, this
715 is an indication that an exit-phi in GUARD_BB was not yet
716 created, so we take care of it here. */
717 if (guard_arg == new_name2)
718 continue;
719 arg = guard_arg;
720
721 /* 3.2. Generate new phi node in GUARD_BB: */
722 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
723 guard_edge->src);
724
725 /* 3.3. GUARD_BB has one incoming edge: */
726 gcc_assert (EDGE_COUNT (guard_edge->src->preds) == 1);
727 add_phi_arg (new_phi, arg, EDGE_PRED (guard_edge->src, 0));
728
729 /* 3.4. Update phi in successor of GUARD_BB: */
730 gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, guard_edge)
731 == guard_arg);
732 SET_PHI_ARG_DEF (update_phi2, guard_edge->dest_idx, PHI_RESULT (new_phi));
733 }
734
735 set_phi_nodes (new_merge_bb, phi_reverse (phi_nodes (new_merge_bb)));
736 }
737
738
739 /* Make the LOOP iterate NITERS times. This is done by adding a new IV
740 that starts at zero, increases by one and its limit is NITERS.
741
742 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
743
744 void
745 slpeel_make_loop_iterate_ntimes (struct loop *loop, tree niters)
746 {
747 tree indx_before_incr, indx_after_incr, cond_stmt, cond;
748 tree orig_cond;
749 edge exit_edge = single_exit (loop);
750 block_stmt_iterator loop_cond_bsi;
751 block_stmt_iterator incr_bsi;
752 bool insert_after;
753 tree init = build_int_cst (TREE_TYPE (niters), 0);
754 tree step = build_int_cst (TREE_TYPE (niters), 1);
755 LOC loop_loc;
756
757 orig_cond = get_loop_exit_condition (loop);
758 gcc_assert (orig_cond);
759 loop_cond_bsi = bsi_for_stmt (orig_cond);
760
761 standard_iv_increment_position (loop, &incr_bsi, &insert_after);
762 create_iv (init, step, NULL_TREE, loop,
763 &incr_bsi, insert_after, &indx_before_incr, &indx_after_incr);
764
765 if (exit_edge->flags & EDGE_TRUE_VALUE) /* 'then' edge exits the loop. */
766 cond = build2 (GE_EXPR, boolean_type_node, indx_after_incr, niters);
767 else /* 'then' edge loops back. */
768 cond = build2 (LT_EXPR, boolean_type_node, indx_after_incr, niters);
769
770 cond_stmt = build3 (COND_EXPR, TREE_TYPE (orig_cond), cond,
771 NULL_TREE, NULL_TREE);
772 bsi_insert_before (&loop_cond_bsi, cond_stmt, BSI_SAME_STMT);
773
774 /* Remove old loop exit test: */
775 bsi_remove (&loop_cond_bsi, true);
776
777 loop_loc = find_loop_location (loop);
778 if (dump_file && (dump_flags & TDF_DETAILS))
779 {
780 if (loop_loc != UNKNOWN_LOC)
781 fprintf (dump_file, "\nloop at %s:%d: ",
782 LOC_FILE (loop_loc), LOC_LINE (loop_loc));
783 print_generic_expr (dump_file, cond_stmt, TDF_SLIM);
784 }
785
786 loop->nb_iterations = niters;
787 }
788
789
790 /* Given LOOP this function generates a new copy of it and puts it
791 on E which is either the entry or exit of LOOP. */
792
793 struct loop *
794 slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e)
795 {
796 struct loop *new_loop;
797 basic_block *new_bbs, *bbs;
798 bool at_exit;
799 bool was_imm_dom;
800 basic_block exit_dest;
801 tree phi, phi_arg;
802 edge exit, new_exit;
803
804 at_exit = (e == single_exit (loop));
805 if (!at_exit && e != loop_preheader_edge (loop))
806 return NULL;
807
808 bbs = get_loop_body (loop);
809
810 /* Check whether duplication is possible. */
811 if (!can_copy_bbs_p (bbs, loop->num_nodes))
812 {
813 free (bbs);
814 return NULL;
815 }
816
817 /* Generate new loop structure. */
818 new_loop = duplicate_loop (loop, loop_outer (loop));
819 if (!new_loop)
820 {
821 free (bbs);
822 return NULL;
823 }
824
825 exit_dest = single_exit (loop)->dest;
826 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
827 exit_dest) == loop->header ?
828 true : false);
829
830 new_bbs = XNEWVEC (basic_block, loop->num_nodes);
831
832 exit = single_exit (loop);
833 copy_bbs (bbs, loop->num_nodes, new_bbs,
834 &exit, 1, &new_exit, NULL,
835 e->src);
836
837 /* Duplicating phi args at exit bbs as coming
838 also from exit of duplicated loop. */
839 for (phi = phi_nodes (exit_dest); phi; phi = PHI_CHAIN (phi))
840 {
841 phi_arg = PHI_ARG_DEF_FROM_EDGE (phi, single_exit (loop));
842 if (phi_arg)
843 {
844 edge new_loop_exit_edge;
845
846 if (EDGE_SUCC (new_loop->header, 0)->dest == new_loop->latch)
847 new_loop_exit_edge = EDGE_SUCC (new_loop->header, 1);
848 else
849 new_loop_exit_edge = EDGE_SUCC (new_loop->header, 0);
850
851 add_phi_arg (phi, phi_arg, new_loop_exit_edge);
852 }
853 }
854
855 if (at_exit) /* Add the loop copy at exit. */
856 {
857 redirect_edge_and_branch_force (e, new_loop->header);
858 PENDING_STMT (e) = NULL;
859 set_immediate_dominator (CDI_DOMINATORS, new_loop->header, e->src);
860 if (was_imm_dom)
861 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_loop->header);
862 }
863 else /* Add the copy at entry. */
864 {
865 edge new_exit_e;
866 edge entry_e = loop_preheader_edge (loop);
867 basic_block preheader = entry_e->src;
868
869 if (!flow_bb_inside_loop_p (new_loop,
870 EDGE_SUCC (new_loop->header, 0)->dest))
871 new_exit_e = EDGE_SUCC (new_loop->header, 0);
872 else
873 new_exit_e = EDGE_SUCC (new_loop->header, 1);
874
875 redirect_edge_and_branch_force (new_exit_e, loop->header);
876 PENDING_STMT (new_exit_e) = NULL;
877 set_immediate_dominator (CDI_DOMINATORS, loop->header,
878 new_exit_e->src);
879
880 /* We have to add phi args to the loop->header here as coming
881 from new_exit_e edge. */
882 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
883 {
884 phi_arg = PHI_ARG_DEF_FROM_EDGE (phi, entry_e);
885 if (phi_arg)
886 add_phi_arg (phi, phi_arg, new_exit_e);
887 }
888
889 redirect_edge_and_branch_force (entry_e, new_loop->header);
890 PENDING_STMT (entry_e) = NULL;
891 set_immediate_dominator (CDI_DOMINATORS, new_loop->header, preheader);
892 }
893
894 free (new_bbs);
895 free (bbs);
896
897 return new_loop;
898 }
899
900
901 /* Given the condition statement COND, put it as the last statement
902 of GUARD_BB; EXIT_BB is the basic block to skip the loop;
903 Assumes that this is the single exit of the guarded loop.
904 Returns the skip edge. */
905
906 static edge
907 slpeel_add_loop_guard (basic_block guard_bb, tree cond, basic_block exit_bb,
908 basic_block dom_bb)
909 {
910 block_stmt_iterator bsi;
911 edge new_e, enter_e;
912 tree cond_stmt;
913 tree gimplify_stmt_list;
914
915 enter_e = EDGE_SUCC (guard_bb, 0);
916 enter_e->flags &= ~EDGE_FALLTHRU;
917 enter_e->flags |= EDGE_FALSE_VALUE;
918 bsi = bsi_last (guard_bb);
919
920 cond =
921 force_gimple_operand (cond, &gimplify_stmt_list, true,
922 NULL_TREE);
923 cond_stmt = build3 (COND_EXPR, void_type_node, cond,
924 NULL_TREE, NULL_TREE);
925 if (gimplify_stmt_list)
926 bsi_insert_after (&bsi, gimplify_stmt_list, BSI_NEW_STMT);
927
928 bsi = bsi_last (guard_bb);
929 bsi_insert_after (&bsi, cond_stmt, BSI_NEW_STMT);
930
931 /* Add new edge to connect guard block to the merge/loop-exit block. */
932 new_e = make_edge (guard_bb, exit_bb, EDGE_TRUE_VALUE);
933 set_immediate_dominator (CDI_DOMINATORS, exit_bb, dom_bb);
934 return new_e;
935 }
936
937
938 /* This function verifies that the following restrictions apply to LOOP:
939 (1) it is innermost
940 (2) it consists of exactly 2 basic blocks - header, and an empty latch.
941 (3) it is single entry, single exit
942 (4) its exit condition is the last stmt in the header
943 (5) E is the entry/exit edge of LOOP.
944 */
945
946 bool
947 slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
948 {
949 edge exit_e = single_exit (loop);
950 edge entry_e = loop_preheader_edge (loop);
951 tree orig_cond = get_loop_exit_condition (loop);
952 block_stmt_iterator loop_exit_bsi = bsi_last (exit_e->src);
953
954 if (need_ssa_update_p ())
955 return false;
956
957 if (loop->inner
958 /* All loops have an outer scope; the only case loop->outer is NULL is for
959 the function itself. */
960 || !loop_outer (loop)
961 || loop->num_nodes != 2
962 || !empty_block_p (loop->latch)
963 || !single_exit (loop)
964 /* Verify that new loop exit condition can be trivially modified. */
965 || (!orig_cond || orig_cond != bsi_stmt (loop_exit_bsi))
966 || (e != exit_e && e != entry_e))
967 return false;
968
969 return true;
970 }
971
#ifdef ENABLE_CHECKING
/* Sanity-check the CFG produced by loop peeling: FIRST_LOOP's exit block
   must hold the guard that either enters SECOND_LOOP or skips it.
   Aborts via gcc_assert on violation.  */
void
slpeel_verify_cfg_after_peeling (struct loop *first_loop,
                                 struct loop *second_loop)
{
  basic_block loop1_exit_bb = single_exit (first_loop)->dest;
  basic_block loop2_entry_bb = loop_preheader_edge (second_loop)->src;
  basic_block loop1_entry_bb = loop_preheader_edge (first_loop)->src;
  basic_block pred0, pred1;

  /* A guard that controls whether second_loop is to be executed or skipped
     is placed in first_loop's exit block.  That block therefore has two
     successors - one is the preheader of second_loop, and the other is a
     bb after second_loop.  */
  gcc_assert (EDGE_COUNT (loop1_exit_bb->succs) == 2);

  /* 1. Verify that one of the successors of first_loop's exit is the
     preheader of second_loop: that preheader is expected to have exactly
     two predecessors - first_loop's exit and the block that precedes
     first_loop (in either order).  */
  gcc_assert (EDGE_COUNT (loop2_entry_bb->preds) == 2);
  pred0 = EDGE_PRED (loop2_entry_bb, 0)->src;
  pred1 = EDGE_PRED (loop2_entry_bb, 1)->src;
  gcc_assert ((pred0 == loop1_exit_bb && pred1 == loop1_entry_bb)
              || (pred1 == loop1_exit_bb && pred0 == loop1_entry_bb));

  /* TODO: verify that the other successor of first_loop's exit lies after
     second_loop.  */
}
#endif
1005
/* If the run time cost model check determines that vectorization is
   not profitable and hence scalar loop should be generated then set
   FIRST_NITERS to prologue peeled iterations. This will allow all the
   iterations to be executed in the prologue peeled scalar loop.  */

void
set_prologue_iterations (basic_block bb_before_first_loop,
			 tree first_niters,
			 struct loop *loop,
			 unsigned int th)
{
  edge e;
  basic_block cond_bb, then_bb;
  tree var, prologue_after_cost_adjust_name, stmt;
  block_stmt_iterator bsi;
  tree newphi;
  edge e_true, e_false, e_fallthru;
  tree cond_stmt;
  tree gimplify_stmt_list;
  tree cost_pre_condition = NULL_TREE;
  /* The loop's original (unchanged) scalar iteration count.  */
  tree scalar_loop_iters =
    unshare_expr (LOOP_VINFO_NITERS_UNCHANGED (loop_vec_info_for_loop (loop)));

  /* Split the single predecessor edge of BB_BEFORE_FIRST_LOOP twice:
     the first split creates the condition block, the second the "then"
     block of the guard.  */
  e = single_pred_edge (bb_before_first_loop);
  cond_bb = split_edge(e);

  e = single_pred_edge (bb_before_first_loop);
  then_bb = split_edge(e);
  set_immediate_dominator (CDI_DOMINATORS, then_bb, cond_bb);

  /* The false edge bypasses THEN_BB and goes straight to the block
     before the first loop.  */
  e_false = make_single_succ_edge (cond_bb, bb_before_first_loop,
				   EDGE_FALSE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, bb_before_first_loop, cond_bb);

  /* Turn the pre-existing fallthru edge into the "condition true" edge
     entering THEN_BB.  */
  e_true = EDGE_PRED (then_bb, 0);
  e_true->flags &= ~EDGE_FALLTHRU;
  e_true->flags |= EDGE_TRUE_VALUE;

  e_fallthru = EDGE_SUCC (then_bb, 0);

  /* Build and insert the guard condition:
     scalar_loop_iters <= TH means the run time cost model rejected
     vectorization.  */
  cost_pre_condition =
    build2 (LE_EXPR, boolean_type_node, scalar_loop_iters,
	    build_int_cst (TREE_TYPE (scalar_loop_iters), th));
  cost_pre_condition =
    force_gimple_operand (cost_pre_condition, &gimplify_stmt_list,
			  true, NULL_TREE);
  cond_stmt = build3 (COND_EXPR, void_type_node, cost_pre_condition,
		      NULL_TREE, NULL_TREE);

  bsi = bsi_last (cond_bb);
  if (gimplify_stmt_list)
    bsi_insert_after (&bsi, gimplify_stmt_list, BSI_NEW_STMT);

  bsi = bsi_last (cond_bb);
  bsi_insert_after (&bsi, cond_stmt, BSI_NEW_STMT);

  /* In THEN_BB materialize the full scalar iteration count into a fresh
     temporary ...  */
  var = create_tmp_var (TREE_TYPE (scalar_loop_iters),
			"prologue_after_cost_adjust");
  add_referenced_var (var);
  prologue_after_cost_adjust_name =
    force_gimple_operand (scalar_loop_iters, &stmt, false, var);

  bsi = bsi_last (then_bb);
  if (stmt)
    bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);

  /* ... and merge it with the incoming FIRST_NITERS via a PHI node in
     BB_BEFORE_FIRST_LOOP.  */
  newphi = create_phi_node (var, bb_before_first_loop);
  add_phi_arg (newphi, prologue_after_cost_adjust_name, e_fallthru);
  add_phi_arg (newphi, first_niters, e_false);

  /* NOTE(review): FIRST_NITERS is passed by value, so this assignment
     updates only the local copy - the PHI result is not propagated to
     the caller through it.  Verify that callers obtain the adjusted
     iteration count some other way (or whether this should take
     "tree *first_niters").  */
  first_niters = PHI_RESULT (newphi);
}
1078
1079
/* Function slpeel_tree_peel_loop_to_edge.

   Peel the first (last) iterations of LOOP into a new prolog (epilog) loop
   that is placed on the entry (exit) edge E of LOOP. After this transformation
   we have two loops one after the other - first-loop iterates FIRST_NITERS
   times, and second-loop iterates the remainder NITERS - FIRST_NITERS times.
   If the cost model indicates that it is profitable to emit a scalar
   loop instead of the vector one, then the prolog (epilog) loop will iterate
   for the entire unchanged scalar iterations of the loop.

   Input:
   - LOOP: the loop to be peeled.
   - E: the exit or entry edge of LOOP.
        If it is the entry edge, we peel the first iterations of LOOP. In this
        case first-loop is LOOP, and second-loop is the newly created loop.
        If it is the exit edge, we peel the last iterations of LOOP. In this
        case, first-loop is the newly created loop, and second-loop is LOOP.
   - NITERS: the number of iterations that LOOP iterates.
   - FIRST_NITERS: the number of iterations that the first-loop should iterate.
   - UPDATE_FIRST_LOOP_COUNT:  specifies whether this function is responsible
        for updating the loop bound of the first-loop to FIRST_NITERS.  If it
        is false, the caller of this function may want to take care of this
        (this can be useful if we don't want new stmts added to first-loop).
   - TH: cost model profitability threshold of iterations for vectorization.
   - CHECK_PROFITABILITY: specifies whether cost model check has not occurred
                          during versioning and hence needs to occur during
                          prologue generation or whether cost model check
                          has not occurred during prologue generation and hence
                          needs to occur during epilogue generation.

   Output:
   The function returns a pointer to the new loop-copy, or NULL if it failed
   to perform the transformation.

   The function generates two if-then-else guards: one before the first loop,
   and the other before the second loop:
   The first guard is:
     if (FIRST_NITERS == 0) then skip the first loop,
     and go directly to the second loop.
   The second guard is:
     if (FIRST_NITERS == NITERS) then skip the second loop.

   FORNOW only simple loops are supported (see slpeel_can_duplicate_loop_p).
   FORNOW the resulting code will not be in loop-closed-ssa form.
*/

struct loop*
slpeel_tree_peel_loop_to_edge (struct loop *loop,
			       edge e, tree first_niters,
			       tree niters, bool update_first_loop_count,
			       unsigned int th, bool check_profitability)
{
  struct loop *new_loop = NULL, *first_loop, *second_loop;
  edge skip_e;
  tree pre_condition = NULL_TREE;
  bitmap definitions;
  basic_block bb_before_second_loop, bb_after_second_loop;
  basic_block bb_before_first_loop;
  basic_block bb_between_loops;
  basic_block new_exit_bb;
  edge exit_e = single_exit (loop);
  LOC loop_loc;
  tree cost_pre_condition = NULL_TREE;

  if (!slpeel_can_duplicate_loop_p (loop, e))
    return NULL;

  /* We have to initialize cfg_hooks. Then, when calling
   cfg_hooks->split_edge, the function tree_split_edge
   is actually called and, when calling cfg_hooks->duplicate_block,
   the function tree_duplicate_bb is called.  */
  tree_register_cfg_hooks ();


  /* 1. Generate a copy of LOOP and put it on E (E is the entry/exit of LOOP).
        Resulting CFG would be:

        first_loop:
        do {
        } while ...

        second_loop:
        do {
        } while ...

        orig_exit_bb:
   */

  if (!(new_loop = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e)))
    {
      loop_loc = find_loop_location (loop);
      if (dump_file && (dump_flags & TDF_DETAILS))
        {
          if (loop_loc != UNKNOWN_LOC)
            fprintf (dump_file, "\n%s:%d: note: ",
                     LOC_FILE (loop_loc), LOC_LINE (loop_loc));
          fprintf (dump_file, "tree_duplicate_loop_to_edge_cfg failed.\n");
        }
      return NULL;
    }

  if (e == exit_e)
    {
      /* NEW_LOOP was placed after LOOP.  */
      first_loop = loop;
      second_loop = new_loop;
    }
  else
    {
      /* NEW_LOOP was placed before LOOP.  */
      first_loop = new_loop;
      second_loop = loop;
    }

  definitions = ssa_names_to_replace ();
  slpeel_update_phis_for_duplicate_loop (loop, new_loop, e == exit_e);
  rename_variables_in_loop (new_loop);


  /* 2.  Add the guard code in one of the following ways:

     2.a Add the guard that controls whether the first loop is executed.
         This occurs when this function is invoked for prologue or epilogue
	 generation and when the cost model check can be done at compile time.

         Resulting CFG would be:

         bb_before_first_loop:
         if (FIRST_NITERS == 0) GOTO bb_before_second_loop
                                GOTO first-loop

         first_loop:
         do {
         } while ...

         bb_before_second_loop:

         second_loop:
         do {
         } while ...

         orig_exit_bb:

     2.b Add the cost model check that allows the prologue
         to iterate for the entire unchanged scalar
         iterations of the loop in the event that the cost
         model indicates that the scalar loop is more
         profitable than the vector one. This occurs when
	 this function is invoked for prologue generation
	 and the cost model check needs to be done at run
	 time.

         Resulting CFG after prologue peeling would be:

         if (scalar_loop_iterations <= th)
           FIRST_NITERS = scalar_loop_iterations

         bb_before_first_loop:
         if (FIRST_NITERS == 0) GOTO bb_before_second_loop
                                GOTO first-loop

         first_loop:
         do {
         } while ...

         bb_before_second_loop:

         second_loop:
         do {
         } while ...

         orig_exit_bb:

     2.c Add the cost model check that allows the epilogue
         to iterate for the entire unchanged scalar
         iterations of the loop in the event that the cost
         model indicates that the scalar loop is more
         profitable than the vector one. This occurs when
	 this function is invoked for epilogue generation
	 and the cost model check needs to be done at run
	 time.

         Resulting CFG after prologue peeling would be:

         bb_before_first_loop:
         if ((scalar_loop_iterations <= th)
             ||
             FIRST_NITERS == 0) GOTO bb_before_second_loop
                                GOTO first-loop

         first_loop:
         do {
         } while ...

         bb_before_second_loop:

         second_loop:
         do {
         } while ...

         orig_exit_bb:
  */

  bb_before_first_loop = split_edge (loop_preheader_edge (first_loop));
  bb_before_second_loop = split_edge (single_exit (first_loop));

  /* Epilogue peeling.  */
  if (!update_first_loop_count)
    {
      pre_condition =
	fold_build2 (LE_EXPR, boolean_type_node, first_niters,
		     build_int_cst (TREE_TYPE (first_niters), 0));
      if (check_profitability)
	{
	  tree scalar_loop_iters
	    = unshare_expr (LOOP_VINFO_NITERS_UNCHANGED
			    (loop_vec_info_for_loop (loop)));
	  cost_pre_condition =
	    build2 (LE_EXPR, boolean_type_node, scalar_loop_iters,
		    build_int_cst (TREE_TYPE (scalar_loop_iters), th));

	  /* Skip the vector epilogue also when the scalar loop is not
	     profitable to vectorize at all (case 2.c above).  */
	  pre_condition = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
				       cost_pre_condition, pre_condition);
	}
    }

  /* Prologue peeling.  */
  else
    {
      if (check_profitability)
	set_prologue_iterations (bb_before_first_loop, first_niters,
				 loop, th);

      pre_condition =
	fold_build2 (LE_EXPR, boolean_type_node, first_niters,
		     build_int_cst (TREE_TYPE (first_niters), 0));
    }

  skip_e = slpeel_add_loop_guard (bb_before_first_loop, pre_condition,
				  bb_before_second_loop, bb_before_first_loop);
  slpeel_update_phi_nodes_for_guard1 (skip_e, first_loop,
				      first_loop == new_loop,
				      &new_exit_bb, &definitions);


  /* 3. Add the guard that controls whether the second loop is executed.
        Resulting CFG would be:

        bb_before_first_loop:
        if (FIRST_NITERS == 0) GOTO bb_before_second_loop (skip first loop)
                               GOTO first-loop

        first_loop:
        do {
        } while ...

        bb_between_loops:
        if (FIRST_NITERS == NITERS) GOTO bb_after_second_loop (skip second loop)
                                    GOTO bb_before_second_loop

        bb_before_second_loop:

        second_loop:
        do {
        } while ...

        bb_after_second_loop:

        orig_exit_bb:
   */

  bb_between_loops = new_exit_bb;
  bb_after_second_loop = split_edge (single_exit (second_loop));

  pre_condition =
	fold_build2 (EQ_EXPR, boolean_type_node, first_niters, niters);
  skip_e = slpeel_add_loop_guard (bb_between_loops, pre_condition,
                                  bb_after_second_loop, bb_before_first_loop);
  slpeel_update_phi_nodes_for_guard2 (skip_e, second_loop,
                                     second_loop == new_loop, &new_exit_bb);

  /* 4. Make first-loop iterate FIRST_NITERS times, if requested.
   */
  if (update_first_loop_count)
    slpeel_make_loop_iterate_ntimes (first_loop, first_niters);

  BITMAP_FREE (definitions);
  delete_update_ssa ();

  return new_loop;
}
1372
1373 /* Function vect_get_loop_location.
1374
1375 Extract the location of the loop in the source code.
1376 If the loop is not well formed for vectorization, an estimated
1377 location is calculated.
1378 Return the loop location if succeed and NULL if not. */
1379
1380 LOC
1381 find_loop_location (struct loop *loop)
1382 {
1383 tree node = NULL_TREE;
1384 basic_block bb;
1385 block_stmt_iterator si;
1386
1387 if (!loop)
1388 return UNKNOWN_LOC;
1389
1390 node = get_loop_exit_condition (loop);
1391
1392 if (node && CAN_HAVE_LOCATION_P (node) && EXPR_HAS_LOCATION (node)
1393 && EXPR_FILENAME (node) && EXPR_LINENO (node))
1394 return EXPR_LOC (node);
1395
1396 /* If we got here the loop is probably not "well formed",
1397 try to estimate the loop location */
1398
1399 if (!loop->header)
1400 return UNKNOWN_LOC;
1401
1402 bb = loop->header;
1403
1404 for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
1405 {
1406 node = bsi_stmt (si);
1407 if (node && CAN_HAVE_LOCATION_P (node) && EXPR_HAS_LOCATION (node))
1408 return EXPR_LOC (node);
1409 }
1410
1411 return UNKNOWN_LOC;
1412 }
1413
1414
1415 /*************************************************************************
1416 Vectorization Debug Information.
1417 *************************************************************************/
1418
1419 /* Function vect_set_verbosity_level.
1420
1421 Called from toplev.c upon detection of the
1422 -ftree-vectorizer-verbose=N option. */
1423
1424 void
1425 vect_set_verbosity_level (const char *val)
1426 {
1427 unsigned int vl;
1428
1429 vl = atoi (val);
1430 if (vl < MAX_VERBOSITY_LEVEL)
1431 vect_verbosity_level = vl;
1432 else
1433 vect_verbosity_level = MAX_VERBOSITY_LEVEL - 1;
1434 }
1435
1436
1437 /* Function vect_set_dump_settings.
1438
1439 Fix the verbosity level of the vectorizer if the
1440 requested level was not set explicitly using the flag
1441 -ftree-vectorizer-verbose=N.
1442 Decide where to print the debugging information (dump_file/stderr).
1443 If the user defined the verbosity level, but there is no dump file,
1444 print to stderr, otherwise print to the dump file. */
1445
1446 static void
1447 vect_set_dump_settings (void)
1448 {
1449 vect_dump = dump_file;
1450
1451 /* Check if the verbosity level was defined by the user: */
1452 if (vect_verbosity_level != MAX_VERBOSITY_LEVEL)
1453 {
1454 /* If there is no dump file, print to stderr. */
1455 if (!dump_file)
1456 vect_dump = stderr;
1457 return;
1458 }
1459
1460 /* User didn't specify verbosity level: */
1461 if (dump_file && (dump_flags & TDF_DETAILS))
1462 vect_verbosity_level = REPORT_DETAILS;
1463 else if (dump_file && (dump_flags & TDF_STATS))
1464 vect_verbosity_level = REPORT_UNVECTORIZED_LOOPS;
1465 else
1466 vect_verbosity_level = REPORT_NONE;
1467
1468 gcc_assert (dump_file || vect_verbosity_level == REPORT_NONE);
1469 }
1470
1471
1472 /* Function debug_loop_details.
1473
1474 For vectorization debug dumps. */
1475
1476 bool
1477 vect_print_dump_info (enum verbosity_levels vl)
1478 {
1479 if (vl > vect_verbosity_level)
1480 return false;
1481
1482 if (!current_function_decl || !vect_dump)
1483 return false;
1484
1485 if (vect_loop_location == UNKNOWN_LOC)
1486 fprintf (vect_dump, "\n%s:%d: note: ",
1487 DECL_SOURCE_FILE (current_function_decl),
1488 DECL_SOURCE_LINE (current_function_decl));
1489 else
1490 fprintf (vect_dump, "\n%s:%d: note: ",
1491 LOC_FILE (vect_loop_location), LOC_LINE (vect_loop_location));
1492
1493 return true;
1494 }
1495
1496
1497 /*************************************************************************
1498 Vectorization Utilities.
1499 *************************************************************************/
1500
/* Function new_stmt_vec_info.

   Create and initialize a new stmt_vec_info struct for STMT, belonging
   to the loop described by LOOP_VINFO.  The caller owns the returned
   struct (freed by free_stmt_vec_info).  */

stmt_vec_info
new_stmt_vec_info (tree stmt, loop_vec_info loop_vinfo)
{
  stmt_vec_info res;
  res = (stmt_vec_info) xcalloc (1, sizeof (struct _stmt_vec_info));

  /* Core bookkeeping fields.  */
  STMT_VINFO_TYPE (res) = undef_vec_info_type;
  STMT_VINFO_STMT (res) = stmt;
  STMT_VINFO_LOOP_VINFO (res) = loop_vinfo;
  STMT_VINFO_RELEVANT (res) = 0;
  STMT_VINFO_LIVE_P (res) = false;
  STMT_VINFO_VECTYPE (res) = NULL;
  STMT_VINFO_VEC_STMT (res) = NULL;
  STMT_VINFO_IN_PATTERN_P (res) = false;
  STMT_VINFO_RELATED_STMT (res) = NULL;
  STMT_VINFO_DATA_REF (res) = NULL;

  /* Data-reference description (filled in by the data-ref analysis).  */
  STMT_VINFO_DR_BASE_ADDRESS (res) = NULL;
  STMT_VINFO_DR_OFFSET (res) = NULL;
  STMT_VINFO_DR_INIT (res) = NULL;
  STMT_VINFO_DR_STEP (res) = NULL;
  STMT_VINFO_DR_ALIGNED_TO (res) = NULL;

  /* PHIs in a loop header may turn out to be inductions/reductions, so
     their def type starts out unknown; everything else is an ordinary
     loop def until classified otherwise.  */
  if (TREE_CODE (stmt) == PHI_NODE && is_loop_header_bb_p (bb_for_stmt (stmt)))
    STMT_VINFO_DEF_TYPE (res) = vect_unknown_def_type;
  else
    STMT_VINFO_DEF_TYPE (res) = vect_loop_def;

  STMT_VINFO_SAME_ALIGN_REFS (res) = VEC_alloc (dr_p, heap, 5);
  STMT_VINFO_INSIDE_OF_LOOP_COST (res) = 0;
  STMT_VINFO_OUTSIDE_OF_LOOP_COST (res) = 0;
  STMT_SLP_TYPE (res) = 0;

  /* Interleaving (data-ref group) information.  */
  DR_GROUP_FIRST_DR (res) = NULL_TREE;
  DR_GROUP_NEXT_DR (res) = NULL_TREE;
  DR_GROUP_SIZE (res) = 0;
  DR_GROUP_STORE_COUNT (res) = 0;
  DR_GROUP_GAP (res) = 0;
  DR_GROUP_SAME_DR_STMT (res) = NULL_TREE;
  DR_GROUP_READ_WRITE_DEPENDENCE (res) = false;

  return res;
}
1546
1547
1548 /* Free stmt vectorization related info. */
1549
1550 void
1551 free_stmt_vec_info (tree stmt)
1552 {
1553 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1554
1555 if (!stmt_info)
1556 return;
1557
1558 VEC_free (dr_p, heap, STMT_VINFO_SAME_ALIGN_REFS (stmt_info));
1559 free (stmt_info);
1560 set_stmt_info (stmt_ann (stmt), NULL);
1561 }
1562
1563
1564 /* Function bb_in_loop_p
1565
1566 Used as predicate for dfs order traversal of the loop bbs. */
1567
1568 static bool
1569 bb_in_loop_p (const_basic_block bb, const void *data)
1570 {
1571 const struct loop *const loop = (const struct loop *)data;
1572 if (flow_bb_inside_loop_p (loop, bb))
1573 return true;
1574 return false;
1575 }
1576
1577
/* Function new_loop_vec_info.

   Create and initialize a new loop_vec_info struct for LOOP, as well as
   stmt_vec_info structs for all the stmts in LOOP.  */

loop_vec_info
new_loop_vec_info (struct loop *loop)
{
  loop_vec_info res;
  basic_block *bbs;
  block_stmt_iterator si;
  unsigned int i, nbbs;

  res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
  LOOP_VINFO_LOOP (res) = loop;

  bbs = get_loop_body (loop);

  /* Create/Update stmt_info for all stmts in the loop.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];
      tree phi;

      /* BBs in a nested inner-loop will have been already processed (because
	 we will have called vect_analyze_loop_form for any nested inner-loop).
	 Therefore, for stmts in an inner-loop we just want to update the
	 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
	 loop_info of the outer-loop we are currently considering to vectorize
	 (instead of the loop_info of the inner-loop).
	 For stmts in other BBs we need to create a stmt_info from scratch.  */
      if (bb->loop_father != loop)
	{
	  /* Inner-loop bb.  */
	  gcc_assert (loop->inner && bb->loop_father == loop->inner);
	  for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
	    {
	      stmt_vec_info stmt_info = vinfo_for_stmt (phi);
	      loop_vec_info inner_loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
	      gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
	      STMT_VINFO_LOOP_VINFO (stmt_info) = res;
	    }
	  for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
	   {
	      tree stmt = bsi_stmt (si);
	      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
	      loop_vec_info inner_loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
	      gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
	      STMT_VINFO_LOOP_VINFO (stmt_info) = res;
	   }
	}
      else
	{
	  /* bb in current nest: attach a fresh stmt_info to every PHI
	     and statement.  */
	  for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
	    {
	      stmt_ann_t ann = get_stmt_ann (phi);
	      set_stmt_info (ann, new_stmt_vec_info (phi, res));
	    }

	  for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
	    {
	      tree stmt = bsi_stmt (si);
	      stmt_ann_t ann = stmt_ann (stmt);
	      set_stmt_info (ann, new_stmt_vec_info (stmt, res));
	    }
	}
    }

  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would the same
     as reversed postorder traversal, so we are safe.  */

  /* Replace the unordered body array with one in dfs order.  */
  free (bbs);
  bbs = XCNEWVEC (basic_block, loop->num_nodes);
  nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
			     bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  /* Initialize the remaining loop-level fields; the vectors' initial
     capacities are heuristic only.  */
  LOOP_VINFO_BBS (res) = bbs;
  LOOP_VINFO_NITERS (res) = NULL;
  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
  LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
  LOOP_VINFO_VECTORIZABLE_P (res) = 0;
  LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
  LOOP_VINFO_VECT_FACTOR (res) = 0;
  LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
  LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
  LOOP_VINFO_UNALIGNED_DR (res) = NULL;
  LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
    VEC_alloc (tree, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
  LOOP_VINFO_MAY_ALIAS_DDRS (res) =
    VEC_alloc (ddr_p, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
  LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (tree, heap, 10);
  LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;

  return res;
}
1678
1679
/* Function destroy_loop_vec_info.

   Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the loop.  When CLEAN_STMTS is false, the per-stmt info is left
   in place and only the loop-level structures are released.  */

void
destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
{
  struct loop *loop;
  basic_block *bbs;
  int nbbs;
  block_stmt_iterator si;
  int j;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;

  if (!loop_vinfo)
    return;

  loop = LOOP_VINFO_LOOP (loop_vinfo);

  bbs = LOOP_VINFO_BBS (loop_vinfo);
  nbbs = loop->num_nodes;

  if (!clean_stmts)
    {
      /* NOTE(review): this early path frees fewer vectors than the full
	 path below (MAY_ALIAS_DDRS, SLP instances and STRIDED_STORES are
	 not released here) - confirm whether that is intentional or a
	 leak.  */
      free (LOOP_VINFO_BBS (loop_vinfo));
      free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
      free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
      VEC_free (tree, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));

      free (loop_vinfo);
      loop->aux = NULL;
      return;
    }

  for (j = 0; j < nbbs; j++)
    {
      basic_block bb = bbs[j];
      tree phi;

      for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
        free_stmt_vec_info (phi);

      for (si = bsi_start (bb); !bsi_end_p (si); )
        {
          tree stmt = bsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          if (stmt_info)
            {
              /* Check if this is a "pattern stmt" (introduced by the
                 vectorizer during the pattern recognition pass).  */
              bool remove_stmt_p = false;
              tree orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
              if (orig_stmt)
                {
                  stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
                  if (orig_stmt_info
                      && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
                    remove_stmt_p = true;
                }

              /* Free stmt_vec_info.  */
              free_stmt_vec_info (stmt);

              /* Remove dead "pattern stmts".  */
              if (remove_stmt_p)
                bsi_remove (&si, true);
            }
          bsi_next (&si);
        }
    }

  free (LOOP_VINFO_BBS (loop_vinfo));
  free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
  free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
  VEC_free (tree, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
  VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
    vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
  VEC_free (tree, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));

  free (loop_vinfo);
  loop->aux = NULL;
}
1768
1769
1770 /* Function vect_force_dr_alignment_p.
1771
1772 Returns whether the alignment of a DECL can be forced to be aligned
1773 on ALIGNMENT bit boundary. */
1774
1775 bool
1776 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
1777 {
1778 if (TREE_CODE (decl) != VAR_DECL)
1779 return false;
1780
1781 if (DECL_EXTERNAL (decl))
1782 return false;
1783
1784 if (TREE_ASM_WRITTEN (decl))
1785 return false;
1786
1787 if (TREE_STATIC (decl))
1788 return (alignment <= MAX_OFILE_ALIGNMENT);
1789 else
1790 /* This used to be PREFERRED_STACK_BOUNDARY, however, that is not 100%
1791 correct until someone implements forced stack alignment. */
1792 return (alignment <= STACK_BOUNDARY);
1793 }
1794
1795
/* Function get_vectype_for_scalar_type.

   Returns the vector type corresponding to SCALAR_TYPE as supported
   by the target, or NULL_TREE when no suitable vector type exists.  */

tree
get_vectype_for_scalar_type (tree scalar_type)
{
  enum machine_mode inner_mode = TYPE_MODE (scalar_type);
  int nbytes = GET_MODE_SIZE (inner_mode);
  int nunits;
  tree vectype;

  /* A scalar at least as wide as the SIMD word cannot be vectorized.  */
  if (nbytes == 0 || nbytes >= UNITS_PER_SIMD_WORD)
    return NULL_TREE;

  /* FORNOW: Only a single vector size per target (UNITS_PER_SIMD_WORD)
     is expected.  */
  nunits = UNITS_PER_SIMD_WORD / nbytes;

  vectype = build_vector_type (scalar_type, nunits);
  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "get vectype with %d units of type ", nunits);
      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
    }

  if (!vectype)
    return NULL_TREE;

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "vectype: ");
      print_generic_expr (vect_dump, vectype, TDF_SLIM);
    }

  /* The target must support the resulting mode either as a true vector
     mode or as an integer mode wide enough to emulate it.  */
  if (!VECTOR_MODE_P (TYPE_MODE (vectype))
      && !INTEGRAL_MODE_P (TYPE_MODE (vectype)))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "mode not supported by target.");
      return NULL_TREE;
    }

  return vectype;
}
1842
1843
1844 /* Function vect_supportable_dr_alignment
1845
1846 Return whether the data reference DR is supported with respect to its
1847 alignment. */
1848
1849 enum dr_alignment_support
1850 vect_supportable_dr_alignment (struct data_reference *dr)
1851 {
1852 tree stmt = DR_STMT (dr);
1853 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1854 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1855 enum machine_mode mode = (int) TYPE_MODE (vectype);
1856 struct loop *vect_loop = LOOP_VINFO_LOOP (STMT_VINFO_LOOP_VINFO (stmt_info));
1857 bool nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
1858 bool invariant_in_outerloop = false;
1859
1860 if (aligned_access_p (dr))
1861 return dr_aligned;
1862
1863 if (nested_in_vect_loop)
1864 {
1865 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
1866 invariant_in_outerloop =
1867 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
1868 }
1869
1870 /* Possibly unaligned access. */
1871
1872 /* We can choose between using the implicit realignment scheme (generating
1873 a misaligned_move stmt) and the explicit realignment scheme (generating
1874 aligned loads with a REALIGN_LOAD). There are two variants to the explicit
1875 realignment scheme: optimized, and unoptimized.
1876 We can optimize the realignment only if the step between consecutive
1877 vector loads is equal to the vector size. Since the vector memory
1878 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
1879 is guaranteed that the misalignment amount remains the same throughout the
1880 execution of the vectorized loop. Therefore, we can create the
1881 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
1882 at the loop preheader.
1883
1884 However, in the case of outer-loop vectorization, when vectorizing a
1885 memory access in the inner-loop nested within the LOOP that is now being
1886 vectorized, while it is guaranteed that the misalignment of the
1887 vectorized memory access will remain the same in different outer-loop
1888 iterations, it is *not* guaranteed that is will remain the same throughout
1889 the execution of the inner-loop. This is because the inner-loop advances
1890 with the original scalar step (and not in steps of VS). If the inner-loop
1891 step happens to be a multiple of VS, then the misalignment remains fixed
1892 and we can use the optimized realignment scheme. For example:
1893
1894 for (i=0; i<N; i++)
1895 for (j=0; j<M; j++)
1896 s += a[i+j];
1897
1898 When vectorizing the i-loop in the above example, the step between
1899 consecutive vector loads is 1, and so the misalignment does not remain
1900 fixed across the execution of the inner-loop, and the realignment cannot
1901 be optimized (as illustrated in the following pseudo vectorized loop):
1902
1903 for (i=0; i<N; i+=4)
1904 for (j=0; j<M; j++){
1905 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
1906 // when j is {0,1,2,3,4,5,6,7,...} respectively.
1907 // (assuming that we start from an aligned address).
1908 }
1909
1910 We therefore have to use the unoptimized realignment scheme:
1911
1912 for (i=0; i<N; i+=4)
1913 for (j=k; j<M; j+=4)
1914 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
1915 // that the misalignment of the initial address is
1916 // 0).
1917
1918 The loop can then be vectorized as follows:
1919
1920 for (k=0; k<4; k++){
1921 rt = get_realignment_token (&vp[k]);
1922 for (i=0; i<N; i+=4){
1923 v1 = vp[i+k];
1924 for (j=k; j<M; j+=4){
1925 v2 = vp[i+j+VS-1];
1926 va = REALIGN_LOAD <v1,v2,rt>;
1927 vs += va;
1928 v1 = v2;
1929 }
1930 }
1931 } */
1932
1933 if (DR_IS_READ (dr))
1934 {
1935 if (optab_handler (vec_realign_load_optab, mode)->insn_code !=
1936 CODE_FOR_nothing
1937 && (!targetm.vectorize.builtin_mask_for_load
1938 || targetm.vectorize.builtin_mask_for_load ()))
1939 {
1940 if (nested_in_vect_loop
1941 && TREE_INT_CST_LOW (DR_STEP (dr)) != UNITS_PER_SIMD_WORD)
1942 return dr_explicit_realign;
1943 else
1944 return dr_explicit_realign_optimized;
1945 }
1946
1947 if (optab_handler (movmisalign_optab, mode)->insn_code !=
1948 CODE_FOR_nothing)
1949 /* Can't software pipeline the loads, but can at least do them. */
1950 return dr_unaligned_supported;
1951 }
1952
1953 /* Unsupported. */
1954 return dr_unaligned_unsupported;
1955 }
1956
1957
/* Function vect_is_simple_use.

   Input:
   LOOP_VINFO - vectorization info for the loop that is being vectorized.
   OPERAND - operand of a stmt in the loop.

   Output:
   DEF_STMT - the defining stmt in case OPERAND is an SSA_NAME.
   DEF - the actual definition (PHI result or RHS), when one exists.
   DT - classification of the definition (constant, invariant, induction,
        reduction, loop def, ...).

   Returns whether a stmt with OPERAND can be vectorized.
   Supportable operands are constants, loop invariants, and operands that are
   defined by the current iteration of the loop.  Unsupportable operands are
   those that are defined by a previous iteration of the loop (as is the case
   in reduction/induction computations).  */

bool
vect_is_simple_use (tree operand, loop_vec_info loop_vinfo, tree *def_stmt,
		    tree *def, enum vect_def_type *dt)
{
  basic_block bb;
  stmt_vec_info stmt_vinfo;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize the outputs so callers see well-defined values even on the
     early-return paths below.  */
  *def_stmt = NULL_TREE;
  *def = NULL_TREE;

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "vect_is_simple_use: operand ");
      print_generic_expr (vect_dump, operand, TDF_SLIM);
    }

  /* Literal constants need no definition statement at all.  */
  if (TREE_CODE (operand) == INTEGER_CST || TREE_CODE (operand) == REAL_CST)
    {
      *dt = vect_constant_def;
      return true;
    }
  /* Other gimple invariants (e.g. addresses of globals) are loop invariant
     by construction.  */
  if (is_gimple_min_invariant (operand))
    {
      *def = operand;
      *dt = vect_invariant_def;
      return true;
    }

  /* Look through a PAREN_EXPR wrapper (a non-associatable copy) to the
     underlying operand.  */
  if (TREE_CODE (operand) == PAREN_EXPR)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "non-associatable copy.");
      operand = TREE_OPERAND (operand, 0);
    }
  /* Anything that is neither an invariant nor an SSA name cannot be
     analyzed further.  */
  if (TREE_CODE (operand) != SSA_NAME)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not ssa-name.");
      return false;
    }

  *def_stmt = SSA_NAME_DEF_STMT (operand);
  if (*def_stmt == NULL_TREE )
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "no def_stmt.");
      return false;
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "def_stmt: ");
      print_generic_expr (vect_dump, *def_stmt, TDF_SLIM);
    }

  /* empty stmt is expected only in case of a function argument.
     (Otherwise - we expect a phi_node or a GIMPLE_MODIFY_STMT).  */
  if (IS_EMPTY_STMT (*def_stmt))
    {
      tree arg = TREE_OPERAND (*def_stmt, 0);
      /* A function argument whose default definition is invariant is
	 treated like any other loop invariant.  */
      if (is_gimple_min_invariant (arg))
        {
          *def = operand;
          *dt = vect_invariant_def;
          return true;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Unexpected empty stmt.");
      return false;
    }

  /* Definitions outside the loop are invariant; for definitions inside
     the loop use the classification recorded during analysis.  */
  bb = bb_for_stmt (*def_stmt);
  if (!flow_bb_inside_loop_p (loop, bb))
    *dt = vect_invariant_def;
  else
    {
      stmt_vinfo = vinfo_for_stmt (*def_stmt);
      *dt = STMT_VINFO_DEF_TYPE (stmt_vinfo);
    }

  if (*dt == vect_unknown_def_type)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Unsupported pattern.");
      return false;
    }

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "type of def: %d.",*dt);

  /* Extract the actual definition from the defining statement.  */
  switch (TREE_CODE (*def_stmt))
    {
    case PHI_NODE:
      *def = PHI_RESULT (*def_stmt);
      break;

    case GIMPLE_MODIFY_STMT:
      *def = GIMPLE_STMT_OPERAND (*def_stmt, 0);
      break;

    default:
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "unsupported defining stmt: ");
      return false;
    }

  return true;
}
2081
2082
2083 /* Function supportable_widening_operation
2084
2085 Check whether an operation represented by the code CODE is a
2086 widening operation that is supported by the target platform in
2087 vector form (i.e., when operating on arguments of type VECTYPE).
2088
2089 Widening operations we currently support are NOP (CONVERT), FLOAT
2090 and WIDEN_MULT. This function checks if these operations are supported
2091 by the target platform either directly (via vector tree-codes), or via
2092 target builtins.
2093
2094 Output:
2095 - CODE1 and CODE2 are codes of vector operations to be used when
2096 vectorizing the operation, if available.
2097 - DECL1 and DECL2 are decls of target builtin functions to be used
2098 when vectorizing the operation, if available. In this case,
2099 CODE1 and CODE2 are CALL_EXPR. */
2100
2101 bool
2102 supportable_widening_operation (enum tree_code code, tree stmt, tree vectype,
2103 tree *decl1, tree *decl2,
2104 enum tree_code *code1, enum tree_code *code2)
2105 {
2106 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2107 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
2108 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2109 bool ordered_p;
2110 enum machine_mode vec_mode;
2111 enum insn_code icode1, icode2;
2112 optab optab1, optab2;
2113 tree expr = GIMPLE_STMT_OPERAND (stmt, 1);
2114 tree type = TREE_TYPE (expr);
2115 tree wide_vectype = get_vectype_for_scalar_type (type);
2116 enum tree_code c1, c2;
2117
2118 /* The result of a vectorized widening operation usually requires two vectors
2119 (because the widened results do not fit int one vector). The generated
2120 vector results would normally be expected to be generated in the same
2121 order as in the original scalar computation. i.e. if 8 results are
2122 generated in each vector iteration, they are to be organized as follows:
2123 vect1: [res1,res2,res3,res4], vect2: [res5,res6,res7,res8].
2124
2125 However, in the special case that the result of the widening operation is
2126 used in a reduction computation only, the order doesn't matter (because
2127 when vectorizing a reduction we change the order of the computation).
2128 Some targets can take advantage of this and generate more efficient code.
2129 For example, targets like Altivec, that support widen_mult using a sequence
2130 of {mult_even,mult_odd} generate the following vectors:
2131 vect1: [res1,res3,res5,res7], vect2: [res2,res4,res6,res8].
2132
2133 When vectorizaing outer-loops, we execute the inner-loop sequentially
2134 (each vectorized inner-loop iteration contributes to VF outer-loop
2135 iterations in parallel). We therefore don't allow to change the order
2136 of the computation in the inner-loop during outer-loop vectorization. */
2137
2138 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_by_reduction
2139 && !nested_in_vect_loop_p (vect_loop, stmt))
2140 ordered_p = false;
2141 else
2142 ordered_p = true;
2143
2144 if (!ordered_p
2145 && code == WIDEN_MULT_EXPR
2146 && targetm.vectorize.builtin_mul_widen_even
2147 && targetm.vectorize.builtin_mul_widen_even (vectype)
2148 && targetm.vectorize.builtin_mul_widen_odd
2149 && targetm.vectorize.builtin_mul_widen_odd (vectype))
2150 {
2151 if (vect_print_dump_info (REPORT_DETAILS))
2152 fprintf (vect_dump, "Unordered widening operation detected.");
2153
2154 *code1 = *code2 = CALL_EXPR;
2155 *decl1 = targetm.vectorize.builtin_mul_widen_even (vectype);
2156 *decl2 = targetm.vectorize.builtin_mul_widen_odd (vectype);
2157 return true;
2158 }
2159
2160 switch (code)
2161 {
2162 case WIDEN_MULT_EXPR:
2163 if (BYTES_BIG_ENDIAN)
2164 {
2165 c1 = VEC_WIDEN_MULT_HI_EXPR;
2166 c2 = VEC_WIDEN_MULT_LO_EXPR;
2167 }
2168 else
2169 {
2170 c2 = VEC_WIDEN_MULT_HI_EXPR;
2171 c1 = VEC_WIDEN_MULT_LO_EXPR;
2172 }
2173 break;
2174
2175 CASE_CONVERT:
2176 if (BYTES_BIG_ENDIAN)
2177 {
2178 c1 = VEC_UNPACK_HI_EXPR;
2179 c2 = VEC_UNPACK_LO_EXPR;
2180 }
2181 else
2182 {
2183 c2 = VEC_UNPACK_HI_EXPR;
2184 c1 = VEC_UNPACK_LO_EXPR;
2185 }
2186 break;
2187
2188 case FLOAT_EXPR:
2189 if (BYTES_BIG_ENDIAN)
2190 {
2191 c1 = VEC_UNPACK_FLOAT_HI_EXPR;
2192 c2 = VEC_UNPACK_FLOAT_LO_EXPR;
2193 }
2194 else
2195 {
2196 c2 = VEC_UNPACK_FLOAT_HI_EXPR;
2197 c1 = VEC_UNPACK_FLOAT_LO_EXPR;
2198 }
2199 break;
2200
2201 case FIX_TRUNC_EXPR:
2202 /* ??? Not yet implemented due to missing VEC_UNPACK_FIX_TRUNC_HI_EXPR/
2203 VEC_UNPACK_FIX_TRUNC_LO_EXPR tree codes and optabs used for
2204 computing the operation. */
2205 return false;
2206
2207 default:
2208 gcc_unreachable ();
2209 }
2210
2211 if (code == FIX_TRUNC_EXPR)
2212 {
2213 /* The signedness is determined from output operand. */
2214 optab1 = optab_for_tree_code (c1, type, optab_default);
2215 optab2 = optab_for_tree_code (c2, type, optab_default);
2216 }
2217 else
2218 {
2219 optab1 = optab_for_tree_code (c1, vectype, optab_default);
2220 optab2 = optab_for_tree_code (c2, vectype, optab_default);
2221 }
2222
2223 if (!optab1 || !optab2)
2224 return false;
2225
2226 vec_mode = TYPE_MODE (vectype);
2227 if ((icode1 = optab_handler (optab1, vec_mode)->insn_code) == CODE_FOR_nothing
2228 || insn_data[icode1].operand[0].mode != TYPE_MODE (wide_vectype)
2229 || (icode2 = optab_handler (optab2, vec_mode)->insn_code)
2230 == CODE_FOR_nothing
2231 || insn_data[icode2].operand[0].mode != TYPE_MODE (wide_vectype))
2232 return false;
2233
2234 *code1 = c1;
2235 *code2 = c2;
2236 return true;
2237 }
2238
2239
2240 /* Function supportable_narrowing_operation
2241
2242 Check whether an operation represented by the code CODE is a
2243 narrowing operation that is supported by the target platform in
2244 vector form (i.e., when operating on arguments of type VECTYPE).
2245
2246 Narrowing operations we currently support are NOP (CONVERT) and
2247 FIX_TRUNC. This function checks if these operations are supported by
2248 the target platform directly via vector tree-codes.
2249
2250 Output:
2251 - CODE1 is the code of a vector operation to be used when
2252 vectorizing the operation, if available. */
2253
2254 bool
2255 supportable_narrowing_operation (enum tree_code code,
2256 const_tree stmt, const_tree vectype,
2257 enum tree_code *code1)
2258 {
2259 enum machine_mode vec_mode;
2260 enum insn_code icode1;
2261 optab optab1;
2262 tree expr = GIMPLE_STMT_OPERAND (stmt, 1);
2263 tree type = TREE_TYPE (expr);
2264 tree narrow_vectype = get_vectype_for_scalar_type (type);
2265 enum tree_code c1;
2266
2267 switch (code)
2268 {
2269 CASE_CONVERT:
2270 c1 = VEC_PACK_TRUNC_EXPR;
2271 break;
2272
2273 case FIX_TRUNC_EXPR:
2274 c1 = VEC_PACK_FIX_TRUNC_EXPR;
2275 break;
2276
2277 case FLOAT_EXPR:
2278 /* ??? Not yet implemented due to missing VEC_PACK_FLOAT_EXPR
2279 tree code and optabs used for computing the operation. */
2280 return false;
2281
2282 default:
2283 gcc_unreachable ();
2284 }
2285
2286 if (code == FIX_TRUNC_EXPR)
2287 /* The signedness is determined from output operand. */
2288 optab1 = optab_for_tree_code (c1, type, optab_default);
2289 else
2290 optab1 = optab_for_tree_code (c1, vectype, optab_default);
2291
2292 if (!optab1)
2293 return false;
2294
2295 vec_mode = TYPE_MODE (vectype);
2296 if ((icode1 = optab_handler (optab1, vec_mode)->insn_code) == CODE_FOR_nothing
2297 || insn_data[icode1].operand[0].mode != TYPE_MODE (narrow_vectype))
2298 return false;
2299
2300 *code1 = c1;
2301 return true;
2302 }
2303
2304
2305 /* Function reduction_code_for_scalar_code
2306
2307 Input:
2308 CODE - tree_code of a reduction operations.
2309
2310 Output:
2311 REDUC_CODE - the corresponding tree-code to be used to reduce the
2312 vector of partial results into a single scalar result (which
2313 will also reside in a vector).
2314
2315 Return TRUE if a corresponding REDUC_CODE was found, FALSE otherwise. */
2316
2317 bool
2318 reduction_code_for_scalar_code (enum tree_code code,
2319 enum tree_code *reduc_code)
2320 {
2321 switch (code)
2322 {
2323 case MAX_EXPR:
2324 *reduc_code = REDUC_MAX_EXPR;
2325 return true;
2326
2327 case MIN_EXPR:
2328 *reduc_code = REDUC_MIN_EXPR;
2329 return true;
2330
2331 case PLUS_EXPR:
2332 *reduc_code = REDUC_PLUS_EXPR;
2333 return true;
2334
2335 default:
2336 return false;
2337 }
2338 }
2339
2340
/* Function vect_is_simple_reduction

   Detect a cross-iteration def-use cycle that represents a simple
   reduction computation.  We look for the following pattern:

   loop_header:
     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   such that:
   1. operation is commutative and associative and it is safe to
      change the order of the computation.
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation.

   Condition 1 is tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   Returns the stmt defining the reduction (with the reduction phi as
   its second operand - swapping if necessary), or NULL_TREE if the
   pattern is not recognized.  */

tree
vect_is_simple_reduction (loop_vec_info loop_info, tree phi)
{
  struct loop *loop = (bb_for_stmt (phi))->loop_father;
  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
  edge latch_e = loop_latch_edge (loop);
  /* The value flowing back around the cycle (a2 in the pattern above).  */
  tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
  tree def_stmt, def1, def2;
  enum tree_code code;
  int op_type;
  tree operation, op1, op2;
  tree type;
  int nloop_uses;
  tree name;
  imm_use_iterator imm_iter;
  use_operand_p use_p;

  /* PHI must belong to the loop being vectorized or to a loop nested
     inside it (outer-loop vectorization).  */
  gcc_assert (loop == vect_loop || flow_loop_nested_p (vect_loop, loop));

  /* The phi result (a1) may have only a single relevant use inside the
     loop - the reduction operation itself (condition 3 above).  */
  name = PHI_RESULT (phi);
  nloop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
    {
      tree use_stmt = USE_STMT (use_p);
      if (flow_bb_inside_loop_p (loop, bb_for_stmt (use_stmt))
	  && vinfo_for_stmt (use_stmt)
	  && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
        nloop_uses++;
      if (nloop_uses > 1)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "reduction used in loop.");
          return NULL_TREE;
        }
    }

  if (TREE_CODE (loop_arg) != SSA_NAME)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: not ssa_name: ");
	  print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
	}
      return NULL_TREE;
    }

  def_stmt = SSA_NAME_DEF_STMT (loop_arg);
  if (!def_stmt)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	fprintf (vect_dump, "reduction: no def_stmt.");
      return NULL_TREE;
    }

  if (TREE_CODE (def_stmt) != GIMPLE_MODIFY_STMT)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
      return NULL_TREE;
    }

  /* Likewise, the reduction result (a2) may have only a single relevant
     use inside the loop (condition 2 above - its real use is outside).  */
  name = GIMPLE_STMT_OPERAND (def_stmt, 0);
  nloop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
    {
      tree use_stmt = USE_STMT (use_p);
      if (flow_bb_inside_loop_p (loop, bb_for_stmt (use_stmt))
	  && vinfo_for_stmt (use_stmt)
	  && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
	nloop_uses++;
      if (nloop_uses > 1)
	{
	  if (vect_print_dump_info (REPORT_DETAILS))
	    fprintf (vect_dump, "reduction used in loop.");
	  return NULL_TREE;
	}
    }

  /* The reduction operation must be a commutative and associative
     binary operation on SSA names (condition 1 above).  */
  operation = GIMPLE_STMT_OPERAND (def_stmt, 1);
  code = TREE_CODE (operation);
  if (!commutative_tree_code (code) || !associative_tree_code (code))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "reduction: not commutative/associative: ");
          print_generic_expr (vect_dump, operation, TDF_SLIM);
        }
      return NULL_TREE;
    }

  op_type = TREE_OPERAND_LENGTH (operation);
  if (op_type != binary_op)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "reduction: not binary operation: ");
          print_generic_expr (vect_dump, operation, TDF_SLIM);
        }
      return NULL_TREE;
    }

  op1 = TREE_OPERAND (operation, 0);
  op2 = TREE_OPERAND (operation, 1);
  if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: uses not ssa_names: ");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return NULL_TREE;
    }

  /* Check that it's ok to change the order of the computation.
     Mixed-type reductions are not handled.  */
  type = TREE_TYPE (operation);
  if (TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op1))
      || TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op2)))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "reduction: multiple types: operation type: ");
          print_generic_expr (vect_dump, type, TDF_SLIM);
          fprintf (vect_dump, ", operands types: ");
          print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
          fprintf (vect_dump, ",");
          print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
        }
      return NULL_TREE;
    }

  /* Generally, when vectorizing a reduction we change the order of the
     computation.  This may change the behavior of the program in some
     cases, so we need to check that this is ok.  One exception is when
     vectorizing an outer-loop: the inner-loop is executed sequentially,
     and therefore vectorizing reductions in the inner-loop during
     outer-loop vectorization is safe.  */

  /* CHECKME: check for !flag_finite_math_only too?  */
  if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
      && !nested_in_vect_loop_p (vect_loop, def_stmt))
    {
      /* Changing the order of operations changes the semantics.  */
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: unsafe fp math optimization: ");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return NULL_TREE;
    }
  else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
	   && !nested_in_vect_loop_p (vect_loop, def_stmt))
    {
      /* Changing the order of operations changes the semantics.  */
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: unsafe int math optimization: ");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return NULL_TREE;
    }
  else if (SAT_FIXED_POINT_TYPE_P (type))
    {
      /* Changing the order of operations changes the semantics.  */
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: unsafe fixed-point math optimization: ");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return NULL_TREE;
    }

  /* reduction is safe.  we're dealing with one of the following:
     1) integer arithmetic and no trapv
     2) floating point arithmetic, and special flags permit this optimization.
   */
  def1 = SSA_NAME_DEF_STMT (op1);
  def2 = SSA_NAME_DEF_STMT (op2);
  if (!def1 || !def2 || IS_EMPTY_STMT (def1) || IS_EMPTY_STMT (def2))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: no defs for operands: ");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return NULL_TREE;
    }


  /* Check that one def is the reduction def, defined by PHI,
     the other def is either defined in the loop ("vect_loop_def"),
     or it's an induction (defined by a loop-header phi-node).  */

  if (def2 == phi
      && flow_bb_inside_loop_p (loop, bb_for_stmt (def1))
      && (TREE_CODE (def1) == GIMPLE_MODIFY_STMT
	  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_induction_def
	  || (TREE_CODE (def1) == PHI_NODE
	      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_loop_def
	      && !is_loop_header_bb_p (bb_for_stmt (def1)))))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "detected reduction:");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return def_stmt;
    }
  else if (def1 == phi
	   && flow_bb_inside_loop_p (loop, bb_for_stmt (def2))
	   && (TREE_CODE (def2) == GIMPLE_MODIFY_STMT
	       || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_induction_def
	       || (TREE_CODE (def2) == PHI_NODE
		   && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_loop_def
		   && !is_loop_header_bb_p (bb_for_stmt (def2)))))
    {
      /* Swap operands (just for simplicity - so that the rest of the code
	 can assume that the reduction variable is always the last (second)
	 argument).  */
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "detected reduction: need to swap operands:");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      swap_tree_operands (def_stmt, &TREE_OPERAND (operation, 0),
			  &TREE_OPERAND (operation, 1));
      return def_stmt;
    }
  else
    {
      if (vect_print_dump_info (REPORT_DETAILS))
	{
	  fprintf (vect_dump, "reduction: unknown pattern.");
	  print_generic_expr (vect_dump, operation, TDF_SLIM);
	}
      return NULL_TREE;
    }
}
2597
2598
2599 /* Function vect_is_simple_iv_evolution.
2600
2601 FORNOW: A simple evolution of an induction variables in the loop is
2602 considered a polynomial evolution with constant step. */
2603
2604 bool
2605 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
2606 tree * step)
2607 {
2608 tree init_expr;
2609 tree step_expr;
2610 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
2611
2612 /* When there is no evolution in this loop, the evolution function
2613 is not "simple". */
2614 if (evolution_part == NULL_TREE)
2615 return false;
2616
2617 /* When the evolution is a polynomial of degree >= 2
2618 the evolution function is not "simple". */
2619 if (tree_is_chrec (evolution_part))
2620 return false;
2621
2622 step_expr = evolution_part;
2623 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
2624
2625 if (vect_print_dump_info (REPORT_DETAILS))
2626 {
2627 fprintf (vect_dump, "step: ");
2628 print_generic_expr (vect_dump, step_expr, TDF_SLIM);
2629 fprintf (vect_dump, ", init: ");
2630 print_generic_expr (vect_dump, init_expr, TDF_SLIM);
2631 }
2632
2633 *init = init_expr;
2634 *step = step_expr;
2635
2636 if (TREE_CODE (step_expr) != INTEGER_CST)
2637 {
2638 if (vect_print_dump_info (REPORT_DETAILS))
2639 fprintf (vect_dump, "step unknown.");
2640 return false;
2641 }
2642
2643 return true;
2644 }
2645
2646
/* Function vectorize_loops.

   Entry Point to loop vectorization phase.  Analyzes every loop in the
   function and transforms those that pass the analysis.  Returns a TODO
   flag requesting CFG cleanup when anything was vectorized.  */

unsigned
vectorize_loops (void)
{
  unsigned int i;
  unsigned int num_vectorized_loops = 0;
  unsigned int vect_loops_num;
  loop_iterator li;
  struct loop *loop;

  vect_loops_num = number_of_loops ();

  /* Bail out if there are no loops.  (The count includes the root of
     the loop tree, hence the comparison against 1.)  */
  if (vect_loops_num <= 1)
    return 0;

  /* Fix the verbosity level if not defined explicitly by the user.  */
  vect_set_dump_settings ();

  /* Allocate the bitmap that records which virtual variables that
     need to be renamed.  */
  vect_memsyms_to_rename = BITMAP_ALLOC (NULL);

  /* ----------- Analyze loops. -----------  */

  /* If some loop was duplicated, it gets bigger number
     than all previously defined loops.  This fact allows us to run
     only over initial loops skipping newly generated ones.  */
  FOR_EACH_LOOP (li, loop, 0)
    {
      loop_vec_info loop_vinfo;

      vect_loop_location = find_loop_location (loop);
      loop_vinfo = vect_analyze_loop (loop);
      /* Stash the analysis result on the loop so it can be destroyed in
	 the finalization walk below, even for unvectorizable loops.  */
      loop->aux = loop_vinfo;

      if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
	continue;

      vect_transform_loop (loop_vinfo);
      num_vectorized_loops++;
    }
  vect_loop_location = UNKNOWN_LOC;

  statistics_counter_event (cfun, "Vectorized loops", num_vectorized_loops);
  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)
      || (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
	  && num_vectorized_loops > 0))
    fprintf (vect_dump, "vectorized %u loops in function.\n",
	     num_vectorized_loops);

  /* ----------- Finalize. -----------  */

  BITMAP_FREE (vect_memsyms_to_rename);

  /* Release the loop_vec_info attached to each analyzed loop.  Start at 1
     to skip the loop-tree root; loops created by the transformation have
     higher numbers and NULL aux, and removed loop slots return NULL.  */
  for (i = 1; i < vect_loops_num; i++)
    {
      loop_vec_info loop_vinfo;

      loop = get_loop (i);
      if (!loop)
	continue;
      loop_vinfo = loop->aux;
      destroy_loop_vec_info (loop_vinfo, true);
      loop->aux = NULL;
    }

  return num_vectorized_loops > 0 ? TODO_cleanup_cfg : 0;
}
2719
2720 /* Increase alignment of global arrays to improve vectorization potential.
2721 TODO:
2722 - Consider also structs that have an array field.
2723 - Use ipa analysis to prune arrays that can't be vectorized?
2724 This should involve global alignment analysis and in the future also
2725 array padding. */
2726
2727 static unsigned int
2728 increase_alignment (void)
2729 {
2730 struct varpool_node *vnode;
2731
2732 /* Increase the alignment of all global arrays for vectorization. */
2733 for (vnode = varpool_nodes_queue;
2734 vnode;
2735 vnode = vnode->next_needed)
2736 {
2737 tree vectype, decl = vnode->decl;
2738 unsigned int alignment;
2739
2740 if (TREE_CODE (TREE_TYPE (decl)) != ARRAY_TYPE)
2741 continue;
2742 vectype = get_vectype_for_scalar_type (TREE_TYPE (TREE_TYPE (decl)));
2743 if (!vectype)
2744 continue;
2745 alignment = TYPE_ALIGN (vectype);
2746 if (DECL_ALIGN (decl) >= alignment)
2747 continue;
2748
2749 if (vect_can_force_dr_alignment_p (decl, alignment))
2750 {
2751 DECL_ALIGN (decl) = TYPE_ALIGN (vectype);
2752 DECL_USER_ALIGN (decl) = 1;
2753 if (dump_file)
2754 {
2755 fprintf (dump_file, "Increasing alignment of decl: ");
2756 print_generic_expr (dump_file, decl, TDF_SLIM);
2757 }
2758 }
2759 }
2760 return 0;
2761 }
2762
2763 static bool
2764 gate_increase_alignment (void)
2765 {
2766 return flag_section_anchors && flag_tree_vectorize;
2767 }
2768
/* Pass descriptor for the simple IPA pass that raises the alignment of
   global arrays ahead of vectorization.  Gated by
   gate_increase_alignment; executes increase_alignment above.  */

struct simple_ipa_opt_pass pass_ipa_increase_alignment =
{
 {
  SIMPLE_IPA_PASS,
  "increase_alignment",			/* name */
  gate_increase_alignment,		/* gate */
  increase_alignment,			/* execute */
  NULL,					/* sub */
  NULL,					/* next */
  0,					/* static_pass_number */
  0,					/* tv_id */
  0,					/* properties_required */
  0,					/* properties_provided */
  0,					/* properties_destroyed */
  0,					/* todo_flags_start */
  0					/* todo_flags_finish */
 }
};