gcc/tree-loop-distribution.c
/* Loop distribution.
   Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011
   Free Software Foundation, Inc.
   Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
   and Sebastian Pop <sebastian.pop@amd.com>.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

/* This pass performs loop distribution: for example, the loop

   |DO I = 2, N
   |  A(I) = B(I) + C
   |  D(I) = A(I-1)*E
   |ENDDO

   is transformed to

   |DOALL I = 2, N
   |  A(I) = B(I) + C
   |ENDDO
   |
   |DOALL I = 2, N
   |  D(I) = A(I-1)*E
   |ENDDO

   This pass uses an RDG, a Reduced Dependence Graph built on top of
   the data dependence relations.  The RDG is then topologically sorted
   to obtain a map of information producers/consumers based on which it
   generates the new loops.  */
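
/* For example, the memset zero pattern, the only builtin pattern this
   pass recognizes for the moment, turns a loop like

   |for (i = 0; i < N; i++)
   |  A[i] = 0;

   into a single call equivalent to memset (A, 0, N * sizeof (A[0]));
   see generate_memset_zero below.  */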

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tree-flow.h"
#include "cfgloop.h"
#include "tree-chrec.h"
#include "tree-data-ref.h"
#include "tree-scalar-evolution.h"
#include "tree-pass.h"

/* If bit I is not set, it means that this node represents an
   operation that has already been performed, and that should not be
   performed again.  This is the subgraph of remaining important
   computations that is passed to the DFS algorithm, to avoid
   including the same stores several times in different loops.  */
static bitmap remaining_stmts;

/* A node of the RDG is marked in this bitmap when it has as a
   predecessor a node that writes to memory.  */
static bitmap upstream_mem_writes;

/* Returns true when DEF is an SSA_NAME defined in LOOP and used after
   the LOOP.  */

static bool
ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
{
  imm_use_iterator imm_iter;
  use_operand_p use_p;

  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
    if (loop != loop_containing_stmt (USE_STMT (use_p)))
      return true;

  return false;
}

/* Returns true when STMT defines a scalar variable used after the
   loop.  */

static bool
stmt_has_scalar_dependences_outside_loop (gimple stmt)
{
  tree name;

  switch (gimple_code (stmt))
    {
    case GIMPLE_CALL:
    case GIMPLE_ASSIGN:
      name = gimple_get_lhs (stmt);
      break;

    case GIMPLE_PHI:
      name = gimple_phi_result (stmt);
      break;

    default:
      return false;
    }

  return (name
          && TREE_CODE (name) == SSA_NAME
          && ssa_name_has_uses_outside_loop_p (name,
                                               loop_containing_stmt (stmt)));
}
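
/* For example, in

   |for (i = 0; i < N; i++)
   |  s = s + A[i];
   |use (s);

   the statement computing S defines a scalar used after the loop: its
   final value is needed once the loop exits.  */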

/* Update the PHI nodes of NEW_LOOP.  NEW_LOOP is a duplicate of
   ORIG_LOOP.  */

static void
update_phis_for_loop_copy (struct loop *orig_loop, struct loop *new_loop)
{
  tree new_ssa_name;
  gimple_stmt_iterator si_new, si_orig;
  edge orig_loop_latch = loop_latch_edge (orig_loop);
  edge orig_entry_e = loop_preheader_edge (orig_loop);
  edge new_loop_entry_e = loop_preheader_edge (new_loop);

  /* Scan the phis in the headers of the old and new loops
     (they are organized in exactly the same order).  */
  for (si_new = gsi_start_phis (new_loop->header),
       si_orig = gsi_start_phis (orig_loop->header);
       !gsi_end_p (si_new) && !gsi_end_p (si_orig);
       gsi_next (&si_new), gsi_next (&si_orig))
    {
      tree def;
      source_location locus;
      gimple phi_new = gsi_stmt (si_new);
      gimple phi_orig = gsi_stmt (si_orig);

      /* Add the first phi argument for the phi in NEW_LOOP (the one
         associated with the entry of NEW_LOOP).  */
      def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_entry_e);
      locus = gimple_phi_arg_location_from_edge (phi_orig, orig_entry_e);
      add_phi_arg (phi_new, def, new_loop_entry_e, locus);

      /* Add the second phi argument for the phi in NEW_LOOP (the one
         associated with the latch of NEW_LOOP).  */
      def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_loop_latch);
      locus = gimple_phi_arg_location_from_edge (phi_orig, orig_loop_latch);

      if (TREE_CODE (def) == SSA_NAME)
        {
          new_ssa_name = get_current_def (def);

          if (!new_ssa_name)
            /* This only happens if there are no definitions inside the
               loop.  Use the invariant in the new loop as is.  */
            new_ssa_name = def;
        }
      else
        /* Could be an integer.  */
        new_ssa_name = def;

      add_phi_arg (phi_new, new_ssa_name, loop_latch_edge (new_loop), locus);
    }
}

/* Return a copy of LOOP placed before LOOP.  */

static struct loop *
copy_loop_before (struct loop *loop)
{
  struct loop *res;
  edge preheader = loop_preheader_edge (loop);

  if (!single_exit (loop))
    return NULL;

  initialize_original_copy_tables ();
  res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, preheader);
  free_original_copy_tables ();

  if (!res)
    return NULL;

  update_phis_for_loop_copy (loop, res);
  rename_variables_in_loop (res);

  return res;
}

/* Creates an empty basic block after LOOP.  */

static void
create_bb_after_loop (struct loop *loop)
{
  edge exit = single_exit (loop);

  if (!exit)
    return;

  split_edge (exit);
}

/* Generate code for PARTITION from the code in LOOP.  The loop is
   copied when COPY_P is true.  All the statements not flagged in the
   PARTITION bitmap are removed from the loop or from its copy.  The
   statements are indexed in sequence inside a basic block, and the
   basic blocks of a loop are taken in dom order.  Returns true when
   the code gen succeeded.  */

static bool
generate_loops_for_partition (struct loop *loop, bitmap partition, bool copy_p)
{
  unsigned i, x;
  gimple_stmt_iterator bsi;
  basic_block *bbs;

  if (copy_p)
    {
      loop = copy_loop_before (loop);
      create_preheader (loop, CP_SIMPLE_PREHEADERS);
      create_bb_after_loop (loop);
    }

  if (loop == NULL)
    return false;

  /* Remove stmts not in the PARTITION bitmap.  The order in which we
     visit the phi nodes and the statements is exactly as in
     stmts_from_loop.  */
  bbs = get_loop_body_in_dom_order (loop);

  if (MAY_HAVE_DEBUG_STMTS)
    for (x = 0, i = 0; i < loop->num_nodes; i++)
      {
        basic_block bb = bbs[i];

        for (bsi = gsi_start_phis (bb); !gsi_end_p (bsi); gsi_next (&bsi))
          if (!bitmap_bit_p (partition, x++))
            reset_debug_uses (gsi_stmt (bsi));

        for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
          {
            gimple stmt = gsi_stmt (bsi);
            if (gimple_code (stmt) != GIMPLE_LABEL
                && !is_gimple_debug (stmt)
                && !bitmap_bit_p (partition, x++))
              reset_debug_uses (stmt);
          }
      }

  for (x = 0, i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      for (bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
        if (!bitmap_bit_p (partition, x++))
          {
            gimple phi = gsi_stmt (bsi);
            if (!is_gimple_reg (gimple_phi_result (phi)))
              mark_virtual_phi_result_for_renaming (phi);
            remove_phi_node (&bsi, true);
          }
        else
          gsi_next (&bsi);

      for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
        {
          gimple stmt = gsi_stmt (bsi);
          if (gimple_code (stmt) != GIMPLE_LABEL
              && !is_gimple_debug (stmt)
              && !bitmap_bit_p (partition, x++))
            {
              unlink_stmt_vdef (stmt);
              gsi_remove (&bsi, true);
              release_defs (stmt);
            }
          else
            gsi_next (&bsi);
        }
    }

  free (bbs);
  return true;
}

/* Build the size argument for a memset call.  */

static inline tree
build_size_arg_loc (location_t loc, tree nb_iter, tree op,
                    gimple_seq *stmt_list)
{
  gimple_seq stmts;
  tree x = fold_build2_loc (loc, MULT_EXPR, size_type_node,
                            fold_convert_loc (loc, size_type_node, nb_iter),
                            fold_convert_loc (loc, size_type_node,
                                              TYPE_SIZE_UNIT (TREE_TYPE (op))));
  x = force_gimple_operand (x, &stmts, true, NULL);
  gimple_seq_add_seq (stmt_list, stmts);

  return x;
}
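
/* For example, when OP accesses an array of 4-byte integers and the
   loop runs NB_ITER times, the expression built above computes
   (size_t) NB_ITER * 4, the number of bytes passed to memset.  */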

/* Generate a call to memset that zeroes the memory written to by
   STMT: OP0 is the stored reference and NB_ITER the number of
   iterations of the loop.  */

static void
generate_memset_zero (gimple stmt, tree op0, tree nb_iter,
                      gimple_stmt_iterator bsi)
{
  tree addr_base, nb_bytes;
  bool res = false;
  gimple_seq stmt_list = NULL, stmts;
  gimple fn_call;
  tree mem, fn;
  struct data_reference *dr = XCNEW (struct data_reference);
  location_t loc = gimple_location (stmt);

  DR_STMT (dr) = stmt;
  DR_REF (dr) = op0;
  res = dr_analyze_innermost (dr, loop_containing_stmt (stmt));
  gcc_assert (res && stride_of_unit_type_p (DR_STEP (dr), TREE_TYPE (op0)));

  nb_bytes = build_size_arg_loc (loc, nb_iter, op0, &stmt_list);
  addr_base = size_binop_loc (loc, PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr));
  addr_base = fold_convert_loc (loc, sizetype, addr_base);

  /* Test for a negative stride, iterating over every element.  */
  if (tree_int_cst_sgn (DR_STEP (dr)) == -1)
    {
      addr_base = size_binop_loc (loc, MINUS_EXPR, addr_base,
                                  fold_convert_loc (loc, sizetype, nb_bytes));
      addr_base = size_binop_loc (loc, PLUS_EXPR, addr_base,
                                  TYPE_SIZE_UNIT (TREE_TYPE (op0)));
    }

  addr_base = fold_build_pointer_plus_loc (loc,
                                           DR_BASE_ADDRESS (dr), addr_base);
  mem = force_gimple_operand (addr_base, &stmts, true, NULL);
  gimple_seq_add_seq (&stmt_list, stmts);

  fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
  fn_call = gimple_build_call (fn, 3, mem, integer_zero_node, nb_bytes);
  gimple_seq_add_stmt (&stmt_list, fn_call);
  gsi_insert_seq_after (&bsi, stmt_list, GSI_CONTINUE_LINKING);

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "generated memset zero\n");

  free_data_ref (dr);
}
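
/* To illustrate the negative stride adjustment above: for a loop
   zeroing A[N-1] down to A[0], DR_OFFSET + DR_INIT points at the first
   access, A[N-1].  Subtracting NB_BYTES (N elements) and adding back
   one element size yields &A[0], the lowest address of the region,
   which is what memset expects.  */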

/* Tries to generate a builtin function for the instructions of LOOP
   pointed to by the bits set in PARTITION.  Returns true when the
   operation succeeded.  */

static bool
generate_builtin (struct loop *loop, bitmap partition, bool copy_p)
{
  bool res = false;
  unsigned i, x = 0;
  basic_block *bbs;
  gimple write = NULL;
  gimple_stmt_iterator bsi;
  tree nb_iter = number_of_exit_cond_executions (loop);

  if (!nb_iter || nb_iter == chrec_dont_know)
    return false;

  bbs = get_loop_body_in_dom_order (loop);

  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      for (bsi = gsi_start_phis (bb); !gsi_end_p (bsi); gsi_next (&bsi))
        x++;

      for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
        {
          gimple stmt = gsi_stmt (bsi);

          if (gimple_code (stmt) == GIMPLE_LABEL
              || is_gimple_debug (stmt))
            continue;

          if (!bitmap_bit_p (partition, x++))
            continue;

          /* If the stmt has uses outside of the loop, fail.  */
          if (stmt_has_scalar_dependences_outside_loop (stmt))
            goto end;

          if (is_gimple_assign (stmt)
              && !is_gimple_reg (gimple_assign_lhs (stmt)))
            {
              /* Don't generate the builtins when there is more than
                 one memory write.  */
              if (write != NULL)
                goto end;

              write = stmt;
              if (bb == loop->latch)
                nb_iter = number_of_latch_executions (loop);
            }
        }
    }

  if (!stmt_with_adjacent_zero_store_dr_p (write))
    goto end;

  /* The new statements will be placed before LOOP.  */
  bsi = gsi_last_bb (loop_preheader_edge (loop)->src);
  generate_memset_zero (write, gimple_assign_lhs (write), nb_iter, bsi);
  res = true;

  /* If this is the last partition for which we generate code, we have
     to destroy the loop.  */
  if (!copy_p)
    {
      unsigned nbbs = loop->num_nodes;
      edge exit = single_exit (loop);
      basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
      redirect_edge_pred (exit, src);
      exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
      exit->flags |= EDGE_FALLTHRU;
      cancel_loop_tree (loop);
      rescan_loop_exit (exit, false, true);

      for (i = 0; i < nbbs; i++)
        delete_basic_block (bbs[i]);

      set_immediate_dominator (CDI_DOMINATORS, dest,
                               recompute_dominator (CDI_DOMINATORS, dest));
    }

 end:
  free (bbs);
  return res;
}

/* Generates code for PARTITION.  For simple loops, this function can
   generate a built-in.  */

static bool
generate_code_for_partition (struct loop *loop, bitmap partition, bool copy_p)
{
  if (generate_builtin (loop, partition, copy_p))
    return true;

  return generate_loops_for_partition (loop, partition, copy_p);
}


/* Returns true if the node V of RDG cannot be recomputed.  */

static bool
rdg_cannot_recompute_vertex_p (struct graph *rdg, int v)
{
  if (RDG_MEM_WRITE_STMT (rdg, v))
    return true;

  return false;
}

/* Returns true when the vertex V has already been generated in the
   current partition (V is in PROCESSED), or when V belongs to another
   partition and cannot be recomputed (V is not in REMAINING_STMTS).  */

static inline bool
already_processed_vertex_p (bitmap processed, int v)
{
  return (bitmap_bit_p (processed, v)
          || !bitmap_bit_p (remaining_stmts, v));
}

/* Returns NULL when there is no anti-dependence among the successors
   of vertex V, otherwise returns the edge with the anti-dependence.  */

static struct graph_edge *
has_anti_dependence (struct vertex *v)
{
  struct graph_edge *e;

  if (v->succ)
    for (e = v->succ; e; e = e->succ_next)
      if (RDGE_TYPE (e) == anti_dd)
        return e;

  return NULL;
}
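
/* An anti-dependence is a read followed by a write to the same
   location, as in

   |x = A[i];
   |A[i] = y;

   where the store may not be moved before the load: this is why both
   endpoints of such an edge are kept in the same partition.  */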

/* Returns true when one of the predecessors of V is marked as having
   upstream memory writes, not counting predecessors that are
   themselves memory writes (flow channels).  */

static bool
predecessor_has_mem_write (struct graph *rdg, struct vertex *v)
{
  struct graph_edge *e;

  if (v->pred)
    for (e = v->pred; e; e = e->pred_next)
      if (bitmap_bit_p (upstream_mem_writes, e->src)
          /* Don't consider flow channels: a write to memory followed
             by a read from memory.  These channels allow the split of
             the RDG in different partitions.  */
          && !RDG_MEM_WRITE_STMT (rdg, e->src))
        return true;

  return false;
}

/* Initializes the UPSTREAM_MEM_WRITES bitmap following the
   information from RDG.  */

static void
mark_nodes_having_upstream_mem_writes (struct graph *rdg)
{
  int v, x;
  bitmap seen = BITMAP_ALLOC (NULL);

  for (v = rdg->n_vertices - 1; v >= 0; v--)
    if (!bitmap_bit_p (seen, v))
      {
        unsigned i;
        VEC (int, heap) *nodes = VEC_alloc (int, heap, 3);

        graphds_dfs (rdg, &v, 1, &nodes, false, NULL);

        FOR_EACH_VEC_ELT (int, nodes, i, x)
          {
            if (!bitmap_set_bit (seen, x))
              continue;

            if (RDG_MEM_WRITE_STMT (rdg, x)
                || predecessor_has_mem_write (rdg, &(rdg->vertices[x]))
                /* In anti-dependences the read should occur before
                   the write; this is why both the read and the write
                   should be placed in the same partition.  */
                || has_anti_dependence (&(rdg->vertices[x])))
              {
                bitmap_set_bit (upstream_mem_writes, x);
              }
          }

        VEC_free (int, heap, nodes);
      }
}

/* Returns true when vertex U has a memory write node as a predecessor
   in RDG.  */

static bool
has_upstream_mem_writes (int u)
{
  return bitmap_bit_p (upstream_mem_writes, u);
}

static void rdg_flag_vertex_and_dependent (struct graph *, int, bitmap, bitmap,
                                           bitmap, bool *);

/* Flag the uses of U, stopping the traversal according to the
   information in UPSTREAM_MEM_WRITES.  */

static void
rdg_flag_uses (struct graph *rdg, int u, bitmap partition, bitmap loops,
               bitmap processed, bool *part_has_writes)
{
  use_operand_p use_p;
  struct vertex *x = &(rdg->vertices[u]);
  gimple stmt = RDGV_STMT (x);
  struct graph_edge *anti_dep = has_anti_dependence (x);

  /* Keep in the same partition the destination of an antidependence,
     because this is a store to the exact same location.  Putting this
     in another partition is bad for cache locality.  */
  if (anti_dep)
    {
      int v = anti_dep->dest;

      if (!already_processed_vertex_p (processed, v))
        rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
                                       processed, part_has_writes);
    }

  if (gimple_code (stmt) != GIMPLE_PHI)
    {
      if ((use_p = gimple_vuse_op (stmt)) != NULL_USE_OPERAND_P)
        {
          tree use = USE_FROM_PTR (use_p);

          if (TREE_CODE (use) == SSA_NAME)
            {
              gimple def_stmt = SSA_NAME_DEF_STMT (use);
              int v = rdg_vertex_for_stmt (rdg, def_stmt);

              if (v >= 0
                  && !already_processed_vertex_p (processed, v))
                rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
                                               processed, part_has_writes);
            }
        }
    }

  if (is_gimple_assign (stmt) && has_upstream_mem_writes (u))
    {
      tree op0 = gimple_assign_lhs (stmt);

      /* Scalar channels don't have enough space for transmitting data
         between tasks, unless we add more storage by privatizing.  */
      if (is_gimple_reg (op0))
        {
          use_operand_p use_p;
          imm_use_iterator iter;

          FOR_EACH_IMM_USE_FAST (use_p, iter, op0)
            {
              int v = rdg_vertex_for_stmt (rdg, USE_STMT (use_p));

              if (!already_processed_vertex_p (processed, v))
                rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
                                               processed, part_has_writes);
            }
        }
    }
}

/* Flag V from RDG as part of PARTITION, and also flag its loop number
   in LOOPS.  */

static void
rdg_flag_vertex (struct graph *rdg, int v, bitmap partition, bitmap loops,
                 bool *part_has_writes)
{
  struct loop *loop;

  if (!bitmap_set_bit (partition, v))
    return;

  loop = loop_containing_stmt (RDG_STMT (rdg, v));
  bitmap_set_bit (loops, loop->num);

  if (rdg_cannot_recompute_vertex_p (rdg, v))
    {
      *part_has_writes = true;
      bitmap_clear_bit (remaining_stmts, v);
    }
}

/* Flag in the bitmap PARTITION the vertex V and all its predecessors.
   Also flag their loop number in LOOPS.  */

static void
rdg_flag_vertex_and_dependent (struct graph *rdg, int v, bitmap partition,
                               bitmap loops, bitmap processed,
                               bool *part_has_writes)
{
  unsigned i;
  VEC (int, heap) *nodes = VEC_alloc (int, heap, 3);
  int x;

  bitmap_set_bit (processed, v);
  rdg_flag_uses (rdg, v, partition, loops, processed, part_has_writes);
  graphds_dfs (rdg, &v, 1, &nodes, false, remaining_stmts);
  rdg_flag_vertex (rdg, v, partition, loops, part_has_writes);

  FOR_EACH_VEC_ELT (int, nodes, i, x)
    if (!already_processed_vertex_p (processed, x))
      rdg_flag_vertex_and_dependent (rdg, x, partition, loops, processed,
                                     part_has_writes);

  VEC_free (int, heap, nodes);
}

/* Initialize CONDS with all the condition statements from the basic
   blocks of LOOP.  */

static void
collect_condition_stmts (struct loop *loop, VEC (gimple, heap) **conds)
{
  unsigned i;
  edge e;
  VEC (edge, heap) *exits = get_loop_exit_edges (loop);

  FOR_EACH_VEC_ELT (edge, exits, i, e)
    {
      gimple cond = last_stmt (e->src);

      if (cond)
        VEC_safe_push (gimple, heap, *conds, cond);
    }

  VEC_free (edge, heap, exits);
}
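
/* For a typical single-exit loop whose exit test is "if (i < N)", the
   GIMPLE_COND ending the source block of the exit edge is what gets
   pushed onto CONDS.  */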

/* Add to PARTITION all the exit condition statements for LOOPS
   together with all their dependent statements determined from
   RDG.  */

static void
rdg_flag_loop_exits (struct graph *rdg, bitmap loops, bitmap partition,
                     bitmap processed, bool *part_has_writes)
{
  unsigned i;
  bitmap_iterator bi;
  VEC (gimple, heap) *conds = VEC_alloc (gimple, heap, 3);

  EXECUTE_IF_SET_IN_BITMAP (loops, 0, i, bi)
    collect_condition_stmts (get_loop (i), &conds);

  while (!VEC_empty (gimple, conds))
    {
      gimple cond = VEC_pop (gimple, conds);
      int v = rdg_vertex_for_stmt (rdg, cond);
      bitmap new_loops = BITMAP_ALLOC (NULL);

      if (!already_processed_vertex_p (processed, v))
        rdg_flag_vertex_and_dependent (rdg, v, partition, new_loops, processed,
                                       part_has_writes);

      EXECUTE_IF_SET_IN_BITMAP (new_loops, 0, i, bi)
        if (bitmap_set_bit (loops, i))
          collect_condition_stmts (get_loop (i), &conds);

      BITMAP_FREE (new_loops);
    }

  VEC_free (gimple, heap, conds);
}

/* Returns a bitmap in which all the statements needed for computing
   the strongly connected component C of the RDG are flagged, also
   including the loop exit conditions.  */

static bitmap
build_rdg_partition_for_component (struct graph *rdg, rdgc c,
                                   bool *part_has_writes)
{
  int i, v;
  bitmap partition = BITMAP_ALLOC (NULL);
  bitmap loops = BITMAP_ALLOC (NULL);
  bitmap processed = BITMAP_ALLOC (NULL);

  FOR_EACH_VEC_ELT (int, c->vertices, i, v)
    if (!already_processed_vertex_p (processed, v))
      rdg_flag_vertex_and_dependent (rdg, v, partition, loops, processed,
                                     part_has_writes);

  rdg_flag_loop_exits (rdg, loops, partition, processed, part_has_writes);

  BITMAP_FREE (processed);
  BITMAP_FREE (loops);
  return partition;
}

/* Free memory for COMPONENTS.  */

static void
free_rdg_components (VEC (rdgc, heap) *components)
{
  int i;
  rdgc x;

  FOR_EACH_VEC_ELT (rdgc, components, i, x)
    {
      VEC_free (int, heap, x->vertices);
      free (x);
    }

  VEC_free (rdgc, heap, components);
}

/* Build the COMPONENTS vector with the strongly connected components
   of RDG in which the STARTING_VERTICES occur.  */

static void
rdg_build_components (struct graph *rdg, VEC (int, heap) *starting_vertices,
                      VEC (rdgc, heap) **components)
{
  int i, v;
  bitmap saved_components = BITMAP_ALLOC (NULL);
  int n_components = graphds_scc (rdg, NULL);
  VEC (int, heap) **all_components = XNEWVEC (VEC (int, heap) *, n_components);

  for (i = 0; i < n_components; i++)
    all_components[i] = VEC_alloc (int, heap, 3);

  for (i = 0; i < rdg->n_vertices; i++)
    VEC_safe_push (int, heap, all_components[rdg->vertices[i].component], i);

  FOR_EACH_VEC_ELT (int, starting_vertices, i, v)
    {
      int c = rdg->vertices[v].component;

      if (bitmap_set_bit (saved_components, c))
        {
          rdgc x = XCNEW (struct rdg_component);
          x->num = c;
          x->vertices = all_components[c];

          VEC_safe_push (rdgc, heap, *components, x);
        }
    }

  for (i = 0; i < n_components; i++)
    if (!bitmap_bit_p (saved_components, i))
      VEC_free (int, heap, all_components[i]);

  free (all_components);
  BITMAP_FREE (saved_components);
}

/* Returns true when it is possible to generate a builtin pattern for
   the PARTITION of RDG.  For the moment we detect only the memset
   zero pattern.  */

static bool
can_generate_builtin (struct graph *rdg, bitmap partition)
{
  unsigned i;
  bitmap_iterator bi;
  int nb_reads = 0;
  int nb_writes = 0;
  int stores_zero = 0;

  EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, bi)
    if (RDG_MEM_READS_STMT (rdg, i))
      nb_reads++;
    else if (RDG_MEM_WRITE_STMT (rdg, i))
      {
        nb_writes++;
        if (stmt_with_adjacent_zero_store_dr_p (RDG_STMT (rdg, i)))
          stores_zero++;
      }

  return stores_zero == 1 && nb_writes == 1 && nb_reads == 0;
}
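
/* For example, a partition containing only the store "A[i] = 0" has
   NB_WRITES == 1, NB_READS == 0 and STORES_ZERO == 1, and thus
   qualifies for the memset zero builtin.  */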

/* Returns true when PARTITION1 and PARTITION2 have similar memory
   accesses in RDG.  */

static bool
similar_memory_accesses (struct graph *rdg, bitmap partition1,
                         bitmap partition2)
{
  unsigned i, j;
  bitmap_iterator bi, bj;

  EXECUTE_IF_SET_IN_BITMAP (partition1, 0, i, bi)
    if (RDG_MEM_WRITE_STMT (rdg, i)
        || RDG_MEM_READS_STMT (rdg, i))
      EXECUTE_IF_SET_IN_BITMAP (partition2, 0, j, bj)
        if (RDG_MEM_WRITE_STMT (rdg, j)
            || RDG_MEM_READS_STMT (rdg, j))
          if (rdg_has_similar_memory_accesses (rdg, i, j))
            return true;

  return false;
}

/* Fuse all the partitions from PARTITIONS that contain similar memory
   references, i.e., we're taking care of cache locality.  This
   function does not fuse those partitions that contain patterns that
   can be code generated with builtins.  */

static void
fuse_partitions_with_similar_memory_accesses (struct graph *rdg,
                                              VEC (bitmap, heap) **partitions)
{
  int p1, p2;
  bitmap partition1, partition2;

  FOR_EACH_VEC_ELT (bitmap, *partitions, p1, partition1)
    if (!can_generate_builtin (rdg, partition1))
      FOR_EACH_VEC_ELT (bitmap, *partitions, p2, partition2)
        if (p1 != p2
            && !can_generate_builtin (rdg, partition2)
            && similar_memory_accesses (rdg, partition1, partition2))
          {
            bitmap_ior_into (partition1, partition2);
            VEC_ordered_remove (bitmap, *partitions, p2);
            p2--;
          }
}

/* Returns true when STMT will be code generated in a partition of RDG
   different from PART and that will not be code generated as a
   builtin.  */

static bool
stmt_generated_in_another_partition (struct graph *rdg, gimple stmt, int part,
                                     VEC (bitmap, heap) *partitions)
{
  int p;
  bitmap pp;
  unsigned i;
  bitmap_iterator bi;

  FOR_EACH_VEC_ELT (bitmap, partitions, p, pp)
    if (p != part
        && !can_generate_builtin (rdg, pp))
      EXECUTE_IF_SET_IN_BITMAP (pp, 0, i, bi)
        if (stmt == RDG_STMT (rdg, i))
          return true;

  return false;
}

/* For each partition in PARTITIONS that will be code generated using
   a builtin, add its scalar computations used after the loop to
   PARTITION.  */

static void
add_scalar_computations_to_partition (struct graph *rdg,
                                      VEC (bitmap, heap) *partitions,
                                      bitmap partition)
{
  int p;
  bitmap pp;
  unsigned i;
  bitmap_iterator bi;
  bitmap l = BITMAP_ALLOC (NULL);
  bitmap pr = BITMAP_ALLOC (NULL);
  bool f = false;

  FOR_EACH_VEC_ELT (bitmap, partitions, p, pp)
    if (can_generate_builtin (rdg, pp))
      EXECUTE_IF_SET_IN_BITMAP (pp, 0, i, bi)
        if (stmt_has_scalar_dependences_outside_loop (RDG_STMT (rdg, i))
            && !stmt_generated_in_another_partition (rdg, RDG_STMT (rdg, i), p,
                                                     partitions))
          rdg_flag_vertex_and_dependent (rdg, i, partition, l, pr, &f);

  rdg_flag_loop_exits (rdg, l, partition, pr, &f);

  BITMAP_FREE (pr);
  BITMAP_FREE (l);
}

/* Aggregate several components into a useful partition that is
   registered in the PARTITIONS vector.  Partitions will be
   distributed in different loops.  */

static void
rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components,
                      VEC (int, heap) **other_stores,
                      VEC (bitmap, heap) **partitions, bitmap processed)
{
  int i;
  rdgc x;
  bitmap partition = BITMAP_ALLOC (NULL);

  FOR_EACH_VEC_ELT (rdgc, components, i, x)
    {
      bitmap np;
      bool part_has_writes = false;
      int v = VEC_index (int, x->vertices, 0);

      if (bitmap_bit_p (processed, v))
        continue;

      np = build_rdg_partition_for_component (rdg, x, &part_has_writes);
      bitmap_ior_into (partition, np);
      bitmap_ior_into (processed, np);
      BITMAP_FREE (np);

      if (part_has_writes)
        {
          if (dump_file && (dump_flags & TDF_DETAILS))
            {
              fprintf (dump_file, "ldist useful partition:\n");
              dump_bitmap (dump_file, partition);
            }

          VEC_safe_push (bitmap, heap, *partitions, partition);
          partition = BITMAP_ALLOC (NULL);
        }
    }

  /* Add the nodes from the RDG that were not marked as processed, and
     that are used outside the current loop.  These are scalar
     computations that are not yet part of previous partitions.  */
  for (i = 0; i < rdg->n_vertices; i++)
    if (!bitmap_bit_p (processed, i)
        && rdg_defs_used_in_other_loops_p (rdg, i))
      VEC_safe_push (int, heap, *other_stores, i);

  /* If there are still statements left in the OTHER_STORES array,
     create other components and partitions with these stores and
     their dependences.  */
  if (VEC_length (int, *other_stores) > 0)
    {
      VEC (rdgc, heap) *comps = VEC_alloc (rdgc, heap, 3);
      VEC (int, heap) *foo = VEC_alloc (int, heap, 3);

      rdg_build_components (rdg, *other_stores, &comps);
      rdg_build_partitions (rdg, comps, &foo, partitions, processed);

      VEC_free (int, heap, foo);
      free_rdg_components (comps);
    }

  add_scalar_computations_to_partition (rdg, *partitions, partition);

  /* If there is something left in the last partition, save it.  */
  if (bitmap_count_bits (partition) > 0)
    VEC_safe_push (bitmap, heap, *partitions, partition);
  else
    BITMAP_FREE (partition);

  fuse_partitions_with_similar_memory_accesses (rdg, partitions);
}

/* Dump to FILE the PARTITIONS.  */

static void
dump_rdg_partitions (FILE *file, VEC (bitmap, heap) *partitions)
{
  int i;
  bitmap partition;

  FOR_EACH_VEC_ELT (bitmap, partitions, i, partition)
    debug_bitmap_file (file, partition);
}

/* Debug PARTITIONS.  */
extern void debug_rdg_partitions (VEC (bitmap, heap) *);

DEBUG_FUNCTION void
debug_rdg_partitions (VEC (bitmap, heap) *partitions)
{
  dump_rdg_partitions (stderr, partitions);
}

/* Returns the number of read and write operations in the RDG.  */

static int
number_of_rw_in_rdg (struct graph *rdg)
{
  int i, res = 0;

  for (i = 0; i < rdg->n_vertices; i++)
    {
      if (RDG_MEM_WRITE_STMT (rdg, i))
        ++res;

      if (RDG_MEM_READS_STMT (rdg, i))
        ++res;
    }

  return res;
}

/* Returns the number of read and write operations in a PARTITION of
   the RDG.  */

static int
number_of_rw_in_partition (struct graph *rdg, bitmap partition)
{
  int res = 0;
  unsigned i;
  bitmap_iterator ii;

  EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, ii)
    {
      if (RDG_MEM_WRITE_STMT (rdg, i))
        ++res;

      if (RDG_MEM_READS_STMT (rdg, i))
        ++res;
    }

  return res;
}

/* Returns true when one of the PARTITIONS contains all the read or
   write operations of RDG.  */

static bool
partition_contains_all_rw (struct graph *rdg, VEC (bitmap, heap) *partitions)
{
  int i;
  bitmap partition;
  int nrw = number_of_rw_in_rdg (rdg);

  FOR_EACH_VEC_ELT (bitmap, partitions, i, partition)
    if (nrw == number_of_rw_in_partition (rdg, partition))
      return true;

  return false;
}
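
/* When one partition contains all the reads and writes, distribution
   is pointless: one of the generated loops would be identical to the
   original, which is why ldist_gen gives up in that case.  */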

/* Generate code from STARTING_VERTICES in RDG.  Returns the number of
   distributed loops.  */

static int
ldist_gen (struct loop *loop, struct graph *rdg,
           VEC (int, heap) *starting_vertices)
{
  int i, nbp;
  VEC (rdgc, heap) *components = VEC_alloc (rdgc, heap, 3);
  VEC (bitmap, heap) *partitions = VEC_alloc (bitmap, heap, 3);
  VEC (int, heap) *other_stores = VEC_alloc (int, heap, 3);
  bitmap partition, processed = BITMAP_ALLOC (NULL);

  remaining_stmts = BITMAP_ALLOC (NULL);
  upstream_mem_writes = BITMAP_ALLOC (NULL);

  for (i = 0; i < rdg->n_vertices; i++)
    {
      bitmap_set_bit (remaining_stmts, i);

      /* Save in OTHER_STORES all the memory writes that are not in
         STARTING_VERTICES.  */
      if (RDG_MEM_WRITE_STMT (rdg, i))
        {
          int v;
          unsigned j;
          bool found = false;

          FOR_EACH_VEC_ELT (int, starting_vertices, j, v)
            if (i == v)
              {
                found = true;
                break;
              }

          if (!found)
            VEC_safe_push (int, heap, other_stores, i);
        }
    }

  mark_nodes_having_upstream_mem_writes (rdg);
  rdg_build_components (rdg, starting_vertices, &components);
  rdg_build_partitions (rdg, components, &other_stores, &partitions,
                        processed);
  BITMAP_FREE (processed);
  nbp = VEC_length (bitmap, partitions);

  if (nbp <= 1
      || partition_contains_all_rw (rdg, partitions))
    goto ldist_done;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_rdg_partitions (dump_file, partitions);

  FOR_EACH_VEC_ELT (bitmap, partitions, i, partition)
    if (!generate_code_for_partition (loop, partition, i < nbp - 1))
      goto ldist_done;

  rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
  mark_sym_for_renaming (gimple_vop (cfun));
  update_ssa (TODO_update_ssa_only_virtuals);

 ldist_done:

  BITMAP_FREE (remaining_stmts);
  BITMAP_FREE (upstream_mem_writes);

  FOR_EACH_VEC_ELT (bitmap, partitions, i, partition)
    BITMAP_FREE (partition);

  VEC_free (int, heap, other_stores);
  VEC_free (bitmap, heap, partitions);
  free_rdg_components (components);
  return nbp;
}

/* Distributes the code from LOOP in such a way that producer
   statements are placed before consumer statements.  When STMTS is
   NULL, performs the maximal distribution; when STMTS is not NULL,
   tries to separate only these statements from the LOOP's body.
   Returns the number of distributed loops.  */

static int
distribute_loop (struct loop *loop, VEC (gimple, heap) *stmts)
{
  int res = 0;
  struct graph *rdg;
  gimple s;
  unsigned i;
  VEC (int, heap) *vertices;
  VEC (ddr_p, heap) *dependence_relations;
  VEC (data_reference_p, heap) *datarefs;
  VEC (loop_p, heap) *loop_nest;

  if (loop->num_nodes > 2)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file,
                 "FIXME: Loop %d not distributed: it has more than two basic blocks.\n",
                 loop->num);

      return res;
    }

  datarefs = VEC_alloc (data_reference_p, heap, 10);
  dependence_relations = VEC_alloc (ddr_p, heap, 100);
  loop_nest = VEC_alloc (loop_p, heap, 3);
  rdg = build_rdg (loop, &loop_nest, &dependence_relations, &datarefs);

  if (!rdg)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file,
                 "FIXME: Loop %d not distributed: failed to build the RDG.\n",
                 loop->num);

      free_dependence_relations (dependence_relations);
      free_data_refs (datarefs);
      VEC_free (loop_p, heap, loop_nest);
      return res;
    }

  vertices = VEC_alloc (int, heap, 3);

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_rdg (dump_file, rdg);

  FOR_EACH_VEC_ELT (gimple, stmts, i, s)
    {
      int v = rdg_vertex_for_stmt (rdg, s);

      if (v >= 0)
        {
          VEC_safe_push (int, heap, vertices, v);

          if (dump_file && (dump_flags & TDF_DETAILS))
            fprintf (dump_file,
                     "ldist asked to generate code for vertex %d\n", v);
        }
    }

  res = ldist_gen (loop, rdg, vertices);
  VEC_free (int, heap, vertices);
  free_rdg (rdg);
  free_dependence_relations (dependence_relations);
  free_data_refs (datarefs);
  VEC_free (loop_p, heap, loop_nest);
  return res;
}

/* Distribute all loops in the current function.  */

static unsigned int
tree_loop_distribution (void)
{
  struct loop *loop;
  loop_iterator li;
  int nb_generated_loops = 0;

  FOR_EACH_LOOP (li, loop, 0)
    {
      VEC (gimple, heap) *work_list = NULL;
      int num = loop->num;

      /* If the loop doesn't have a single exit we will fail anyway,
         so do that check early.  */
      if (!single_exit (loop))
        continue;

      /* If both flag_tree_loop_distribute_patterns and
         flag_tree_loop_distribution are set, then only
         distribute_patterns is executed.  */
      if (flag_tree_loop_distribute_patterns)
        {
          /* With the following working list, we're asking
             distribute_loop to separate from the rest of the loop the
             stores of the form "A[i] = 0".  */
          stores_zero_from_loop (loop, &work_list);

          /* Do nothing if there are no patterns to be distributed.  */
          if (VEC_length (gimple, work_list) > 0)
            nb_generated_loops = distribute_loop (loop, work_list);
        }
      else if (flag_tree_loop_distribution)
        {
          /* With the following working list, we're asking
             distribute_loop to separate the stores of the loop: when
             dependences allow, it will end up having one store per
             loop.  */
          stores_from_loop (loop, &work_list);

          /* A simple heuristic for cache locality is to not split
             stores to the same array.  Without this call, an unrolled
             loop would be split into as many loops as the unroll
             factor, each loop storing in the same array.  */
          remove_similar_memory_refs (&work_list);

          nb_generated_loops = distribute_loop (loop, work_list);
        }

      if (dump_file && (dump_flags & TDF_DETAILS))
        {
          if (nb_generated_loops > 1)
            fprintf (dump_file, "Loop %d distributed: split to %d loops.\n",
                     num, nb_generated_loops);
          else
            fprintf (dump_file, "Loop %d is the same.\n", num);
        }

      verify_loop_structure ();

      VEC_free (gimple, heap, work_list);
    }

  return 0;
}

static bool
gate_tree_loop_distribution (void)
{
  return flag_tree_loop_distribution
    || flag_tree_loop_distribute_patterns;
}
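
/* The pass is enabled by -ftree-loop-distribution (full distribution,
   one store per loop when dependences allow) or by
   -ftree-loop-distribute-patterns (memset zero pattern detection
   only), e.g.:

   |gcc -O2 -ftree-loop-distribution file.c  */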

struct gimple_opt_pass pass_loop_distribution =
{
 {
  GIMPLE_PASS,
  "ldist",                      /* name */
  gate_tree_loop_distribution,  /* gate */
  tree_loop_distribution,       /* execute */
  NULL,                         /* sub */
  NULL,                         /* next */
  0,                            /* static_pass_number */
  TV_TREE_LOOP_DISTRIBUTION,    /* tv_id */
  PROP_cfg | PROP_ssa,          /* properties_required */
  0,                            /* properties_provided */
  0,                            /* properties_destroyed */
  0,                            /* todo_flags_start */
  TODO_ggc_collect
  | TODO_verify_ssa             /* todo_flags_finish */
 }
};