/* Loop distribution.
   Copyright (C) 2006, 2007 Free Software Foundation, Inc.
   Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
   and Sebastian Pop <sebastian.pop@amd.com>.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

/* This pass performs loop distribution: for example, the loop

   |DO I = 2, N
   |    A(I) = B(I) + C
   |    D(I) = A(I-1)*E
   |ENDDO

   is transformed to

   |DOALL I = 2, N
   |   A(I) = B(I) + C
   |ENDDO
   |
   |DOALL I = 2, N
   |   D(I) = A(I-1)*E
   |ENDDO

   This pass uses an RDG, Reduced Dependence Graph built on top of the
   data dependence relations.  The RDG is then topologically sorted to
   obtain a map of information producers/consumers based on which it
   generates the new loops.  */
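
/* In the example above, the only dependence between the two
   statements is the flow dependence from the definition of A(I) to
   the read of A(I-1) one iteration later.  Since all dependences go
   from the first statement to the second, executing every A(I)
   definition before any D(I) definition preserves the semantics, and
   each of the resulting loops carries no dependence (DOALL).  The
   pass is enabled by -ftree-loop-distribution.  */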

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"

#include "rtl.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "expr.h"
#include "optabs.h"
#include "tree-chrec.h"
#include "tree-data-ref.h"
#include "tree-scalar-evolution.h"
#include "tree-pass.h"
#include "lambda.h"
#include "langhooks.h"
#include "tree-vectorizer.h"

/* If bit I is not set, it means that this node represents an
   operation that has already been performed, and that should not be
   performed again.  This is the subgraph of remaining important
   computations that is passed to the DFS algorithm to avoid
   including the same stores several times in different loops.  */
static bitmap remaining_stmts;

/* A node of the RDG is marked in this bitmap when it has as a
   predecessor a node that writes to memory.  */
static bitmap upstream_mem_writes;

/* Update the PHI nodes of NEW_LOOP.  NEW_LOOP is a duplicate of
   ORIG_LOOP.  */

static void
update_phis_for_loop_copy (struct loop *orig_loop, struct loop *new_loop)
{
  tree new_ssa_name;
  tree phi_new, phi_orig;
  edge orig_loop_latch = loop_latch_edge (orig_loop);
  edge orig_entry_e = loop_preheader_edge (orig_loop);
  edge new_loop_entry_e = loop_preheader_edge (new_loop);

  /* Scan the phis in the headers of the old and new loops
     (they are organized in exactly the same order).  */

  for (phi_new = phi_nodes (new_loop->header),
       phi_orig = phi_nodes (orig_loop->header);
       phi_new && phi_orig;
       phi_new = PHI_CHAIN (phi_new), phi_orig = PHI_CHAIN (phi_orig))
    {
      /* Add the first phi argument for the phi in NEW_LOOP (the one
         associated with the entry of NEW_LOOP).  */
      tree def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_entry_e);
      add_phi_arg (phi_new, def, new_loop_entry_e);

      /* Add the second phi argument for the phi in NEW_LOOP (the one
         associated with the latch of NEW_LOOP).  */
      def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_loop_latch);

      if (TREE_CODE (def) == SSA_NAME)
        {
          new_ssa_name = get_current_def (def);

          if (!new_ssa_name)
            /* This only happens if there are no definitions inside the
               loop.  Use the phi_result in this case.  */
            new_ssa_name = PHI_RESULT (phi_new);
        }
      else
        /* Could be an integer.  */
        new_ssa_name = def;

      add_phi_arg (phi_new, new_ssa_name, loop_latch_edge (new_loop));
    }
}

/* Return a copy of LOOP placed before LOOP.  */

static struct loop *
copy_loop_before (struct loop *loop)
{
  struct loop *res;
  edge preheader = loop_preheader_edge (loop);

  if (!single_exit (loop))
    return NULL;

  initialize_original_copy_tables ();
  res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, preheader);
  free_original_copy_tables ();

  if (!res)
    return NULL;

  update_phis_for_loop_copy (loop, res);
  rename_variables_in_loop (res);

  return res;
}

/* Creates an empty basic block after LOOP.  */

static void
create_bb_after_loop (struct loop *loop)
{
  edge exit = single_exit (loop);

  if (!exit)
    return;

  split_edge (exit);
}

/* Generate code for PARTITION from the code in LOOP.  The loop is
   copied when COPY_P is true.  All the statements not flagged in the
   PARTITION bitmap are removed from the loop or from its copy.  The
   statements are indexed in sequence inside a basic block, and the
   basic blocks of a loop are taken in dom order.  Returns true when
   the code gen succeeded.  */

static bool
generate_loops_for_partition (struct loop *loop, bitmap partition, bool copy_p)
{
  unsigned i, x;
  block_stmt_iterator bsi;
  basic_block *bbs;

  if (copy_p)
    {
      loop = copy_loop_before (loop);
      create_preheader (loop, CP_SIMPLE_PREHEADERS);
      create_bb_after_loop (loop);
    }

  if (loop == NULL)
    return false;

  /* Remove stmts not in the PARTITION bitmap.  The order in which we
     visit the phi nodes and the statements is exactly as in
     stmts_from_loop.  */
  bbs = get_loop_body_in_dom_order (loop);

  for (x = 0, i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];
      tree phi, prev = NULL_TREE, next;

      for (phi = phi_nodes (bb); phi;)
        if (!bitmap_bit_p (partition, x++))
          {
            next = PHI_CHAIN (phi);
            remove_phi_node (phi, prev, true);
            phi = next;
          }
        else
          {
            prev = phi;
            phi = PHI_CHAIN (phi);
          }

      for (bsi = bsi_start (bb); !bsi_end_p (bsi);)
        if (TREE_CODE (bsi_stmt (bsi)) != LABEL_EXPR
            && !bitmap_bit_p (partition, x++))
          bsi_remove (&bsi, false);
        else
          bsi_next (&bsi);

      mark_virtual_ops_in_bb (bb);
    }

  free (bbs);
  return true;
}

/* Generate a call to memset.  Return true when the operation succeeded.  */
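
/* As a sketch (not output copied from the pass), assuming 4-byte
   elements, a single-store zeroing loop like

   |for (i = 0; i < n; i++)
   |  a[i] = 0;

   is replaced by a call equivalent to

   |memset (&a[0], 0, n * 4);

   The number of bytes is NB_ITER times the size of the stored
   element, and the base address depends on the sign of the stride:
   for a negative stride the loop writes downwards, so the lowest
   written address is the base minus the number of bytes.  */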

static bool
generate_memset_zero (tree stmt, tree op0, tree nb_iter,
                      block_stmt_iterator bsi)
{
  tree s, t, stmts, nb_bytes, addr_base;
  bool res = false;
  tree stmt_list = NULL_TREE;
  tree args[3];
  tree fn_call, mem, fndecl, fntype, fn;
  tree_stmt_iterator i;
  ssa_op_iter iter;
  struct data_reference *dr = XCNEW (struct data_reference);

  nb_bytes = fold_build2 (MULT_EXPR, TREE_TYPE (nb_iter),
                          nb_iter, TYPE_SIZE_UNIT (TREE_TYPE (op0)));
  nb_bytes = force_gimple_operand (nb_bytes, &stmts, true, NULL);
  append_to_statement_list_force (stmts, &stmt_list);

  DR_STMT (dr) = stmt;
  DR_REF (dr) = op0;
  dr_analyze_innermost (dr);

  /* Test for a positive stride, iterating over every element.  */
  if (integer_zerop (fold_build2 (MINUS_EXPR, integer_type_node, DR_STEP (dr),
                                  TYPE_SIZE_UNIT (TREE_TYPE (op0)))))
    addr_base = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_BASE_ADDRESS (dr)),
                             DR_BASE_ADDRESS (dr),
                             size_binop (PLUS_EXPR,
                                         DR_OFFSET (dr), DR_INIT (dr)));

  /* Test for a negative stride, iterating over every element.  */
  else if (integer_zerop (fold_build2 (PLUS_EXPR, integer_type_node,
                                       TYPE_SIZE_UNIT (TREE_TYPE (op0)),
                                       DR_STEP (dr))))
    {
      addr_base = size_binop (PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr));
      addr_base = fold_build2 (MINUS_EXPR, sizetype, addr_base, nb_bytes);
      addr_base = force_gimple_operand (addr_base, &stmts, true, NULL);
      append_to_statement_list_force (stmts, &stmt_list);

      addr_base = fold_build2 (POINTER_PLUS_EXPR,
                               TREE_TYPE (DR_BASE_ADDRESS (dr)),
                               DR_BASE_ADDRESS (dr), addr_base);
    }
  else
    goto end;

  mem = force_gimple_operand (addr_base, &stmts, true, NULL);
  append_to_statement_list_force (stmts, &stmt_list);

  fndecl = implicit_built_in_decls [BUILT_IN_MEMSET];
  fntype = TREE_TYPE (fndecl);
  fn = build1 (ADDR_EXPR, build_pointer_type (fntype), fndecl);

  args[0] = mem;
  args[1] = integer_zero_node;
  args[2] = nb_bytes;

  fn_call = build_call_array (fntype, fn, 3, args);
  append_to_statement_list_force (fn_call, &stmt_list);

  for (i = tsi_start (stmt_list); !tsi_end_p (i); tsi_next (&i))
    {
      s = tsi_stmt (i);
      update_stmt_if_modified (s);

      FOR_EACH_SSA_TREE_OPERAND (t, s, iter, SSA_OP_VIRTUAL_DEFS)
        {
          if (TREE_CODE (t) == SSA_NAME)
            t = SSA_NAME_VAR (t);
          mark_sym_for_renaming (t);
        }
    }

  /* Mark also the uses of the VDEFS of STMT to be renamed.  */
  FOR_EACH_SSA_TREE_OPERAND (t, stmt, iter, SSA_OP_VIRTUAL_DEFS)
    {
      if (TREE_CODE (t) == SSA_NAME)
        {
          imm_use_iterator imm_iter;

          FOR_EACH_IMM_USE_STMT (s, imm_iter, t)
            update_stmt (s);

          t = SSA_NAME_VAR (t);
        }
      mark_sym_for_renaming (t);
    }

  bsi_insert_after (&bsi, stmt_list, BSI_CONTINUE_LINKING);
  res = true;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "generated memset zero\n");

 end:
  free_data_ref (dr);
  return res;
}

/* Tries to generate a builtin function for the instructions of LOOP
   pointed to by the bits set in PARTITION.  Returns true when the
   operation succeeded.  */
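
/* This replacement is deliberately conservative: it is attempted only
   when the number of executions of the exit condition is known (see
   number_of_exit_cond_executions below), the partition contains
   exactly one memory write, the written reference is an ARRAY_REF or
   an INDIRECT_REF, and the stored value is zero, in which case
   generate_memset_zero emits the call.  */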

static bool
generate_builtin (struct loop *loop, bitmap partition, bool copy_p)
{
  bool res = false;
  unsigned i, x = 0;
  basic_block *bbs;
  tree write = NULL_TREE;
  tree op0, op1;
  block_stmt_iterator bsi;
  tree nb_iter = number_of_exit_cond_executions (loop);

  if (!nb_iter || nb_iter == chrec_dont_know)
    return false;

  bbs = get_loop_body_in_dom_order (loop);

  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];
      tree phi;

      for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
        x++;

      for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
        {
          tree stmt = bsi_stmt (bsi);

          if (bitmap_bit_p (partition, x++)
              && TREE_CODE (stmt) == GIMPLE_MODIFY_STMT
              && !is_gimple_reg (GIMPLE_STMT_OPERAND (stmt, 0)))
            {
              /* Don't generate the builtins when there is more than
                 one memory write.  */
              if (write != NULL)
                goto end;

              write = stmt;
            }
        }
    }

  if (!write)
    goto end;

  op0 = GIMPLE_STMT_OPERAND (write, 0);
  op1 = GIMPLE_STMT_OPERAND (write, 1);

  if (!(TREE_CODE (op0) == ARRAY_REF
        || TREE_CODE (op0) == INDIRECT_REF))
    goto end;

  /* The new statements will be placed before LOOP.  */
  bsi = bsi_last (loop_preheader_edge (loop)->src);

  if (integer_zerop (op1) || real_zerop (op1))
    res = generate_memset_zero (write, op0, nb_iter, bsi);

  /* If this is the last partition for which we generate code, we have
     to destroy the loop.  */
  if (res && !copy_p)
    {
      unsigned nbbs = loop->num_nodes;
      basic_block src = loop_preheader_edge (loop)->src;
      basic_block dest = single_exit (loop)->dest;
      make_edge (src, dest, EDGE_FALLTHRU);
      set_immediate_dominator (CDI_DOMINATORS, dest, src);
      cancel_loop_tree (loop);

      for (i = 0; i < nbbs; i++)
        delete_basic_block (bbs[i]);
    }

 end:
  free (bbs);
  return res;
}

/* Generates code for PARTITION.  For simple loops, this function can
   generate a built-in.  */

static bool
generate_code_for_partition (struct loop *loop, bitmap partition, bool copy_p)
{
  if (generate_builtin (loop, partition, copy_p))
    return true;

  return generate_loops_for_partition (loop, partition, copy_p);
}


/* Returns true if the node V of RDG cannot be recomputed.  */

static bool
rdg_cannot_recompute_vertex_p (struct graph *rdg, int v)
{
  if (RDG_MEM_WRITE_STMT (rdg, v))
    return true;

  return false;
}

/* Returns true when the vertex V has already been generated in the
   current partition (V is in PROCESSED), or when V belongs to another
   partition and cannot be recomputed (V is not in REMAINING_STMTS).  */

static inline bool
already_processed_vertex_p (bitmap processed, int v)
{
  return (bitmap_bit_p (processed, v)
          || !bitmap_bit_p (remaining_stmts, v));
}

/* Returns NULL when there is no anti-dependence among the successors
   of vertex V, otherwise returns the edge with the anti-dep.  */

static struct graph_edge *
has_anti_dependence (struct vertex *v)
{
  struct graph_edge *e;

  if (v->succ)
    for (e = v->succ; e; e = e->succ_next)
      if (RDGE_TYPE (e) == anti_dd)
        return e;

  return NULL;
}

/* Returns true when one of the predecessors of the vertex V is
   marked as having upstream memory writes, not counting flow
   dependences.  */

static bool
predecessor_has_mem_write (struct graph *rdg, struct vertex *v)
{
  struct graph_edge *e;

  if (v->pred)
    for (e = v->pred; e; e = e->pred_next)
      if (bitmap_bit_p (upstream_mem_writes, e->src)
          /* Don't consider flow channels: a write to memory followed
             by a read from memory.  These channels allow the split of
             the RDG in different partitions.  */
          && !RDG_MEM_WRITE_STMT (rdg, e->src))
        return true;

  return false;
}

/* Initializes the upstream_mem_writes bitmap following the
   information from RDG.  */
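
/* A vertex X is marked when the statement of X writes to memory, when
   one of its predecessors is already marked (see
   predecessor_has_mem_write), or when X is the source of an
   anti-dependence, i.e. a read that must stay with a later write to
   the same location.  The SEEN bitmap guarantees that each vertex is
   visited only once.  */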

static void
mark_nodes_having_upstream_mem_writes (struct graph *rdg)
{
  int v, x;
  bitmap seen = BITMAP_ALLOC (NULL);

  for (v = rdg->n_vertices - 1; v >= 0; v--)
    if (!bitmap_bit_p (seen, v))
      {
        unsigned i;
        VEC (int, heap) *nodes = VEC_alloc (int, heap, 3);
        bool has_upstream_mem_write_p = false;

        graphds_dfs (rdg, &v, 1, &nodes, false, NULL);

        for (i = 0; VEC_iterate (int, nodes, i, x); i++)
          {
            if (bitmap_bit_p (seen, x))
              continue;

            bitmap_set_bit (seen, x);

            if (RDG_MEM_WRITE_STMT (rdg, x)
                || predecessor_has_mem_write (rdg, &(rdg->vertices[x]))
                /* In anti dependences the read should occur before
                   the write; this is why both the read and the write
                   should be placed in the same partition.  */
                || has_anti_dependence (&(rdg->vertices[x])))
              {
                has_upstream_mem_write_p = true;
                bitmap_set_bit (upstream_mem_writes, x);
              }
          }

        VEC_free (int, heap, nodes);
      }
}

/* Returns true when vertex U has a memory write node as a predecessor
   in RDG.  */

static bool
has_upstream_mem_writes (int u)
{
  return bitmap_bit_p (upstream_mem_writes, u);
}

static void rdg_flag_vertex_and_dependent (struct graph *, int, bitmap, bitmap,
                                           bitmap, bool *);

/* Flag all the uses of U.  */

static void
rdg_flag_all_uses (struct graph *rdg, int u, bitmap partition, bitmap loops,
                   bitmap processed, bool *part_has_writes)
{
  struct graph_edge *e;

  for (e = rdg->vertices[u].succ; e; e = e->succ_next)
    if (!bitmap_bit_p (processed, e->dest))
      {
        rdg_flag_vertex_and_dependent (rdg, e->dest, partition, loops,
                                       processed, part_has_writes);
        rdg_flag_all_uses (rdg, e->dest, partition, loops, processed,
                           part_has_writes);
      }
}

/* Flag the uses of U, deciding how far to follow them based on the
   information in upstream_mem_writes.  */

static void
rdg_flag_uses (struct graph *rdg, int u, bitmap partition, bitmap loops,
               bitmap processed, bool *part_has_writes)
{
  ssa_op_iter iter;
  use_operand_p use_p;
  struct vertex *x = &(rdg->vertices[u]);
  tree stmt = RDGV_STMT (x);
  struct graph_edge *anti_dep = has_anti_dependence (x);

  /* Keep in the same partition the destination of an antidependence,
     because this is a store to the exact same location.  Putting this
     in another partition is bad for cache locality.  */
  if (anti_dep)
    {
      int v = anti_dep->dest;

      if (!already_processed_vertex_p (processed, v))
        rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
                                       processed, part_has_writes);
    }

  if (TREE_CODE (stmt) != PHI_NODE)
    {
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_VIRTUAL_USES)
        {
          tree use = USE_FROM_PTR (use_p);

          if (TREE_CODE (use) == SSA_NAME)
            {
              tree def_stmt = SSA_NAME_DEF_STMT (use);
              int v = rdg_vertex_for_stmt (rdg, def_stmt);

              if (v >= 0
                  && !already_processed_vertex_p (processed, v))
                rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
                                               processed, part_has_writes);
            }
        }
    }

  if (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT
      && has_upstream_mem_writes (u))
    {
      tree op0 = GIMPLE_STMT_OPERAND (stmt, 0);

      /* Scalar channels don't have enough space for transmitting data
         between tasks, unless we add more storage by privatizing.  */
      if (is_gimple_reg (op0))
        {
          use_operand_p use_p;
          imm_use_iterator iter;

          FOR_EACH_IMM_USE_FAST (use_p, iter, op0)
            {
              int v = rdg_vertex_for_stmt (rdg, USE_STMT (use_p));

              if (!already_processed_vertex_p (processed, v))
                rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
                                               processed, part_has_writes);
            }
        }
    }
}

/* Flag V from RDG as part of PARTITION, and also flag its loop number
   in LOOPS.  */

static void
rdg_flag_vertex (struct graph *rdg, int v, bitmap partition, bitmap loops,
                 bool *part_has_writes)
{
  struct loop *loop;

  if (bitmap_bit_p (partition, v))
    return;

  loop = loop_containing_stmt (RDG_STMT (rdg, v));
  bitmap_set_bit (loops, loop->num);
  bitmap_set_bit (partition, v);

  if (rdg_cannot_recompute_vertex_p (rdg, v))
    {
      *part_has_writes = true;
      bitmap_clear_bit (remaining_stmts, v);
    }
}

/* Flag in the bitmap PARTITION the vertex V and all its predecessors.
   Also flag their loop number in LOOPS.  */

static void
rdg_flag_vertex_and_dependent (struct graph *rdg, int v, bitmap partition,
                               bitmap loops, bitmap processed,
                               bool *part_has_writes)
{
  unsigned i;
  VEC (int, heap) *nodes = VEC_alloc (int, heap, 3);
  int x;

  bitmap_set_bit (processed, v);
  rdg_flag_uses (rdg, v, partition, loops, processed, part_has_writes);
  graphds_dfs (rdg, &v, 1, &nodes, false, remaining_stmts);
  rdg_flag_vertex (rdg, v, partition, loops, part_has_writes);

  for (i = 0; VEC_iterate (int, nodes, i, x); i++)
    if (!already_processed_vertex_p (processed, x))
      rdg_flag_vertex_and_dependent (rdg, x, partition, loops, processed,
                                     part_has_writes);

  VEC_free (int, heap, nodes);
}

/* Initialize CONDS with all the condition statements from the basic
   blocks of LOOP.  */

static void
collect_condition_stmts (struct loop *loop, VEC (tree, heap) **conds)
{
  unsigned i;
  edge e;
  VEC (edge, heap) *exits = get_loop_exit_edges (loop);

  for (i = 0; VEC_iterate (edge, exits, i, e); i++)
    {
      tree cond = last_stmt (e->src);

      if (cond)
        VEC_safe_push (tree, heap, *conds, cond);
    }

  VEC_free (edge, heap, exits);
}

/* Add to PARTITION all the exit condition statements for LOOPS
   together with all their dependent statements determined from
   RDG.  */
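
/* Each partition later becomes a separate copy of the loop nest, so
   the statements controlling the loop exits, and everything they
   depend on, must be present in every partition.  Flagging a
   condition may pull in statements from other loops, which is why
   newly discovered loop numbers are processed until a fixed point is
   reached.  */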

static void
rdg_flag_loop_exits (struct graph *rdg, bitmap loops, bitmap partition,
                     bitmap processed, bool *part_has_writes)
{
  unsigned i;
  bitmap_iterator bi;
  VEC (tree, heap) *conds = VEC_alloc (tree, heap, 3);

  EXECUTE_IF_SET_IN_BITMAP (loops, 0, i, bi)
    collect_condition_stmts (get_loop (i), &conds);

  while (!VEC_empty (tree, conds))
    {
      tree cond = VEC_pop (tree, conds);
      int v = rdg_vertex_for_stmt (rdg, cond);
      bitmap new_loops = BITMAP_ALLOC (NULL);

      if (!already_processed_vertex_p (processed, v))
        rdg_flag_vertex_and_dependent (rdg, v, partition, new_loops, processed,
                                       part_has_writes);

      EXECUTE_IF_SET_IN_BITMAP (new_loops, 0, i, bi)
        if (!bitmap_bit_p (loops, i))
          {
            bitmap_set_bit (loops, i);
            collect_condition_stmts (get_loop (i), &conds);
          }

      BITMAP_FREE (new_loops);
    }
}

/* Strongly connected components of the reduced data dependence graph.  */

typedef struct rdg_component
{
  int num;
  VEC (int, heap) *vertices;
} *rdgc;

DEF_VEC_P (rdgc);
DEF_VEC_ALLOC_P (rdgc, heap);

/* Flag all the nodes of RDG containing memory accesses that could
   potentially belong to arrays already accessed in the current
   PARTITION.  */

static void
rdg_flag_similar_memory_accesses (struct graph *rdg, bitmap partition,
                                  bitmap loops, bitmap processed,
                                  VEC (int, heap) **other_stores)
{
  bool foo;
  unsigned i, n;
  int j, k, kk;
  bitmap_iterator ii;
  struct graph_edge *e;

  EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, ii)
    if (RDG_MEM_WRITE_STMT (rdg, i)
        || RDG_MEM_READS_STMT (rdg, i))
      {
        for (j = 0; j < rdg->n_vertices; j++)
          if (!bitmap_bit_p (processed, j)
              && (RDG_MEM_WRITE_STMT (rdg, j)
                  || RDG_MEM_READS_STMT (rdg, j))
              && rdg_has_similar_memory_accesses (rdg, i, j))
            {
              /* Flag first the node J itself, and all the nodes that
                 are needed to compute J.  */
              rdg_flag_vertex_and_dependent (rdg, j, partition, loops,
                                             processed, &foo);

              /* When J is a read, we want to coalesce in the same
                 PARTITION all the nodes that are using J: this is
                 needed for better cache locality.  */
              rdg_flag_all_uses (rdg, j, partition, loops, processed, &foo);

              /* Remove from OTHER_STORES the vertex that we flagged.  */
              if (RDG_MEM_WRITE_STMT (rdg, j))
                for (k = 0; VEC_iterate (int, *other_stores, k, kk); k++)
                  if (kk == j)
                    {
                      VEC_unordered_remove (int, *other_stores, k);
                      break;
                    }
            }

        /* If the node I has more than one use, then keep these
           together in the same PARTITION.  */
        for (n = 0, e = rdg->vertices[i].succ; e; e = e->succ_next, n++);

        if (n > 1)
          rdg_flag_all_uses (rdg, i, partition, loops, processed, &foo);
      }
}

/* Returns a bitmap in which all the statements needed for computing
   the strongly connected component C of the RDG are flagged, also
   including the loop exit conditions.  */

static bitmap
build_rdg_partition_for_component (struct graph *rdg, rdgc c,
                                   bool *part_has_writes,
                                   VEC (int, heap) **other_stores)
{
  int i, v;
  bitmap partition = BITMAP_ALLOC (NULL);
  bitmap loops = BITMAP_ALLOC (NULL);
  bitmap processed = BITMAP_ALLOC (NULL);

  for (i = 0; VEC_iterate (int, c->vertices, i, v); i++)
    if (!already_processed_vertex_p (processed, v))
      rdg_flag_vertex_and_dependent (rdg, v, partition, loops, processed,
                                     part_has_writes);

  /* Also iterate on the array of stores not in the starting vertices,
     and determine those vertices that have some memory affinity with
     the current nodes in the component: these are stores to the same
     arrays, i.e. we're taking care of cache locality.  */
  rdg_flag_similar_memory_accesses (rdg, partition, loops, processed,
                                    other_stores);

  rdg_flag_loop_exits (rdg, loops, partition, processed, part_has_writes);

  BITMAP_FREE (processed);
  BITMAP_FREE (loops);
  return partition;
}

/* Free memory for COMPONENTS.  */

static void
free_rdg_components (VEC (rdgc, heap) *components)
{
  int i;
  rdgc x;

  for (i = 0; VEC_iterate (rdgc, components, i, x); i++)
    {
      VEC_free (int, heap, x->vertices);
      free (x);
    }
}

/* Build the COMPONENTS vector with the strongly connected components
   of RDG in which the STARTING_VERTICES occur.  */

static void
rdg_build_components (struct graph *rdg, VEC (int, heap) *starting_vertices,
                      VEC (rdgc, heap) **components)
{
  int i, v;
  bitmap saved_components = BITMAP_ALLOC (NULL);
  int n_components = graphds_scc (rdg, NULL);
  VEC (int, heap) **all_components = XNEWVEC (VEC (int, heap) *, n_components);

  for (i = 0; i < n_components; i++)
    all_components[i] = VEC_alloc (int, heap, 3);

  for (i = 0; i < rdg->n_vertices; i++)
    VEC_safe_push (int, heap, all_components[rdg->vertices[i].component], i);

  for (i = 0; VEC_iterate (int, starting_vertices, i, v); i++)
    {
      int c = rdg->vertices[v].component;

      if (!bitmap_bit_p (saved_components, c))
        {
          rdgc x = XCNEW (struct rdg_component);
          x->num = c;
          x->vertices = all_components[c];

          VEC_safe_push (rdgc, heap, *components, x);
          bitmap_set_bit (saved_components, c);
        }
    }

  for (i = 0; i < n_components; i++)
    if (!bitmap_bit_p (saved_components, i))
      VEC_free (int, heap, all_components[i]);

  free (all_components);
  BITMAP_FREE (saved_components);
}

DEF_VEC_P (bitmap);
DEF_VEC_ALLOC_P (bitmap, heap);

/* Aggregate several components into a useful partition that is
   registered in the PARTITIONS vector.  Partitions will be
   distributed in different loops.  */
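
/* A partition is closed and pushed to PARTITIONS only once it
   contains a memory write (see PART_HAS_WRITES below): components
   made of pure scalar computations have no observable effect of
   their own, so they keep accumulating into the current partition
   until a component containing a store joins them.  */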

static void
rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components,
                      VEC (int, heap) **other_stores,
                      VEC (bitmap, heap) **partitions, bitmap processed)
{
  int i;
  rdgc x;
  bitmap partition = BITMAP_ALLOC (NULL);

  for (i = 0; VEC_iterate (rdgc, components, i, x); i++)
    {
      bitmap np;
      bool part_has_writes = false;
      int v = VEC_index (int, x->vertices, 0);

      if (bitmap_bit_p (processed, v))
        continue;

      np = build_rdg_partition_for_component (rdg, x, &part_has_writes,
                                              other_stores);
      bitmap_ior_into (partition, np);
      bitmap_ior_into (processed, np);
      BITMAP_FREE (np);

      if (part_has_writes)
        {
          if (dump_file && (dump_flags & TDF_DETAILS))
            {
              fprintf (dump_file, "ldist useful partition:\n");
              dump_bitmap (dump_file, partition);
            }

          VEC_safe_push (bitmap, heap, *partitions, partition);
          partition = BITMAP_ALLOC (NULL);
        }
    }

  /* Add the nodes from the RDG that were not marked as processed, and
     that are used outside the current loop.  These are scalar
     computations that are not yet part of previous partitions.  */
  for (i = 0; i < rdg->n_vertices; i++)
    if (!bitmap_bit_p (processed, i)
        && rdg_defs_used_in_other_loops_p (rdg, i))
      VEC_safe_push (int, heap, *other_stores, i);

  /* If there are still statements left in the OTHER_STORES array,
     create other components and partitions with these stores and
     their dependences.  */
  if (VEC_length (int, *other_stores) > 0)
    {
      VEC (rdgc, heap) *comps = VEC_alloc (rdgc, heap, 3);
      VEC (int, heap) *foo = VEC_alloc (int, heap, 3);

      rdg_build_components (rdg, *other_stores, &comps);
      rdg_build_partitions (rdg, comps, &foo, partitions, processed);

      VEC_free (int, heap, foo);
      free_rdg_components (comps);
    }

  /* If there is something left in the last partition, save it.  */
  if (bitmap_count_bits (partition) > 0)
    VEC_safe_push (bitmap, heap, *partitions, partition);
  else
    BITMAP_FREE (partition);
}

/* Dump to FILE the PARTITIONS.  */

static void
dump_rdg_partitions (FILE *file, VEC (bitmap, heap) *partitions)
{
  int i;
  bitmap partition;

  for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
    debug_bitmap_file (file, partition);
}

/* Debug PARTITIONS.  */
extern void debug_rdg_partitions (VEC (bitmap, heap) *);

void
debug_rdg_partitions (VEC (bitmap, heap) *partitions)
{
  dump_rdg_partitions (stderr, partitions);
}

/* Generate code from STARTING_VERTICES in RDG.  Returns the number of
   distributed loops.  */

static int
ldist_gen (struct loop *loop, struct graph *rdg,
           VEC (int, heap) *starting_vertices)
{
  int i, nbp;
  VEC (rdgc, heap) *components = VEC_alloc (rdgc, heap, 3);
  VEC (bitmap, heap) *partitions = VEC_alloc (bitmap, heap, 3);
  VEC (int, heap) *other_stores = VEC_alloc (int, heap, 3);
  bitmap partition, processed = BITMAP_ALLOC (NULL);

  remaining_stmts = BITMAP_ALLOC (NULL);
  upstream_mem_writes = BITMAP_ALLOC (NULL);

  for (i = 0; i < rdg->n_vertices; i++)
    {
      bitmap_set_bit (remaining_stmts, i);

      /* Save in OTHER_STORES all the memory writes that are not in
         STARTING_VERTICES.  */
      if (RDG_MEM_WRITE_STMT (rdg, i))
        {
          int v;
          unsigned j;
          bool found = false;

          for (j = 0; VEC_iterate (int, starting_vertices, j, v); j++)
            if (i == v)
              {
                found = true;
                break;
              }

          if (!found)
            VEC_safe_push (int, heap, other_stores, i);
        }
    }

  mark_nodes_having_upstream_mem_writes (rdg);
  rdg_build_components (rdg, starting_vertices, &components);
  rdg_build_partitions (rdg, components, &other_stores, &partitions,
                        processed);
  BITMAP_FREE (processed);
  nbp = VEC_length (bitmap, partitions);

  if (nbp <= 1)
    goto ldist_done;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_rdg_partitions (dump_file, partitions);

  for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
    if (!generate_code_for_partition (loop, partition, i < nbp - 1))
      goto ldist_done;

  rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
  update_ssa (TODO_update_ssa_only_virtuals | TODO_update_ssa);

 ldist_done:

  BITMAP_FREE (remaining_stmts);
  BITMAP_FREE (upstream_mem_writes);

  for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
    BITMAP_FREE (partition);

  VEC_free (int, heap, other_stores);
  VEC_free (bitmap, heap, partitions);
  free_rdg_components (components);
  return nbp;
}

/* Distributes the code from LOOP in such a way that producer
   statements are placed before consumer statements.  When STMTS is
   NULL, performs the maximal distribution; if STMTS is not NULL,
   tries to separate only these statements from the LOOP's body.
   Returns the number of distributed loops.  */
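
/* Note that the current implementation only handles loops made of a
   single basic block plus the latch (loop->num_nodes <= 2); larger
   loop bodies make the function give up, as the FIXME below
   records.  */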

static int
distribute_loop (struct loop *loop, VEC (tree, heap) *stmts)
{
  int res = 0;
  struct graph *rdg;
  tree s;
  unsigned i;
  VEC (int, heap) *vertices;

  if (loop->num_nodes > 2)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file,
                 "FIXME: Loop %d not distributed: it has more than two basic blocks.\n",
                 loop->num);

      return res;
    }

  rdg = build_rdg (loop);

  if (!rdg)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file,
                 "FIXME: Loop %d not distributed: failed to build the RDG.\n",
                 loop->num);

      return res;
    }

  vertices = VEC_alloc (int, heap, 3);

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_rdg (dump_file, rdg);

  for (i = 0; VEC_iterate (tree, stmts, i, s); i++)
    {
      int v = rdg_vertex_for_stmt (rdg, s);

      if (v >= 0)
        {
          VEC_safe_push (int, heap, vertices, v);

          if (dump_file && (dump_flags & TDF_DETAILS))
            fprintf (dump_file,
                     "ldist asked to generate code for vertex %d\n", v);
        }
    }

  res = ldist_gen (loop, rdg, vertices);
  VEC_free (int, heap, vertices);
  free_rdg (rdg);

  return res;
}

/* Distribute all loops in the current function.  */

static unsigned int
tree_loop_distribution (void)
{
  struct loop *loop;
  loop_iterator li;
  int nb_generated_loops = 0;

  FOR_EACH_LOOP (li, loop, 0)
    {
      VEC (tree, heap) *work_list = VEC_alloc (tree, heap, 3);

      /* With the following working list, we're asking distribute_loop
         to separate the stores of the loop: when dependences allow,
         it will end up with one store per loop.  */
      stores_from_loop (loop, &work_list);

      /* A simple heuristic for cache locality is to not split stores
         to the same array.  Without this call, an unrolled loop would
         be split into as many loops as the unroll factor, each loop
         storing in the same array.  */
      remove_similar_memory_refs (&work_list);

      nb_generated_loops = distribute_loop (loop, work_list);

      if (dump_file && (dump_flags & TDF_DETAILS))
        {
          if (nb_generated_loops > 1)
            fprintf (dump_file, "Loop %d distributed: split to %d loops.\n",
                     loop->num, nb_generated_loops);
          else
            fprintf (dump_file, "Loop %d is the same.\n", loop->num);
        }

      verify_loop_structure ();

      VEC_free (tree, heap, work_list);
    }

  return 0;
}

static bool
gate_tree_loop_distribution (void)
{
  return flag_tree_loop_distribution != 0;
}

struct tree_opt_pass pass_loop_distribution =
{
  "ldist",                          /* name */
  gate_tree_loop_distribution,      /* gate */
  tree_loop_distribution,           /* execute */
  NULL,                             /* sub */
  NULL,                             /* next */
  0,                                /* static_pass_number */
  TV_TREE_LOOP_DISTRIBUTION,        /* tv_id */
  PROP_cfg | PROP_ssa,              /* properties_required */
  0,                                /* properties_provided */
  0,                                /* properties_destroyed */
  0,                                /* todo_flags_start */
  TODO_dump_func | TODO_verify_loops,  /* todo_flags_finish */
  0                                 /* letter */
};