ddg.c (build_intra_loop_deps): Adjust add_forward_dependence call.
[gcc.git] / gcc / modulo-sched.c
1 /* Swing Modulo Scheduling implementation.
2 Copyright (C) 2004, 2005, 2006
3 Free Software Foundation, Inc.
4 Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to the Free
20 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
21 02110-1301, USA. */
22
23
24 #include "config.h"
25 #include "system.h"
26 #include "coretypes.h"
27 #include "tm.h"
28 #include "toplev.h"
29 #include "rtl.h"
30 #include "tm_p.h"
31 #include "hard-reg-set.h"
32 #include "regs.h"
33 #include "function.h"
34 #include "flags.h"
35 #include "insn-config.h"
36 #include "insn-attr.h"
37 #include "except.h"
38 #include "toplev.h"
39 #include "recog.h"
40 #include "sched-int.h"
41 #include "target.h"
42 #include "cfglayout.h"
43 #include "cfgloop.h"
44 #include "cfghooks.h"
45 #include "expr.h"
46 #include "params.h"
47 #include "gcov-io.h"
48 #include "df.h"
49 #include "ddg.h"
50 #include "timevar.h"
51 #include "tree-pass.h"
52
53 #ifdef INSN_SCHEDULING
54
55 /* This file contains the implementation of the Swing Modulo Scheduler,
56 described in the following references:
57 [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
58 Lifetime--sensitive modulo scheduling in a production environment.
59 IEEE Trans. on Comps., 50(3), March 2001
60 [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
61 Swing Modulo Scheduling: A Lifetime Sensitive Approach.
62 PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).
63
64 The basic structure is:
65 1. Build a data-dependence graph (DDG) for each loop.
66 2. Use the DDG to order the insns of a loop (not in topological order
67 necessarily, but rather) trying to place each insn after all its
68 predecessors _or_ after all its successors.
69 3. Compute MII: a lower bound on the number of cycles to schedule the loop.
70 4. Use the ordering to perform list-scheduling of the loop:
71 1. Set II = MII. We will try to schedule the loop within II cycles.
72 2. Try to schedule the insns one by one according to the ordering.
73 For each insn compute an interval of cycles by considering already-
74 scheduled preds and succs (and associated latencies); try to place
75 the insn in the cycles of this window checking for potential
76 resource conflicts (using the DFA interface).
77 Note: this is different from the cycle-scheduling of schedule_insns;
78 here the insns are not scheduled monotonically top-down (nor bottom-
79 up).
80 3. If failed in scheduling all insns - bump II++ and try again, unless
81 II reaches an upper bound MaxII, in which case report failure.
82 5. If we succeeded in scheduling the loop within II cycles, we now
83 generate prolog and epilog, decrease the counter of the loop, and
84 perform modulo variable expansion for live ranges that span more than
85 II cycles (i.e. use register copies to prevent a def from overwriting
86 itself before reaching the use).
87 */
88
89 \f
90 /* This page defines partial-schedule structures and functions for
91 modulo scheduling. */
92
typedef struct partial_schedule *partial_schedule_ptr;
typedef struct ps_insn *ps_insn_ptr;

/* The minimum (absolute) cycle that a node of ps was scheduled in.  */
#define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)

/* The maximum (absolute) cycle that a node of ps was scheduled in.  */
#define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)

/* Perform signed modulo, always returning a non-negative value.  */
#define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))

/* The number of different iterations the nodes in ps span, assuming
   the stage boundaries are placed efficiently (i.e. the span of cycles,
   rounded up to a whole number of II-cycle stages).  */
#define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \
			     + 1 + (ps)->ii - 1) / (ps)->ii)

/* A single instruction in the partial schedule.  */
struct ps_insn
{
  /* The corresponding DDG_NODE.  */
  ddg_node_ptr node;

  /* The (absolute) cycle in which the PS instruction is scheduled.
     Same as SCHED_TIME (node).  */
  int cycle;

  /* The next/prev PS_INSN in the same row.  */
  ps_insn_ptr next_in_row,
	      prev_in_row;

  /* The number of nodes in the same row that come after this node.  */
  int row_rest_count;
};

/* Holds the partial schedule as an array of II rows.  Each entry of the
   array points to a linked list of PS_INSNs, which represents the
   instructions that are scheduled for that row.  */
struct partial_schedule
{
  int ii;	/* Number of rows in the partial schedule.  */
  int history;  /* Threshold for conflict checking using DFA.  */

  /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii).  */
  ps_insn_ptr *rows;

  /* The earliest absolute cycle of an insn in the partial schedule.  */
  int min_cycle;

  /* The latest absolute cycle of an insn in the partial schedule.  */
  int max_cycle;

  ddg_ptr g;	/* The DDG of the insns in the partial schedule.  */
};

/* We use this to record all the register replacements we do in
   the kernel so we can undo SMS if it is not profitable.  */
struct undo_replace_buff_elem
{
  rtx insn;	/* The insn in which a register was replaced.  */
  rtx orig_reg;	/* The register the insn originally referenced.  */
  rtx new_reg;	/* The reg-move copy that replaced ORIG_REG.  */
  struct undo_replace_buff_elem *next;	/* Next recorded replacement.  */
};
157
158
159
/* Forward declarations for the partial-schedule operations defined
   later in this file.  */
static partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
static void free_partial_schedule (partial_schedule_ptr);
static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
void print_partial_schedule (partial_schedule_ptr, FILE *);
static int kernel_number_of_cycles (rtx first_insn, rtx last_insn);
static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
						ddg_node_ptr node, int cycle,
						sbitmap must_precede,
						sbitmap must_follow);
static void rotate_partial_schedule (partial_schedule_ptr, int);
void set_row_column_for_ps (partial_schedule_ptr);
static bool ps_unschedule_node (partial_schedule_ptr, ddg_node_ptr );
172
173 \f
174 /* This page defines constants and structures for the modulo scheduling
175 driver. */
176
177 /* As in haifa-sched.c: */
178 /* issue_rate is the number of insns that can be scheduled in the same
179 machine cycle. It can be defined in the config/mach/mach.h file,
180 otherwise we set it to 1. */
181
static int issue_rate;

/* Forward declarations for the modulo-scheduling driver routines.  */
static int sms_order_nodes (ddg_ptr, int, int * result);
static void set_node_sched_params (ddg_ptr);
static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *);
static void permute_partial_schedule (partial_schedule_ptr ps, rtx last);
static void generate_prolog_epilog (partial_schedule_ptr ,struct loop * loop, rtx);
static void duplicate_insns_of_cycles (partial_schedule_ptr ps,
				       int from_stage, int to_stage,
				       int is_prolog);

/* Accessors for the node_sched_params structure attached to each DDG
   node through its aux.info field (see set_node_sched_params).  */
#define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
#define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
#define SCHED_FIRST_REG_MOVE(x) \
	(((node_sched_params_ptr)(x)->aux.info)->first_reg_move)
#define SCHED_NREG_MOVES(x) \
	(((node_sched_params_ptr)(x)->aux.info)->nreg_moves)
#define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row)
#define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage)
#define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column)

/* The scheduling parameters held for each node.  */
typedef struct node_sched_params
{
  int asap;	/* A lower-bound on the absolute scheduling cycle.  */
  int time;	/* The absolute scheduling cycle (time >= asap).  */

  /* The following field (first_reg_move) is a pointer to the first
     register-move instruction added to handle the modulo-variable-expansion
     of the register defined by this node.  This register-move copies the
     original register defined by the node.  */
  rtx first_reg_move;

  /* The number of register-move instructions added, immediately preceding
     first_reg_move.  */
  int nreg_moves;

  int row;    /* Holds time % ii.  */
  int stage;  /* Holds time / ii.  */

  /* The column of a node inside the ps.  If nodes u, v are on the same row,
     u will precede v if column (u) < column (v).  */
  int column;
} *node_sched_params_ptr;
226
227 \f
228 /* The following three functions are copied from the current scheduler
229 code in order to use sched_analyze() for computing the dependencies.
230 They are used when initializing the sched_info structure. */
231 static const char *
232 sms_print_insn (rtx insn, int aligned ATTRIBUTE_UNUSED)
233 {
234 static char tmp[80];
235
236 sprintf (tmp, "i%4d", INSN_UID (insn));
237 return tmp;
238 }
239
240 static int
241 contributes_to_priority (rtx next, rtx insn)
242 {
243 return BLOCK_NUM (next) == BLOCK_NUM (insn);
244 }
245
/* Empty implementation of this sched_info callback: SMS records no
   additional register dependencies for jump instructions.  */
static void
compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
			       regset cond_exec ATTRIBUTE_UNUSED,
			       regset used ATTRIBUTE_UNUSED,
			       regset set ATTRIBUTE_UNUSED)
{
}
253
/* The sched_info hooks used while running sched_analyze to compute
   dependencies.  Only the printing/priority/jump-dependency callbacks
   above are provided; the remaining hooks are unused by SMS and left
   NULL/zero (their meaning is given by struct sched_info in
   sched-int.h).  */
static struct sched_info sms_sched_info =
{
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  sms_print_insn,
  contributes_to_priority,
  compute_jump_reg_dependencies,
  NULL, NULL,
  NULL, NULL,
  0, 0, 0,

  0
};
270
271
272 /* Return the register decremented and tested in INSN,
273 or zero if it is not a decrement-and-branch insn. */
274
275 static rtx
276 doloop_register_get (rtx insn ATTRIBUTE_UNUSED)
277 {
278 #ifdef HAVE_doloop_end
279 rtx pattern, reg, condition;
280
281 if (! JUMP_P (insn))
282 return NULL_RTX;
283
284 pattern = PATTERN (insn);
285 condition = doloop_condition_get (pattern);
286 if (! condition)
287 return NULL_RTX;
288
289 if (REG_P (XEXP (condition, 0)))
290 reg = XEXP (condition, 0);
291 else if (GET_CODE (XEXP (condition, 0)) == PLUS
292 && REG_P (XEXP (XEXP (condition, 0), 0)))
293 reg = XEXP (XEXP (condition, 0), 0);
294 else
295 gcc_unreachable ();
296
297 return reg;
298 #else
299 return NULL_RTX;
300 #endif
301 }
302
303 /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
304 that the number of iterations is a compile-time constant. If so,
305 return the rtx that sets COUNT_REG to a constant, and set COUNT to
306 this constant. Otherwise return 0. */
307 static rtx
308 const_iteration_count (rtx count_reg, basic_block pre_header,
309 HOST_WIDEST_INT * count)
310 {
311 rtx insn;
312 rtx head, tail;
313
314 if (! pre_header)
315 return NULL_RTX;
316
317 get_block_head_tail (pre_header->index, &head, &tail);
318
319 for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
320 if (INSN_P (insn) && single_set (insn) &&
321 rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
322 {
323 rtx pat = single_set (insn);
324
325 if (GET_CODE (SET_SRC (pat)) == CONST_INT)
326 {
327 *count = INTVAL (SET_SRC (pat));
328 return insn;
329 }
330
331 return NULL_RTX;
332 }
333
334 return NULL_RTX;
335 }
336
337 /* A very simple resource-based lower bound on the initiation interval.
338 ??? Improve the accuracy of this bound by considering the
339 utilization of various units. */
340 static int
341 res_MII (ddg_ptr g)
342 {
343 return (g->num_nodes / issue_rate);
344 }
345
346
/* Points to the array that contains the sched data for each node;
   allocated by set_node_sched_params, which also makes each DDG node's
   aux.info point at its entry.  */
static node_sched_params_ptr node_sched_params;
349
350 /* Allocate sched_params for each node and initialize it. Assumes that
351 the aux field of each node contain the asap bound (computed earlier),
352 and copies it into the sched_params field. */
353 static void
354 set_node_sched_params (ddg_ptr g)
355 {
356 int i;
357
358 /* Allocate for each node in the DDG a place to hold the "sched_data". */
359 /* Initialize ASAP/ALAP/HIGHT to zero. */
360 node_sched_params = (node_sched_params_ptr)
361 xcalloc (g->num_nodes,
362 sizeof (struct node_sched_params));
363
364 /* Set the pointer of the general data of the node to point to the
365 appropriate sched_params structure. */
366 for (i = 0; i < g->num_nodes; i++)
367 {
368 /* Watch out for aliasing problems? */
369 node_sched_params[i].asap = g->nodes[i].aux.count;
370 g->nodes[i].aux.info = &node_sched_params[i];
371 }
372 }
373
374 static void
375 print_node_sched_params (FILE * file, int num_nodes)
376 {
377 int i;
378
379 if (! file)
380 return;
381 for (i = 0; i < num_nodes; i++)
382 {
383 node_sched_params_ptr nsp = &node_sched_params[i];
384 rtx reg_move = nsp->first_reg_move;
385 int j;
386
387 fprintf (file, "Node %d:\n", i);
388 fprintf (file, " asap = %d:\n", nsp->asap);
389 fprintf (file, " time = %d:\n", nsp->time);
390 fprintf (file, " nreg_moves = %d:\n", nsp->nreg_moves);
391 for (j = 0; j < nsp->nreg_moves; j++)
392 {
393 fprintf (file, " reg_move = ");
394 print_rtl_single (file, reg_move);
395 reg_move = PREV_INSN (reg_move);
396 }
397 }
398 }
399
400 /* Calculate an upper bound for II. SMS should not schedule the loop if it
401 requires more cycles than this bound. Currently set to the sum of the
402 longest latency edge for each node. Reset based on experiments. */
403 static int
404 calculate_maxii (ddg_ptr g)
405 {
406 int i;
407 int maxii = 0;
408
409 for (i = 0; i < g->num_nodes; i++)
410 {
411 ddg_node_ptr u = &g->nodes[i];
412 ddg_edge_ptr e;
413 int max_edge_latency = 0;
414
415 for (e = u->out; e; e = e->next_out)
416 max_edge_latency = MAX (max_edge_latency, e->latency);
417
418 maxii += max_edge_latency;
419 }
420 return maxii;
421 }
422
423 /*
424 Breaking intra-loop register anti-dependences:
425 Each intra-loop register anti-dependence implies a cross-iteration true
426 dependence of distance 1. Therefore, we can remove such false dependencies
427 and figure out if the partial schedule broke them by checking if (for a
428 true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
429 if so generate a register move. The number of such moves is equal to:
430 SCHED_TIME (use) - SCHED_TIME (def) { 0 broken
431 nreg_moves = ----------------------------------- + 1 - { dependence.
432 ii { 1 if not.
433 */
/* Insert the register moves needed for modulo variable expansion (see
   the comment above), redirect each affected use to the appropriate
   copy, and return a list recording every replacement so the caller
   can undo them if SMS is abandoned.  */
static struct undo_replace_buff_elem *
generate_reg_moves (partial_schedule_ptr ps)
{
  ddg_ptr g = ps->g;
  int ii = ps->ii;
  int i;
  struct undo_replace_buff_elem *reg_move_replaces = NULL;

  for (i = 0; i < g->num_nodes; i++)
    {
      ddg_node_ptr u = &g->nodes[i];
      ddg_edge_ptr e;
      int nreg_moves = 0, i_reg_move;
      sbitmap *uses_of_defs;
      rtx last_reg_move;
      rtx prev_reg, old_reg;

      /* Compute the number of reg_moves needed for u, by looking at life
	 ranges started at u (excluding self-loops).  */
      for (e = u->out; e; e = e->next_out)
	if (e->type == TRUE_DEP && e->dest != e->src)
	  {
	    int nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;

	    /* A distance-1 dependence reads the value one iteration
	       (ii cycles) later.  */
	    if (e->distance == 1)
	      nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;

	    /* If dest precedes src in the schedule of the kernel, then dest
	       will read before src writes and we can save one reg_copy.  */
	    if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
		&& SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
	      nreg_moves4e--;

	    nreg_moves = MAX (nreg_moves, nreg_moves4e);
	  }

      if (nreg_moves == 0)
	continue;

      /* Every use of the register defined by node may require a different
	 copy of this register, depending on the time the use is scheduled.
	 Set a bitmap vector, telling which nodes use each copy of this
	 register.  */
      uses_of_defs = sbitmap_vector_alloc (nreg_moves, g->num_nodes);
      sbitmap_vector_zero (uses_of_defs, nreg_moves);
      for (e = u->out; e; e = e->next_out)
	if (e->type == TRUE_DEP && e->dest != e->src)
	  {
	    int dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;

	    if (e->distance == 1)
	      dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;

	    if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
		&& SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
	      dest_copy--;

	    /* dest_copy == 0 means the use can read the original def
	       directly; no reg_move needed for it.  */
	    if (dest_copy)
	      SET_BIT (uses_of_defs[dest_copy - 1], e->dest->cuid);
	  }

      /* Now generate the reg_moves, attaching relevant uses to them.  */
      SCHED_NREG_MOVES (u) = nreg_moves;
      old_reg = prev_reg = copy_rtx (SET_DEST (single_set (u->insn)));
      last_reg_move = u->insn;

      for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
	{
	  unsigned int i_use = 0;
	  rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg));
	  rtx reg_move = gen_move_insn (new_reg, prev_reg);
	  sbitmap_iterator sbi;

	  /* Each move is inserted before the previous one, forming a
	     backwards chain that ends just before u's insn.  */
	  add_insn_before (reg_move, last_reg_move);
	  last_reg_move = reg_move;

	  if (!SCHED_FIRST_REG_MOVE (u))
	    SCHED_FIRST_REG_MOVE (u) = reg_move;

	  /* Redirect the uses assigned to this copy, recording each
	     replacement (insn, old reg, new reg) for possible undo.  */
	  EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use, sbi)
	    {
	      struct undo_replace_buff_elem *rep;

	      rep = (struct undo_replace_buff_elem *)
		    xcalloc (1, sizeof (struct undo_replace_buff_elem));
	      rep->insn = g->nodes[i_use].insn;
	      rep->orig_reg = old_reg;
	      rep->new_reg = new_reg;

	      if (! reg_move_replaces)
		reg_move_replaces = rep;
	      else
		{
		  rep->next = reg_move_replaces;
		  reg_move_replaces = rep;
		}

	      replace_rtx (g->nodes[i_use].insn, old_reg, new_reg);
	    }

	  prev_reg = new_reg;
	}
      sbitmap_vector_free (uses_of_defs);
    }
  return reg_move_replaces;
}
540
541 /* We call this when we want to undo the SMS schedule for a given loop.
542 One of the things that we do is to delete the register moves generated
543 for the sake of SMS; this function deletes the register move instructions
544 recorded in the undo buffer. */
545 static void
546 undo_generate_reg_moves (partial_schedule_ptr ps,
547 struct undo_replace_buff_elem *reg_move_replaces)
548 {
549 int i,j;
550
551 for (i = 0; i < ps->g->num_nodes; i++)
552 {
553 ddg_node_ptr u = &ps->g->nodes[i];
554 rtx prev;
555 rtx crr = SCHED_FIRST_REG_MOVE (u);
556
557 for (j = 0; j < SCHED_NREG_MOVES (u); j++)
558 {
559 prev = PREV_INSN (crr);
560 delete_insn (crr);
561 crr = prev;
562 }
563 SCHED_FIRST_REG_MOVE (u) = NULL_RTX;
564 }
565
566 while (reg_move_replaces)
567 {
568 struct undo_replace_buff_elem *rep = reg_move_replaces;
569
570 reg_move_replaces = reg_move_replaces->next;
571 replace_rtx (rep->insn, rep->new_reg, rep->orig_reg);
572 }
573 }
574
575 /* Free memory allocated for the undo buffer. */
576 static void
577 free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces)
578 {
579
580 while (reg_move_replaces)
581 {
582 struct undo_replace_buff_elem *rep = reg_move_replaces;
583
584 reg_move_replaces = reg_move_replaces->next;
585 free (rep);
586 }
587 }
588
589 /* Bump the SCHED_TIMEs of all nodes to start from zero. Set the values
590 of SCHED_ROW and SCHED_STAGE. */
591 static void
592 normalize_sched_times (partial_schedule_ptr ps)
593 {
594 int i;
595 ddg_ptr g = ps->g;
596 int amount = PS_MIN_CYCLE (ps);
597 int ii = ps->ii;
598
599 /* Don't include the closing branch assuming that it is the last node. */
600 for (i = 0; i < g->num_nodes - 1; i++)
601 {
602 ddg_node_ptr u = &g->nodes[i];
603 int normalized_time = SCHED_TIME (u) - amount;
604
605 gcc_assert (normalized_time >= 0);
606
607 SCHED_TIME (u) = normalized_time;
608 SCHED_ROW (u) = normalized_time % ii;
609 SCHED_STAGE (u) = normalized_time / ii;
610 }
611 }
612
613 /* Set SCHED_COLUMN of each node according to its position in PS. */
614 static void
615 set_columns_for_ps (partial_schedule_ptr ps)
616 {
617 int row;
618
619 for (row = 0; row < ps->ii; row++)
620 {
621 ps_insn_ptr cur_insn = ps->rows[row];
622 int column = 0;
623
624 for (; cur_insn; cur_insn = cur_insn->next_in_row)
625 SCHED_COLUMN (cur_insn->node) = column++;
626 }
627 }
628
629 /* Permute the insns according to their order in PS, from row 0 to
630 row ii-1, and position them right before LAST. This schedules
631 the insns of the loop kernel. */
632 static void
633 permute_partial_schedule (partial_schedule_ptr ps, rtx last)
634 {
635 int ii = ps->ii;
636 int row;
637 ps_insn_ptr ps_ij;
638
639 for (row = 0; row < ii ; row++)
640 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
641 if (PREV_INSN (last) != ps_ij->node->insn)
642 reorder_insns_nobb (ps_ij->node->first_note, ps_ij->node->insn,
643 PREV_INSN (last));
644 }
645
646 /* As part of undoing SMS we return to the original ordering of the
647 instructions inside the loop kernel. Given the partial schedule PS, this
648 function returns the ordering of the instruction according to their CUID
649 in the DDG (PS->G), which is the original order of the instruction before
650 performing SMS. */
static void
undo_permute_partial_schedule (partial_schedule_ptr ps, rtx last)
{
  int i;

  /* Reinsert the insns in increasing CUID order, each just before LAST;
     stop as soon as LAST itself (or its first note) is reached, since
     everything from there onwards is already positioned.  */
  for (i = 0 ; i < ps->g->num_nodes; i++)
    if (last == ps->g->nodes[i].insn
	|| last == ps->g->nodes[i].first_note)
      break;
    else if (PREV_INSN (last) != ps->g->nodes[i].insn)
      reorder_insns_nobb (ps->g->nodes[i].first_note, ps->g->nodes[i].insn,
			  PREV_INSN (last));
}
664
665 /* Used to generate the prologue & epilogue. Duplicate the subset of
666 nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
667 of both), together with a prefix/suffix of their reg_moves. */
/* Emit (into the current sequence) copies of the insns whose stages lie
   in [FROM_STAGE, TO_STAGE], each preceded (prolog) or followed (epilog)
   by the appropriate subset of its reg_moves.  */
static void
duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
			   int to_stage, int for_prolog)
{
  int row;
  ps_insn_ptr ps_ij;

  for (row = 0; row < ps->ii; row++)
    for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
      {
	ddg_node_ptr u_node = ps_ij->node;
	int j, i_reg_moves;
	rtx reg_move = NULL_RTX;

	if (for_prolog)
	  {
	    /* SCHED_STAGE (u_node) >= from_stage == 0.  Generate increasing
	       number of reg_moves starting with the second occurrence of
	       u_node, which is generated if its SCHED_STAGE <= to_stage.  */
	    i_reg_moves = to_stage - SCHED_STAGE (u_node) + 1;
	    i_reg_moves = MAX (i_reg_moves, 0);
	    i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));

	    /* The reg_moves start from the *first* reg_move backwards.  */
	    if (i_reg_moves)
	      {
		reg_move = SCHED_FIRST_REG_MOVE (u_node);
		for (j = 1; j < i_reg_moves; j++)
		  reg_move = PREV_INSN (reg_move);
	      }
	  }
	else /* It's for the epilog.  */
	  {
	    /* SCHED_STAGE (u_node) <= to_stage.  Generate all reg_moves,
	       starting to decrease one stage after u_node no longer occurs;
	       that is, generate all reg_moves until
	       SCHED_STAGE (u_node) == from_stage - 1.  */
	    i_reg_moves = SCHED_NREG_MOVES (u_node)
			  - (from_stage - SCHED_STAGE (u_node) - 1);
	    i_reg_moves = MAX (i_reg_moves, 0);
	    i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));

	    /* The reg_moves start from the *last* reg_move forwards.  */
	    if (i_reg_moves)
	      {
		reg_move = SCHED_FIRST_REG_MOVE (u_node);
		for (j = 1; j < SCHED_NREG_MOVES (u_node); j++)
		  reg_move = PREV_INSN (reg_move);
	      }
	  }

	/* Emit copies of the selected reg_moves, walking forwards from
	   the earliest one located above.  */
	for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move))
	  emit_insn (copy_rtx (PATTERN (reg_move)));
	if (SCHED_STAGE (u_node) >= from_stage
	    && SCHED_STAGE (u_node) <= to_stage)
	  duplicate_insn_chain (u_node->first_note, u_node->insn);
      }
}
726
727
728 /* Generate the instructions (including reg_moves) for prolog & epilog. */
/* Generate the instructions (including reg_moves) for prolog & epilog,
   placing them on the loop-entry and loop-exit edges respectively.
   COUNT_REG, when non-null, is the doloop counter to pre-decrement by
   the stage count.  */
static void
generate_prolog_epilog (partial_schedule_ptr ps, struct loop * loop, rtx count_reg)
{
  int i;
  int last_stage = PS_STAGE_COUNT (ps) - 1;
  edge e;

  /* Generate the prolog, inserting its insns on the loop-entry edge.  */
  start_sequence ();

  if (count_reg)
   /* Generate a subtract instruction at the beginning of the prolog to
      adjust the loop count by STAGE_COUNT.  */
   emit_insn (gen_sub2_insn (count_reg, GEN_INT (last_stage)));

  /* Prolog copy i duplicates stages 0..i of the kernel.  */
  for (i = 0; i < last_stage; i++)
    duplicate_insns_of_cycles (ps, 0, i, 1);

  /* Put the prolog , on the one and only entry edge.  */
  e = loop_preheader_edge (loop);
  loop_split_edge_with(e , get_insns());

  end_sequence ();

  /* Generate the epilog, inserting its insns on the loop-exit edge.  */
  start_sequence ();

  /* Epilog copy i duplicates stages i+1..last_stage of the kernel.  */
  for (i = 0; i < last_stage; i++)
    duplicate_insns_of_cycles (ps, i + 1, last_stage, 0);

  /* Put the epilogue on the one and only one exit edge.  */
  gcc_assert (loop->single_exit);
  e = loop->single_exit;
  loop_split_edge_with(e , get_insns());
  end_sequence ();
}
765
766 /* Return the line note insn preceding INSN, for debugging. Taken from
767 emit-rtl.c. */
768 static rtx
769 find_line_note (rtx insn)
770 {
771 for (; insn; insn = PREV_INSN (insn))
772 if (NOTE_P (insn)
773 && NOTE_LINE_NUMBER (insn) >= 0)
774 break;
775
776 return insn;
777 }
778
779 /* Return true if all the BBs of the loop are empty except the
780 loop header. */
781 static bool
782 loop_single_full_bb_p (struct loop *loop)
783 {
784 unsigned i;
785 basic_block *bbs = get_loop_body (loop);
786
787 for (i = 0; i < loop->num_nodes ; i++)
788 {
789 rtx head, tail;
790 bool empty_bb = true;
791
792 if (bbs[i] == loop->header)
793 continue;
794
795 /* Make sure that basic blocks other than the header
796 have only notes labels or jumps. */
797 get_block_head_tail (bbs[i]->index, &head, &tail);
798 for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
799 {
800 if (NOTE_P (head) || LABEL_P (head)
801 || (INSN_P (head) && JUMP_P (head)))
802 continue;
803 empty_bb = false;
804 break;
805 }
806
807 if (! empty_bb)
808 {
809 free (bbs);
810 return false;
811 }
812 }
813 free (bbs);
814 return true;
815 }
816
/* A simple loop from SMS point of view; it is a loop that is composed of
   either a single basic block or two BBs - a header and a latch - where
   the latch has exactly one predecessor and one successor edge.  */
#define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3 ) 		    \
				  && (EDGE_COUNT (loop->latch->preds) == 1) \
                                  && (EDGE_COUNT (loop->latch->succs) == 1))
822
823 /* Return true if the loop is in its canonical form and false if not.
824 i.e. SIMPLE_SMS_LOOP_P and have one preheader block, and single exit. */
825 static bool
826 loop_canon_p (struct loop *loop)
827 {
828
829 if (loop->inner || ! loop->outer)
830 return false;
831
832 if (!loop->single_exit)
833 {
834 if (dump_file)
835 {
836 rtx line_note = find_line_note (BB_END (loop->header));
837
838 fprintf (dump_file, "SMS loop many exits ");
839 if (line_note)
840 {
841 expanded_location xloc;
842 NOTE_EXPANDED_LOCATION (xloc, line_note);
843 fprintf (dump_file, " %s %d (file, line)\n",
844 xloc.file, xloc.line);
845 }
846 }
847 return false;
848 }
849
850 if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
851 {
852 if (dump_file)
853 {
854 rtx line_note = find_line_note (BB_END (loop->header));
855
856 fprintf (dump_file, "SMS loop many BBs. ");
857 if (line_note)
858 {
859 expanded_location xloc;
860 NOTE_EXPANDED_LOCATION (xloc, line_note);
861 fprintf (dump_file, " %s %d (file, line)\n",
862 xloc.file, xloc.line);
863 }
864 }
865 return false;
866 }
867
868 return true;
869 }
870
871 /* If there are more than one entry for the loop,
872 make it one by splitting the first entry edge and
873 redirecting the others to the new BB. */
/* If there are more than one entry for the loop,
   make it one by splitting the first entry edge and
   redirecting the others to the new BB.  */
static void
canon_loop (struct loop *loop)
{
  edge e;
  edge_iterator i;

  /* Avoid annoying special cases of edges going to exit
     block.  */
  FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR->preds)
    if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))
      loop_split_edge_with (e, NULL_RTX);

  /* Give the loop a dedicated latch: if the header doubles as the latch
     or the latch has multiple successors, split the latch->header back
     edge.  */
  if (loop->latch == loop->header
      || EDGE_COUNT (loop->latch->succs) > 1)
    {
      FOR_EACH_EDGE (e, i, loop->header->preds)
	if (e->src == loop->latch)
	  break;
      loop_split_edge_with (e, NULL_RTX);
    }
}
895
896 /* Main entry point, perform SMS scheduling on the loops of the function
897 that consist of single basic blocks. */
898 static void
899 sms_schedule (void)
900 {
901 static int passes = 0;
902 rtx insn;
903 ddg_ptr *g_arr, g;
904 int * node_order;
905 int maxii;
906 unsigned i,num_loops;
907 partial_schedule_ptr ps;
908 struct df *df;
909 struct loops *loops;
910 basic_block bb = NULL;
911 /* vars to the versioning only if needed*/
912 struct loop * nloop;
913 basic_block condition_bb = NULL;
914 edge latch_edge;
915 gcov_type trip_count = 0;
916
917 loops = loop_optimizer_init (LOOPS_HAVE_PREHEADERS
918 | LOOPS_HAVE_MARKED_SINGLE_EXITS);
919 if (!loops)
920 return; /* There is no loops to schedule. */
921
922 /* Initialize issue_rate. */
923 if (targetm.sched.issue_rate)
924 {
925 int temp = reload_completed;
926
927 reload_completed = 1;
928 issue_rate = targetm.sched.issue_rate ();
929 reload_completed = temp;
930 }
931 else
932 issue_rate = 1;
933
934 /* Initialize the scheduler. */
935 current_sched_info = &sms_sched_info;
936 sched_init ();
937
938 /* Init Data Flow analysis, to be used in interloop dep calculation. */
939 df = df_init (DF_HARD_REGS | DF_EQUIV_NOTES | DF_SUBREGS);
940 df_rd_add_problem (df);
941 df_ru_add_problem (df);
942 df_chain_add_problem (df, DF_DU_CHAIN | DF_UD_CHAIN);
943 df_analyze (df);
944
945 /* Allocate memory to hold the DDG array one entry for each loop.
946 We use loop->num as index into this array. */
947 g_arr = XCNEWVEC (ddg_ptr, loops->num);
948
949
950 /* Build DDGs for all the relevant loops and hold them in G_ARR
951 indexed by the loop index. */
952 for (i = 0; i < loops->num; i++)
953 {
954 rtx head, tail;
955 rtx count_reg;
956 struct loop *loop = loops->parray[i];
957
958 /* For debugging. */
959 if ((passes++ > MAX_SMS_LOOP_NUMBER) && (MAX_SMS_LOOP_NUMBER != -1))
960 {
961 if (dump_file)
962 fprintf (dump_file, "SMS reached MAX_PASSES... \n");
963
964 break;
965 }
966
967 if (! loop_canon_p (loop))
968 continue;
969
970 if (! loop_single_full_bb_p (loop))
971 continue;
972
973 bb = loop->header;
974
975 get_block_head_tail (bb->index, &head, &tail);
976 latch_edge = loop_latch_edge (loop);
977 gcc_assert (loop->single_exit);
978 if (loop->single_exit->count)
979 trip_count = latch_edge->count / loop->single_exit->count;
980
981 /* Perfrom SMS only on loops that their average count is above threshold. */
982
983 if ( latch_edge->count
984 && (latch_edge->count < loop->single_exit->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD))
985 {
986 if (dump_file)
987 {
988 rtx line_note = find_line_note (tail);
989
990 if (line_note)
991 {
992 expanded_location xloc;
993 NOTE_EXPANDED_LOCATION (xloc, line_note);
994 fprintf (dump_file, "SMS bb %s %d (file, line)\n",
995 xloc.file, xloc.line);
996 }
997 fprintf (dump_file, "SMS single-bb-loop\n");
998 if (profile_info && flag_branch_probabilities)
999 {
1000 fprintf (dump_file, "SMS loop-count ");
1001 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1002 (HOST_WIDEST_INT) bb->count);
1003 fprintf (dump_file, "\n");
1004 fprintf (dump_file, "SMS trip-count ");
1005 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1006 (HOST_WIDEST_INT) trip_count);
1007 fprintf (dump_file, "\n");
1008 fprintf (dump_file, "SMS profile-sum-max ");
1009 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1010 (HOST_WIDEST_INT) profile_info->sum_max);
1011 fprintf (dump_file, "\n");
1012 }
1013 }
1014 continue;
1015 }
1016
1017 /* Make sure this is a doloop. */
1018 if ( !(count_reg = doloop_register_get (tail)))
1019 continue;
1020
1021 /* Don't handle BBs with calls or barriers, or !single_set insns. */
1022 for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
1023 if (CALL_P (insn)
1024 || BARRIER_P (insn)
1025 || (INSN_P (insn) && !JUMP_P (insn)
1026 && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
1027 break;
1028
1029 if (insn != NEXT_INSN (tail))
1030 {
1031 if (dump_file)
1032 {
1033 if (CALL_P (insn))
1034 fprintf (dump_file, "SMS loop-with-call\n");
1035 else if (BARRIER_P (insn))
1036 fprintf (dump_file, "SMS loop-with-barrier\n");
1037 else
1038 fprintf (dump_file, "SMS loop-with-not-single-set\n");
1039 print_rtl_single (dump_file, insn);
1040 }
1041
1042 continue;
1043 }
1044
1045 if (! (g = create_ddg (bb, df, 0)))
1046 {
1047 if (dump_file)
1048 fprintf (dump_file, "SMS doloop\n");
1049 continue;
1050 }
1051
1052 g_arr[i] = g;
1053 }
1054
1055 /* Release Data Flow analysis data structures. */
1056 df_finish (df);
1057 df = NULL;
1058
1059 /* We don't want to perform SMS on new loops - created by versioning. */
1060 num_loops = loops->num;
  /* Go over the built DDGs and perform SMS for each one of them.  */
1062 for (i = 0; i < num_loops; i++)
1063 {
1064 rtx head, tail;
1065 rtx count_reg, count_init;
1066 int mii, rec_mii;
1067 unsigned stage_count = 0;
1068 HOST_WIDEST_INT loop_count = 0;
1069 struct loop *loop = loops->parray[i];
1070
1071 if (! (g = g_arr[i]))
1072 continue;
1073
1074 if (dump_file)
1075 print_ddg (dump_file, g);
1076
1077 get_block_head_tail (loop->header->index, &head, &tail);
1078
1079 latch_edge = loop_latch_edge (loop);
1080 gcc_assert (loop->single_exit);
1081 if (loop->single_exit->count)
1082 trip_count = latch_edge->count / loop->single_exit->count;
1083
1084 if (dump_file)
1085 {
1086 rtx line_note = find_line_note (tail);
1087
1088 if (line_note)
1089 {
1090 expanded_location xloc;
1091 NOTE_EXPANDED_LOCATION (xloc, line_note);
1092 fprintf (dump_file, "SMS bb %s %d (file, line)\n",
1093 xloc.file, xloc.line);
1094 }
1095 fprintf (dump_file, "SMS single-bb-loop\n");
1096 if (profile_info && flag_branch_probabilities)
1097 {
1098 fprintf (dump_file, "SMS loop-count ");
1099 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1100 (HOST_WIDEST_INT) bb->count);
1101 fprintf (dump_file, "\n");
1102 fprintf (dump_file, "SMS profile-sum-max ");
1103 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1104 (HOST_WIDEST_INT) profile_info->sum_max);
1105 fprintf (dump_file, "\n");
1106 }
1107 fprintf (dump_file, "SMS doloop\n");
1108 fprintf (dump_file, "SMS built-ddg %d\n", g->num_nodes);
1109 fprintf (dump_file, "SMS num-loads %d\n", g->num_loads);
1110 fprintf (dump_file, "SMS num-stores %d\n", g->num_stores);
1111 }
1112
1113
      /* In case the loop has a doloop register, it gets special
	 handling.  */
1116 count_init = NULL_RTX;
1117 if ((count_reg = doloop_register_get (tail)))
1118 {
1119 basic_block pre_header;
1120
1121 pre_header = loop_preheader_edge (loop)->src;
1122 count_init = const_iteration_count (count_reg, pre_header,
1123 &loop_count);
1124 }
1125 gcc_assert (count_reg);
1126
1127 if (dump_file && count_init)
1128 {
1129 fprintf (dump_file, "SMS const-doloop ");
1130 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1131 loop_count);
1132 fprintf (dump_file, "\n");
1133 }
1134
1135 node_order = XNEWVEC (int, g->num_nodes);
1136
1137 mii = 1; /* Need to pass some estimate of mii. */
1138 rec_mii = sms_order_nodes (g, mii, node_order);
1139 mii = MAX (res_MII (g), rec_mii);
1140 maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100;
1141
1142 if (dump_file)
1143 fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
1144 rec_mii, mii, maxii);
1145
1146 /* After sms_order_nodes and before sms_schedule_by_order, to copy over
1147 ASAP. */
1148 set_node_sched_params (g);
1149
1150 ps = sms_schedule_by_order (g, mii, maxii, node_order);
1151
1152 if (ps)
1153 stage_count = PS_STAGE_COUNT (ps);
1154
1155 /* Stage count of 1 means that there is no interleaving between
1156 iterations, let the scheduling passes do the job. */
1157 if (stage_count < 1
1158 || (count_init && (loop_count <= stage_count))
1159 || (flag_branch_probabilities && (trip_count <= stage_count)))
1160 {
1161 if (dump_file)
1162 {
1163 fprintf (dump_file, "SMS failed... \n");
1164 fprintf (dump_file, "SMS sched-failed (stage-count=%d, loop-count=", stage_count);
1165 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
1166 fprintf (dump_file, ", trip-count=");
1167 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, trip_count);
1168 fprintf (dump_file, ")\n");
1169 }
1170 continue;
1171 }
1172 else
1173 {
1174 int orig_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
1175 int new_cycles;
1176 struct undo_replace_buff_elem *reg_move_replaces;
1177
1178 if (dump_file)
1179 {
1180 fprintf (dump_file,
1181 "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
1182 stage_count);
1183 print_partial_schedule (ps, dump_file);
1184 fprintf (dump_file,
1185 "SMS Branch (%d) will later be scheduled at cycle %d.\n",
1186 g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
1187 }
1188
1189 /* Set the stage boundaries. If the DDG is built with closing_branch_deps,
1190 the closing_branch was scheduled and should appear in the last (ii-1)
1191 row. Otherwise, we are free to schedule the branch, and we let nodes
1192 that were scheduled at the first PS_MIN_CYCLE cycle appear in the first
1193 row; this should reduce stage_count to minimum. */
1194 normalize_sched_times (ps);
1195 rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
1196 set_columns_for_ps (ps);
1197
1198 /* Generate the kernel just to be able to measure its cycles. */
1199 permute_partial_schedule (ps, g->closing_branch->first_note);
1200 reg_move_replaces = generate_reg_moves (ps);
1201
1202 /* Get the number of cycles the new kernel expect to execute in. */
1203 new_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
1204
1205 /* Get back to the original loop so we can do loop versioning. */
1206 undo_permute_partial_schedule (ps, g->closing_branch->first_note);
1207 if (reg_move_replaces)
1208 undo_generate_reg_moves (ps, reg_move_replaces);
1209
1210 if ( new_cycles >= orig_cycles)
1211 {
1212 /* SMS is not profitable so undo the permutation and reg move generation
1213 and return the kernel to its original state. */
1214 if (dump_file)
1215 fprintf (dump_file, "Undoing SMS because it is not profitable.\n");
1216
1217 }
1218 else
1219 {
1220 canon_loop (loop);
1221
1222 /* case the BCT count is not known , Do loop-versioning */
1223 if (count_reg && ! count_init)
1224 {
1225 rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
1226 GEN_INT(stage_count));
1227
1228 nloop = loop_version (loops, loop, comp_rtx, &condition_bb,
1229 true);
1230 }
1231
1232 /* Set new iteration count of loop kernel. */
1233 if (count_reg && count_init)
1234 SET_SRC (single_set (count_init)) = GEN_INT (loop_count
1235 - stage_count + 1);
1236
1237 /* Now apply the scheduled kernel to the RTL of the loop. */
1238 permute_partial_schedule (ps, g->closing_branch->first_note);
1239
1240 /* Mark this loop as software pipelined so the later
1241 scheduling passes doesn't touch it. */
1242 if (! flag_resched_modulo_sched)
1243 g->bb->flags |= BB_DISABLE_SCHEDULE;
1244 /* The life-info is not valid any more. */
1245 g->bb->flags |= BB_DIRTY;
1246
1247 reg_move_replaces = generate_reg_moves (ps);
1248 if (dump_file)
1249 print_node_sched_params (dump_file, g->num_nodes);
1250 /* Generate prolog and epilog. */
1251 if (count_reg && !count_init)
1252 generate_prolog_epilog (ps, loop, count_reg);
1253 else
1254 generate_prolog_epilog (ps, loop, NULL_RTX);
1255 }
1256 free_undo_replace_buff (reg_move_replaces);
1257 }
1258
1259 free_partial_schedule (ps);
1260 free (node_sched_params);
1261 free (node_order);
1262 free_ddg (g);
1263 }
1264
1265 free (g_arr);
1266
1267 /* Release scheduler data, needed until now because of DFA. */
1268 sched_finish ();
1269 loop_optimizer_finalize (loops);
1270 }
1271
1272 /* The SMS scheduling algorithm itself
1273 -----------------------------------
1274 Input: 'O' an ordered list of insns of a loop.
1275 Output: A scheduling of the loop - kernel, prolog, and epilogue.
1276
1277 'Q' is the empty Set
1278 'PS' is the partial schedule; it holds the currently scheduled nodes with
1279 their cycle/slot.
1280 'PSP' previously scheduled predecessors.
1281 'PSS' previously scheduled successors.
1282 't(u)' the cycle where u is scheduled.
1283 'l(u)' is the latency of u.
1284 'd(v,u)' is the dependence distance from v to u.
1285 'ASAP(u)' the earliest time at which u could be scheduled as computed in
1286 the node ordering phase.
1287 'check_hardware_resources_conflicts(u, PS, c)'
1288 run a trace around cycle/slot through DFA model
1289 to check resource conflicts involving instruction u
1290 at cycle c given the partial schedule PS.
1291 'add_to_partial_schedule_at_time(u, PS, c)'
1292 Add the node/instruction u to the partial schedule
1293 PS at time c.
1294 'calculate_register_pressure(PS)'
1295 Given a schedule of instructions, calculate the register
1296 pressure it implies. One implementation could be the
1297 maximum number of overlapping live ranges.
1298 'maxRP' The maximum allowed register pressure, it is usually derived from the number
1299 registers available in the hardware.
1300
1301 1. II = MII.
1302 2. PS = empty list
1303 3. for each node u in O in pre-computed order
1304 4. if (PSP(u) != Q && PSS(u) == Q) then
1305 5. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
1306 6. start = Early_start; end = Early_start + II - 1; step = 1
1307 11. else if (PSP(u) == Q && PSS(u) != Q) then
1308 12. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1309 13. start = Late_start; end = Late_start - II + 1; step = -1
1310 14. else if (PSP(u) != Q && PSS(u) != Q) then
1311 15. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
1312 16. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1313 17. start = Early_start;
1314 18. end = min(Early_start + II - 1 , Late_start);
1315 19. step = 1
1316 20. else "if (PSP(u) == Q && PSS(u) == Q)"
1317 21. start = ASAP(u); end = start + II - 1; step = 1
1318 22. endif
1319
1320 23. success = false
1321 24. for (c = start ; c != end ; c += step)
1322 25. if check_hardware_resources_conflicts(u, PS, c) then
1323 26. add_to_partial_schedule_at_time(u, PS, c)
1324 27. success = true
1325 28. break
1326 29. endif
1327 30. endfor
1328 31. if (success == false) then
1329 32. II = II + 1
1330 33. if (II > maxII) then
1331 34. finish - failed to schedule
1332 35. endif
1333 36. goto 2.
1334 37. endif
1335 38. endfor
1336 39. if (calculate_register_pressure(PS) > maxRP) then
1337 40. goto 32.
1338 41. endif
1339 42. compute epilogue & prologue
1340 43. finish - succeeded to schedule
1341 */
1342
1343 /* A limit on the number of cycles that resource conflicts can span. ??? Should
1344 be provided by DFA, and be dependent on the type of insn scheduled. Currently
1345 set to 0 to save compile time. */
1346 #define DFA_HISTORY SMS_DFA_HISTORY
1347
/* Given the partial schedule PS, calculate the window of cycles in which
   we can schedule the node with index I of NODES_ORDER, and return it via
   *START_P, *END_P and *STEP_P (the caller scans cycles C from *START_P
   towards *END_P, exclusive, in steps of *STEP_P).  SCHED_NODES is the set
   of nodes already placed, and II is the current initiation interval.
   Return -1 if the scheduling window is empty and zero otherwise.
   NOTE: Here we do the backtracking in SMS, in some special cases.  We
   have noticed that there are several cases in which we fail to SMS the
   loop because the sched window of a node is empty due to tight data-deps.
   In such cases the caller unschedules some of the predecessors/successors
   until it gets a non-empty scheduling window.  */

static int
get_sched_window (partial_schedule_ptr ps, int *nodes_order, int i,
		  sbitmap sched_nodes, int ii, int *start_p, int *step_p, int *end_p)
{
  int start, step, end;
  ddg_edge_ptr e;
  int u = nodes_order [i];
  ddg_node_ptr u_node = &ps->g->nodes[u];
  sbitmap psp = sbitmap_alloc (ps->g->num_nodes);
  sbitmap pss = sbitmap_alloc (ps->g->num_nodes);
  sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
  sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
  int psp_not_empty;
  int pss_not_empty;

  /* 1. compute sched window for u (start, end, step).  */
  sbitmap_zero (psp);
  sbitmap_zero (pss);
  /* PSP = already-scheduled predecessors of U, PSS = already-scheduled
     successors of U; the _cg variants also report whether the
     intersection is non-empty.  */
  psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes);
  pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes);

  if (psp_not_empty && !pss_not_empty)
    {
      /* Only predecessors are scheduled: place U as early as the
	 incoming dependences allow, scanning upwards.  */
      int early_start = INT_MIN;

      end = INT_MAX;
      for (e = u_node->in; e != 0; e = e->next_in)
	{
	  ddg_node_ptr v_node = e->src;
	  if (TEST_BIT (sched_nodes, v_node->cuid))
	    {
	      /* Earliest cycle satisfying this dependence; E->DISTANCE
		 iterations of separation buy back DISTANCE * II cycles.  */
	      int node_st = SCHED_TIME (v_node)
			    + e->latency - (e->distance * ii);

	      early_start = MAX (early_start, node_st);

	      /* A memory dependence additionally bounds the window from
		 above, within one II of the source.  */
	      if (e->data_type == MEM_DEP)
		end = MIN (end, SCHED_TIME (v_node) + ii - 1);
	    }
	}
      start = early_start;
      end = MIN (end, early_start + ii);
      step = 1;
    }

  else if (!psp_not_empty && pss_not_empty)
    {
      /* Only successors are scheduled: place U as late as the outgoing
	 dependences allow, scanning downwards (STEP = -1).  */
      int late_start = INT_MAX;

      end = INT_MIN;
      for (e = u_node->out; e != 0; e = e->next_out)
	{
	  ddg_node_ptr v_node = e->dest;
	  if (TEST_BIT (sched_nodes, v_node->cuid))
	    {
	      late_start = MIN (late_start,
				SCHED_TIME (v_node) - e->latency
				+ (e->distance * ii));
	      if (e->data_type == MEM_DEP)
		end = MAX (end, SCHED_TIME (v_node) - ii + 1);
	    }
	}
      start = late_start;
      end = MAX (end, late_start - ii);
      step = -1;
    }

  else if (psp_not_empty && pss_not_empty)
    {
      /* Both predecessors and successors are scheduled: the window is
	 bounded from both sides; scan upwards from the earliest
	 feasible cycle.  */
      int early_start = INT_MIN;
      int late_start = INT_MAX;

      start = INT_MIN;
      end = INT_MAX;
      for (e = u_node->in; e != 0; e = e->next_in)
	{
	  ddg_node_ptr v_node = e->src;

	  if (TEST_BIT (sched_nodes, v_node->cuid))
	    {
	      early_start = MAX (early_start,
				 SCHED_TIME (v_node) + e->latency
				 - (e->distance * ii));
	      if (e->data_type == MEM_DEP)
		end = MIN (end, SCHED_TIME (v_node) + ii - 1);
	    }
	}
      for (e = u_node->out; e != 0; e = e->next_out)
	{
	  ddg_node_ptr v_node = e->dest;

	  if (TEST_BIT (sched_nodes, v_node->cuid))
	    {
	      late_start = MIN (late_start,
				SCHED_TIME (v_node) - e->latency
				+ (e->distance * ii));
	      if (e->data_type == MEM_DEP)
		start = MAX (start, SCHED_TIME (v_node) - ii + 1);
	    }
	}
      start = MAX (start, early_start);
      end = MIN (end, MIN (early_start + ii, late_start + 1));
      step = 1;
    }
  else /* psp is empty && pss is empty.  */
    {
      /* No scheduled neighbor constrains U; start from its ASAP time.  */
      start = SCHED_ASAP (u_node);
      end = start + ii;
      step = 1;
    }

  *start_p = start;
  *step_p = step;
  *end_p = end;
  sbitmap_free (psp);
  sbitmap_free (pss);

  /* The window is empty when not a single step can be taken from START
     towards END in the direction of STEP.  */
  if ((start >= end && step == 1) || (start <= end && step == -1))
    return -1;
  else
    return 0;
}
1479
/* This function implements the scheduling algorithm for SMS according to
   the above algorithm (see the pseudo-code before this page).  Try to
   find a valid schedule of the DDG G for some initiation interval II,
   starting from the lower bound MII and incrementing II on failure,
   giving up when II reaches MAXII.  Nodes are attempted in the
   precomputed NODES_ORDER.  Return the resulting partial schedule, or
   NULL if no II below MAXII succeeded.  */
static partial_schedule_ptr
sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order)
{
  int ii = mii;
  int i, c, success;
  int try_again_with_larger_ii = true;
  int num_nodes = g->num_nodes;
  ddg_edge_ptr e;
  int start, end, step; /* Place together into one struct?  */
  sbitmap sched_nodes = sbitmap_alloc (num_nodes);
  sbitmap must_precede = sbitmap_alloc (num_nodes);
  sbitmap must_follow = sbitmap_alloc (num_nodes);
  sbitmap tobe_scheduled = sbitmap_alloc (num_nodes);

  partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);

  sbitmap_ones (tobe_scheduled);
  sbitmap_zero (sched_nodes);

  /* Iterate until every schedulable node is placed, restarting with a
     larger II whenever a node cannot be placed.  */
  while ((! sbitmap_equal (tobe_scheduled, sched_nodes)
	 || try_again_with_larger_ii ) && ii < maxii)
    {
      int j;
      bool unscheduled_nodes = false;

      if (dump_file)
	fprintf(dump_file, "Starting with ii=%d\n", ii);
      if (try_again_with_larger_ii)
	{
	  /* Fresh attempt: forget everything scheduled so far.  */
	  try_again_with_larger_ii = false;
	  sbitmap_zero (sched_nodes);
	}

      for (i = 0; i < num_nodes; i++)
	{
	  int u = nodes_order[i];
	  ddg_node_ptr u_node = &ps->g->nodes[u];
	  rtx insn = u_node->insn;

	  /* Notes and other non-instructions are never scheduled.  */
	  if (!INSN_P (insn))
	    {
	      RESET_BIT (tobe_scheduled, u);
	      continue;
	    }

	  if (JUMP_P (insn)) /* Closing branch handled later.  */
	    {
	      RESET_BIT (tobe_scheduled, u);
	      continue;
	    }

	  if (TEST_BIT (sched_nodes, u))
	    continue;

	  /* Try to get non-empty scheduling window.  If it is empty,
	     backtrack: unschedule previously placed neighbors of U (in
	     reverse order) until a window opens up or we run out.  */
	  j = i;
	  while (get_sched_window (ps, nodes_order, i, sched_nodes, ii, &start, &step, &end) < 0
		 && j > 0)
	    {
	      unscheduled_nodes = true;
	      if (TEST_BIT (NODE_PREDECESSORS (u_node), nodes_order[j - 1])
		  || TEST_BIT (NODE_SUCCESSORS (u_node), nodes_order[j - 1]))
		{
		  ps_unschedule_node (ps, &ps->g->nodes[nodes_order[j - 1]]);
		  RESET_BIT (sched_nodes, nodes_order [j - 1]);
		}
	      j--;
	    }
	  if (j < 0)
	    {
	      /* ??? Try backtracking instead of immediately ii++?  */
	      ii++;
	      try_again_with_larger_ii = true;
	      reset_partial_schedule (ps, ii);
	      break;
	    }
	  /* 2. Try scheduling u in window.  */
	  if (dump_file)
	    fprintf(dump_file, "Trying to schedule node %d in (%d .. %d) step %d\n",
		    u, start, end, step);

	  /* use must_follow & must_precede bitmaps to determine order
	     of nodes within the cycle.  A scheduled neighbor whose
	     dependence is tight (latency == II * distance) and sits at
	     the boundary cycle constrains U's position in its row.  */
	  sbitmap_zero (must_precede);
	  sbitmap_zero (must_follow);
	  for (e = u_node->in; e != 0; e = e->next_in)
	    if (TEST_BIT (sched_nodes, e->src->cuid)
		&& e->latency == (ii * e->distance)
		&& start == SCHED_TIME (e->src))
	      SET_BIT (must_precede, e->src->cuid);

	  for (e = u_node->out; e != 0; e = e->next_out)
	    if (TEST_BIT (sched_nodes, e->dest->cuid)
		&& e->latency == (ii * e->distance)
		&& end == SCHED_TIME (e->dest))
	      SET_BIT (must_follow, e->dest->cuid);

	  /* Probe each cycle of the window until the DFA accepts U.  */
	  success = 0;
	  if ((step > 0 && start < end) || (step < 0 && start > end))
	    for (c = start; c != end; c += step)
	      {
		ps_insn_ptr psi;

		psi = ps_add_node_check_conflicts (ps, u_node, c,
						   must_precede,
						   must_follow);

		if (psi)
		  {
		    SCHED_TIME (u_node) = c;
		    SET_BIT (sched_nodes, u);
		    success = 1;
		    if (dump_file)
		      fprintf(dump_file, "Schedule in %d\n", c);
		    break;
		  }
	      }
	  if (!success)
	    {
	      /* ??? Try backtracking instead of immediately ii++?  */
	      ii++;
	      try_again_with_larger_ii = true;
	      reset_partial_schedule (ps, ii);
	      break;
	    }
	  /* Backtracking unscheduled some nodes; restart the ordered
	     walk so they get rescheduled.  */
	  if (unscheduled_nodes)
	    break;

	  /* ??? If (success), check register pressure estimates.  */
	}  /* Continue with next node.  */
    }  /* While try_again_with_larger_ii.  */

  sbitmap_free (sched_nodes);
  sbitmap_free (must_precede);
  sbitmap_free (must_follow);
  sbitmap_free (tobe_scheduled);

  /* Hitting MAXII means failure; release the schedule and report NULL.  */
  if (ii >= maxii)
    {
      free_partial_schedule (ps);
      ps = NULL;
    }
  return ps;
}
1626
1627 \f
1628 /* This page implements the algorithm for ordering the nodes of a DDG
1629 for modulo scheduling, activated through the
1630 "int sms_order_nodes (ddg_ptr, int mii, int * result)" API. */
1631
/* Access the ordering parameters stashed in a DDG node's AUX info field
   while the node-ordering phase runs (see calculate_order_params).  */
#define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
/* Earliest start cycle of node X: longest latency path from a source of
   the dependence DAG (backarcs with distance > 0 ignored).  */
#define ASAP(x) (ORDER_PARAMS ((x))->asap)
/* Latest start cycle of node X that does not stretch the critical path.  */
#define ALAP(x) (ORDER_PARAMS ((x))->alap)
/* Longest latency path from node X down to a sink of the DAG.  */
#define HEIGHT(x) (ORDER_PARAMS ((x))->height)
/* Mobility of node X: slack between its latest and earliest start.  */
#define MOB(x) (ALAP ((x)) - ASAP ((x)))
/* Depth of node X; defined to be its ASAP time.  */
#define DEPTH(x) (ASAP ((x)))

/* Shorthand for an array of per-node ordering parameters.  */
typedef struct node_order_params * nopa;

static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
static nopa calculate_order_params (ddg_ptr, int mii);
static int find_max_asap (ddg_ptr, sbitmap);
static int find_max_hv_min_mob (ddg_ptr, sbitmap);
static int find_max_dv_min_mob (ddg_ptr, sbitmap);

/* Direction in which the ordering worklist is currently swept.  */
enum sms_direction {BOTTOMUP, TOPDOWN};

/* Per-node parameters driving the SMS node-ordering heuristics.  */
struct node_order_params
{
  int asap;    /* Earliest possible start cycle.  */
  int alap;    /* Latest possible start cycle.  */
  int height;  /* Longest latency path to a DAG sink.  */
};
1656
1657 /* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1. */
1658 static void
1659 check_nodes_order (int *node_order, int num_nodes)
1660 {
1661 int i;
1662 sbitmap tmp = sbitmap_alloc (num_nodes);
1663
1664 sbitmap_zero (tmp);
1665
1666 for (i = 0; i < num_nodes; i++)
1667 {
1668 int u = node_order[i];
1669
1670 gcc_assert (u < num_nodes && u >= 0 && !TEST_BIT (tmp, u));
1671
1672 SET_BIT (tmp, u);
1673 }
1674
1675 sbitmap_free (tmp);
1676 }
1677
1678 /* Order the nodes of G for scheduling and pass the result in
1679 NODE_ORDER. Also set aux.count of each node to ASAP.
1680 Return the recMII for the given DDG. */
1681 static int
1682 sms_order_nodes (ddg_ptr g, int mii, int * node_order)
1683 {
1684 int i;
1685 int rec_mii = 0;
1686 ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);
1687
1688 nopa nops = calculate_order_params (g, mii);
1689
1690 order_nodes_of_sccs (sccs, node_order);
1691
1692 if (sccs->num_sccs > 0)
1693 /* First SCC has the largest recurrence_length. */
1694 rec_mii = sccs->sccs[0]->recurrence_length;
1695
1696 /* Save ASAP before destroying node_order_params. */
1697 for (i = 0; i < g->num_nodes; i++)
1698 {
1699 ddg_node_ptr v = &g->nodes[i];
1700 v->aux.count = ASAP (v);
1701 }
1702
1703 free (nops);
1704 free_ddg_all_sccs (sccs);
1705 check_nodes_order (node_order, g->num_nodes);
1706
1707 return rec_mii;
1708 }
1709
1710 static void
1711 order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
1712 {
1713 int i, pos = 0;
1714 ddg_ptr g = all_sccs->ddg;
1715 int num_nodes = g->num_nodes;
1716 sbitmap prev_sccs = sbitmap_alloc (num_nodes);
1717 sbitmap on_path = sbitmap_alloc (num_nodes);
1718 sbitmap tmp = sbitmap_alloc (num_nodes);
1719 sbitmap ones = sbitmap_alloc (num_nodes);
1720
1721 sbitmap_zero (prev_sccs);
1722 sbitmap_ones (ones);
1723
1724 /* Perfrom the node ordering starting from the SCC with the highest recMII.
1725 For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc. */
1726 for (i = 0; i < all_sccs->num_sccs; i++)
1727 {
1728 ddg_scc_ptr scc = all_sccs->sccs[i];
1729
1730 /* Add nodes on paths from previous SCCs to the current SCC. */
1731 find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
1732 sbitmap_a_or_b (tmp, scc->nodes, on_path);
1733
1734 /* Add nodes on paths from the current SCC to previous SCCs. */
1735 find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
1736 sbitmap_a_or_b (tmp, tmp, on_path);
1737
1738 /* Remove nodes of previous SCCs from current extended SCC. */
1739 sbitmap_difference (tmp, tmp, prev_sccs);
1740
1741 pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1742 /* Above call to order_nodes_in_scc updated prev_sccs |= tmp. */
1743 }
1744
1745 /* Handle the remaining nodes that do not belong to any scc. Each call
1746 to order_nodes_in_scc handles a single connected component. */
1747 while (pos < g->num_nodes)
1748 {
1749 sbitmap_difference (tmp, ones, prev_sccs);
1750 pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1751 }
1752 sbitmap_free (prev_sccs);
1753 sbitmap_free (on_path);
1754 sbitmap_free (tmp);
1755 sbitmap_free (ones);
1756 }
1757
1758 /* MII is needed if we consider backarcs (that do not close recursive cycles). */
1759 static struct node_order_params *
1760 calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED)
1761 {
1762 int u;
1763 int max_asap;
1764 int num_nodes = g->num_nodes;
1765 ddg_edge_ptr e;
1766 /* Allocate a place to hold ordering params for each node in the DDG. */
1767 nopa node_order_params_arr;
1768
1769 /* Initialize of ASAP/ALAP/HEIGHT to zero. */
1770 node_order_params_arr = (nopa) xcalloc (num_nodes,
1771 sizeof (struct node_order_params));
1772
1773 /* Set the aux pointer of each node to point to its order_params structure. */
1774 for (u = 0; u < num_nodes; u++)
1775 g->nodes[u].aux.info = &node_order_params_arr[u];
1776
1777 /* Disregarding a backarc from each recursive cycle to obtain a DAG,
1778 calculate ASAP, ALAP, mobility, distance, and height for each node
1779 in the dependence (direct acyclic) graph. */
1780
1781 /* We assume that the nodes in the array are in topological order. */
1782
1783 max_asap = 0;
1784 for (u = 0; u < num_nodes; u++)
1785 {
1786 ddg_node_ptr u_node = &g->nodes[u];
1787
1788 ASAP (u_node) = 0;
1789 for (e = u_node->in; e; e = e->next_in)
1790 if (e->distance == 0)
1791 ASAP (u_node) = MAX (ASAP (u_node),
1792 ASAP (e->src) + e->latency);
1793 max_asap = MAX (max_asap, ASAP (u_node));
1794 }
1795
1796 for (u = num_nodes - 1; u > -1; u--)
1797 {
1798 ddg_node_ptr u_node = &g->nodes[u];
1799
1800 ALAP (u_node) = max_asap;
1801 HEIGHT (u_node) = 0;
1802 for (e = u_node->out; e; e = e->next_out)
1803 if (e->distance == 0)
1804 {
1805 ALAP (u_node) = MIN (ALAP (u_node),
1806 ALAP (e->dest) - e->latency);
1807 HEIGHT (u_node) = MAX (HEIGHT (u_node),
1808 HEIGHT (e->dest) + e->latency);
1809 }
1810 }
1811
1812 return node_order_params_arr;
1813 }
1814
1815 static int
1816 find_max_asap (ddg_ptr g, sbitmap nodes)
1817 {
1818 unsigned int u = 0;
1819 int max_asap = -1;
1820 int result = -1;
1821 sbitmap_iterator sbi;
1822
1823 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1824 {
1825 ddg_node_ptr u_node = &g->nodes[u];
1826
1827 if (max_asap < ASAP (u_node))
1828 {
1829 max_asap = ASAP (u_node);
1830 result = u;
1831 }
1832 }
1833 return result;
1834 }
1835
1836 static int
1837 find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
1838 {
1839 unsigned int u = 0;
1840 int max_hv = -1;
1841 int min_mob = INT_MAX;
1842 int result = -1;
1843 sbitmap_iterator sbi;
1844
1845 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1846 {
1847 ddg_node_ptr u_node = &g->nodes[u];
1848
1849 if (max_hv < HEIGHT (u_node))
1850 {
1851 max_hv = HEIGHT (u_node);
1852 min_mob = MOB (u_node);
1853 result = u;
1854 }
1855 else if ((max_hv == HEIGHT (u_node))
1856 && (min_mob > MOB (u_node)))
1857 {
1858 min_mob = MOB (u_node);
1859 result = u;
1860 }
1861 }
1862 return result;
1863 }
1864
1865 static int
1866 find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
1867 {
1868 unsigned int u = 0;
1869 int max_dv = -1;
1870 int min_mob = INT_MAX;
1871 int result = -1;
1872 sbitmap_iterator sbi;
1873
1874 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1875 {
1876 ddg_node_ptr u_node = &g->nodes[u];
1877
1878 if (max_dv < DEPTH (u_node))
1879 {
1880 max_dv = DEPTH (u_node);
1881 min_mob = MOB (u_node);
1882 result = u;
1883 }
1884 else if ((max_dv == DEPTH (u_node))
1885 && (min_mob > MOB (u_node)))
1886 {
1887 min_mob = MOB (u_node);
1888 result = u;
1889 }
1890 }
1891 return result;
1892 }
1893
/* Places the nodes of SCC into the NODE_ORDER array starting
   at position POS, according to the SMS ordering algorithm.
   NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
   the NODE_ORDER array, starting from position zero.  Returns the new
   position POS.  */
static int
order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
		    int * node_order, int pos)
{
  enum sms_direction dir;
  int num_nodes = g->num_nodes;
  sbitmap workset = sbitmap_alloc (num_nodes);
  sbitmap tmp = sbitmap_alloc (num_nodes);
  sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
  sbitmap predecessors = sbitmap_alloc (num_nodes);
  sbitmap successors = sbitmap_alloc (num_nodes);

  sbitmap_zero (predecessors);
  find_predecessors (predecessors, g, nodes_ordered);

  sbitmap_zero (successors);
  find_successors (successors, g, nodes_ordered);

  /* Seed the workset: prefer SCC nodes that are predecessors of the
     already ordered nodes (sweep bottom-up), then SCC nodes that are
     their successors (sweep top-down); failing both, start from the
     node of SCC with maximal ASAP.  */
  sbitmap_zero (tmp);
  if (sbitmap_a_and_b_cg (tmp, predecessors, scc))
    {
      sbitmap_copy (workset, tmp);
      dir = BOTTOMUP;
    }
  else if (sbitmap_a_and_b_cg (tmp, successors, scc))
    {
      sbitmap_copy (workset, tmp);
      dir = TOPDOWN;
    }
  else
    {
      int u;

      sbitmap_zero (workset);
      if ((u = find_max_asap (g, scc)) >= 0)
	SET_BIT (workset, u);
      dir = BOTTOMUP;
    }

  /* Alternate between top-down sweeps (pick by maximal height, min
     mobility) and bottom-up sweeps (pick by maximal depth, min
     mobility) until the whole SCC has been placed.  */
  sbitmap_zero (zero_bitmap);
  while (!sbitmap_equal (workset, zero_bitmap))
    {
      int v;
      ddg_node_ptr v_node;
      sbitmap v_node_preds;
      sbitmap v_node_succs;

      if (dir == TOPDOWN)
	{
	  /* Drain the workset, expanding it with each picked node's
	     successors that lie inside SCC.  */
	  while (!sbitmap_equal (workset, zero_bitmap))
	    {
	      v = find_max_hv_min_mob (g, workset);
	      v_node = &g->nodes[v];
	      node_order[pos++] = v;
	      v_node_succs = NODE_SUCCESSORS (v_node);
	      sbitmap_a_and_b (tmp, v_node_succs, scc);

	      /* Don't consider the already ordered successors again.  */
	      sbitmap_difference (tmp, tmp, nodes_ordered);
	      sbitmap_a_or_b (workset, workset, tmp);
	      RESET_BIT (workset, v);
	      SET_BIT (nodes_ordered, v);
	    }
	  /* Switch direction: continue bottom-up from the predecessors
	     of everything ordered so far.  */
	  dir = BOTTOMUP;
	  sbitmap_zero (predecessors);
	  find_predecessors (predecessors, g, nodes_ordered);
	  sbitmap_a_and_b (workset, predecessors, scc);
	}
      else
	{
	  /* Bottom-up sweep: symmetric to the above, expanding with
	     each picked node's predecessors inside SCC.  */
	  while (!sbitmap_equal (workset, zero_bitmap))
	    {
	      v = find_max_dv_min_mob (g, workset);
	      v_node = &g->nodes[v];
	      node_order[pos++] = v;
	      v_node_preds = NODE_PREDECESSORS (v_node);
	      sbitmap_a_and_b (tmp, v_node_preds, scc);

	      /* Don't consider the already ordered predecessors again.  */
	      sbitmap_difference (tmp, tmp, nodes_ordered);
	      sbitmap_a_or_b (workset, workset, tmp);
	      RESET_BIT (workset, v);
	      SET_BIT (nodes_ordered, v);
	    }
	  dir = TOPDOWN;
	  sbitmap_zero (successors);
	  find_successors (successors, g, nodes_ordered);
	  sbitmap_a_and_b (workset, successors, scc);
	}
    }
  sbitmap_free (tmp);
  sbitmap_free (workset);
  sbitmap_free (zero_bitmap);
  sbitmap_free (predecessors);
  sbitmap_free (successors);
  return pos;
}
1995
1996 \f
1997 /* This page contains functions for manipulating partial-schedules during
1998 modulo scheduling. */
1999
2000 /* Create a partial schedule and allocate a memory to hold II rows. */
2001
2002 static partial_schedule_ptr
2003 create_partial_schedule (int ii, ddg_ptr g, int history)
2004 {
2005 partial_schedule_ptr ps = XNEW (struct partial_schedule);
2006 ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
2007 ps->ii = ii;
2008 ps->history = history;
2009 ps->min_cycle = INT_MAX;
2010 ps->max_cycle = INT_MIN;
2011 ps->g = g;
2012
2013 return ps;
2014 }
2015
2016 /* Free the PS_INSNs in rows array of the given partial schedule.
2017 ??? Consider caching the PS_INSN's. */
2018 static void
2019 free_ps_insns (partial_schedule_ptr ps)
2020 {
2021 int i;
2022
2023 for (i = 0; i < ps->ii; i++)
2024 {
2025 while (ps->rows[i])
2026 {
2027 ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;
2028
2029 free (ps->rows[i]);
2030 ps->rows[i] = ps_insn;
2031 }
2032 ps->rows[i] = NULL;
2033 }
2034 }
2035
2036 /* Free all the memory allocated to the partial schedule. */
2037
2038 static void
2039 free_partial_schedule (partial_schedule_ptr ps)
2040 {
2041 if (!ps)
2042 return;
2043 free_ps_insns (ps);
2044 free (ps->rows);
2045 free (ps);
2046 }
2047
2048 /* Clear the rows array with its PS_INSNs, and create a new one with
2049 NEW_II rows. */
2050
2051 static void
2052 reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
2053 {
2054 if (!ps)
2055 return;
2056 free_ps_insns (ps);
2057 if (new_ii == ps->ii)
2058 return;
2059 ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
2060 * sizeof (ps_insn_ptr));
2061 memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
2062 ps->ii = new_ii;
2063 ps->min_cycle = INT_MAX;
2064 ps->max_cycle = INT_MIN;
2065 }
2066
2067 /* Prints the partial schedule as an ii rows array, for each rows
2068 print the ids of the insns in it. */
2069 void
2070 print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
2071 {
2072 int i;
2073
2074 for (i = 0; i < ps->ii; i++)
2075 {
2076 ps_insn_ptr ps_i = ps->rows[i];
2077
2078 fprintf (dump, "\n[CYCLE %d ]: ", i);
2079 while (ps_i)
2080 {
2081 fprintf (dump, "%d, ",
2082 INSN_UID (ps_i->node->insn));
2083 ps_i = ps_i->next_in_row;
2084 }
2085 }
2086 }
2087
2088 /* Creates an object of PS_INSN and initializes it to the given parameters. */
2089 static ps_insn_ptr
2090 create_ps_insn (ddg_node_ptr node, int rest_count, int cycle)
2091 {
2092 ps_insn_ptr ps_i = XNEW (struct ps_insn);
2093
2094 ps_i->node = node;
2095 ps_i->next_in_row = NULL;
2096 ps_i->prev_in_row = NULL;
2097 ps_i->row_rest_count = rest_count;
2098 ps_i->cycle = cycle;
2099
2100 return ps_i;
2101 }
2102
2103
2104 /* Removes the given PS_INSN from the partial schedule. Returns false if the
2105 node is not found in the partial schedule, else returns true. */
2106 static bool
2107 remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
2108 {
2109 int row;
2110
2111 if (!ps || !ps_i)
2112 return false;
2113
2114 row = SMODULO (ps_i->cycle, ps->ii);
2115 if (! ps_i->prev_in_row)
2116 {
2117 if (ps_i != ps->rows[row])
2118 return false;
2119
2120 ps->rows[row] = ps_i->next_in_row;
2121 if (ps->rows[row])
2122 ps->rows[row]->prev_in_row = NULL;
2123 }
2124 else
2125 {
2126 ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
2127 if (ps_i->next_in_row)
2128 ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
2129 }
2130 free (ps_i);
2131 return true;
2132 }
2133
/* Unlike what literature describes for modulo scheduling (which focuses
   on VLIW machines) the order of the instructions inside a cycle is
   important.  Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
   where the current instruction should go relative to the already
   scheduled instructions in the given cycle.  Go over these
   instructions and find the first possible column to put it in.
   Returns true if a valid column was found and PS_I was linked into
   its row; false if the ordering constraints cannot be satisfied.  */
static bool
ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
		     sbitmap must_precede, sbitmap must_follow)
{
  ps_insn_ptr next_ps_i;
  ps_insn_ptr first_must_follow = NULL;
  ps_insn_ptr last_must_precede = NULL;
  int row;

  if (! ps_i)
    return false;

  row = SMODULO (ps_i->cycle, ps->ii);

  /* Find the first must-follow and the last must-precede node in the
     row, so the new node can go immediately after the last must-precede
     — but fail if a must-follow node sits before a must-precede one,
     since then no column can satisfy both constraints.  */
  for (next_ps_i = ps->rows[row];
       next_ps_i;
       next_ps_i = next_ps_i->next_in_row)
    {
      if (TEST_BIT (must_follow, next_ps_i->node->cuid)
	  && ! first_must_follow)
        first_must_follow = next_ps_i;
      if (TEST_BIT (must_precede, next_ps_i->node->cuid))
	{
          /* If we have already met a node that must follow, then
	     there is no possible column.  */
  	  if (first_must_follow)
            return false;
	  else
            last_must_precede = next_ps_i;
        }
    }

  /* Now insert the node after LAST_MUST_PRECEDE, or at the head of the
     row if there is no must-precede node.  */

  if (! last_must_precede)
    {
      /* Push PS_I at the front of the row's doubly linked list.  */
      ps_i->next_in_row = ps->rows[row];
      ps_i->prev_in_row = NULL;
      if (ps_i->next_in_row)
	ps_i->next_in_row->prev_in_row = ps_i;
      ps->rows[row] = ps_i;
    }
  else
    {
      /* Splice PS_I between LAST_MUST_PRECEDE and its successor.  */
      ps_i->next_in_row = last_must_precede->next_in_row;
      last_must_precede->next_in_row = ps_i;
      ps_i->prev_in_row = last_must_precede;
      if (ps_i->next_in_row)
	ps_i->next_in_row->prev_in_row = ps_i;
    }

  return true;
}
2196
/* Advances the PS_INSN one column in its current row; returns false
   in failure and true in success.  Bit N is set in MUST_FOLLOW if
   the node with cuid N must come after the node pointed to by
   PS_I when scheduled in the same cycle.  */
static int
ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
			sbitmap must_follow)
{
  ps_insn_ptr prev, next;
  int row;
  ddg_node_ptr next_node;

  if (!ps || !ps_i)
    return false;

  row = SMODULO (ps_i->cycle, ps->ii);

  /* Nothing to skip over: PS_I is already last in its row.  */
  if (! ps_i->next_in_row)
    return false;

  next_node = ps_i->next_in_row->node;

  /* Check if next_in_row is dependent on ps_i, both having same sched
     times (typically ANTI_DEP).  If so, ps_i cannot skip over it.  */
  if (TEST_BIT (must_follow, next_node->cuid))
    return false;

  /* Advance PS_I over its next_in_row in the doubly linked list:
     the result is ... prev, next, ps_i, (next->next_in_row) ...  */
  prev = ps_i->prev_in_row;
  next = ps_i->next_in_row;

  /* If PS_I was the row head, NEXT becomes the new head.  */
  if (ps_i == ps->rows[row])
    ps->rows[row] = next;

  ps_i->next_in_row = next->next_in_row;

  if (next->next_in_row)
    next->next_in_row->prev_in_row = ps_i;

  next->next_in_row = ps_i;
  ps_i->prev_in_row = next;

  next->prev_in_row = prev;
  if (prev)
    prev->next_in_row = next;

  return true;
}
2245
2246 /* Inserts a DDG_NODE to the given partial schedule at the given cycle.
2247 Returns 0 if this is not possible and a PS_INSN otherwise. Bit N is
2248 set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must be come
2249 before/after (respectively) the node pointed to by PS_I when scheduled
2250 in the same cycle. */
2251 static ps_insn_ptr
2252 add_node_to_ps (partial_schedule_ptr ps, ddg_node_ptr node, int cycle,
2253 sbitmap must_precede, sbitmap must_follow)
2254 {
2255 ps_insn_ptr ps_i;
2256 int rest_count = 1;
2257 int row = SMODULO (cycle, ps->ii);
2258
2259 if (ps->rows[row]
2260 && ps->rows[row]->row_rest_count >= issue_rate)
2261 return NULL;
2262
2263 if (ps->rows[row])
2264 rest_count += ps->rows[row]->row_rest_count;
2265
2266 ps_i = create_ps_insn (node, rest_count, cycle);
2267
2268 /* Finds and inserts PS_I according to MUST_FOLLOW and
2269 MUST_PRECEDE. */
2270 if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
2271 {
2272 free (ps_i);
2273 return NULL;
2274 }
2275
2276 return ps_i;
2277 }
2278
2279 /* Advance time one cycle. Assumes DFA is being used. */
2280 static void
2281 advance_one_cycle (void)
2282 {
2283 if (targetm.sched.dfa_pre_cycle_insn)
2284 state_transition (curr_state,
2285 targetm.sched.dfa_pre_cycle_insn ());
2286
2287 state_transition (curr_state, NULL);
2288
2289 if (targetm.sched.dfa_post_cycle_insn)
2290 state_transition (curr_state,
2291 targetm.sched.dfa_post_cycle_insn ());
2292 }
2293
/* Given the kernel of a loop (from FIRST_INSN up to but not including
   LAST_INSN), finds the number of cycles according to DFA that the
   kernel fits in; we use this to check if we did well with SMS after
   we add register moves.  In some cases register moves overhead makes
   it even worse than the original loop.  We want SMS to be performed
   when it gives fewer cycles after register moves are added.  */
static int
kernel_number_of_cycles (rtx first_insn, rtx last_insn)
{
  int cycles = 0;
  rtx insn;
  int can_issue_more = issue_rate;

  state_reset (curr_state);

  /* NOTE: the loop is half-open — LAST_INSN itself is not fed to the
     DFA.  */
  for (insn = first_insn;
       insn != NULL_RTX && insn != last_insn;
       insn = NEXT_INSN (insn))
    {
      if (! INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE)
	continue;

      /* Check if there is room for the current insn.  */
      if (!can_issue_more || state_dead_lock_p (curr_state))
	{
	  cycles ++;
	  advance_one_cycle ();
	  can_issue_more = issue_rate;
	}

	/* If the DFA reports a resource conflict for this insn, count
	   a new cycle and retire the current one.  */
      if (state_transition (curr_state, insn) >= 0)
	{
	  cycles ++;
	  advance_one_cycle ();
	  can_issue_more = issue_rate;
	}

      if (targetm.sched.variable_issue)
	can_issue_more =
	  targetm.sched.variable_issue (sched_dump, sched_verbose,
					insn, can_issue_more);
      /* A naked CLOBBER or USE generates no instruction, so don't
	 let them consume issue slots.  */
      else if (GET_CODE (PATTERN (insn)) != USE
	       && GET_CODE (PATTERN (insn)) != CLOBBER)
	can_issue_more--;
    }
  return cycles;
}
2345
2346 /* Checks if PS has resource conflicts according to DFA, starting from
2347 FROM cycle to TO cycle; returns true if there are conflicts and false
2348 if there are no conflicts. Assumes DFA is being used. */
2349 static int
2350 ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
2351 {
2352 int cycle;
2353
2354 state_reset (curr_state);
2355
2356 for (cycle = from; cycle <= to; cycle++)
2357 {
2358 ps_insn_ptr crr_insn;
2359 /* Holds the remaining issue slots in the current row. */
2360 int can_issue_more = issue_rate;
2361
2362 /* Walk through the DFA for the current row. */
2363 for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
2364 crr_insn;
2365 crr_insn = crr_insn->next_in_row)
2366 {
2367 rtx insn = crr_insn->node->insn;
2368
2369 if (!INSN_P (insn))
2370 continue;
2371
2372 /* Check if there is room for the current insn. */
2373 if (!can_issue_more || state_dead_lock_p (curr_state))
2374 return true;
2375
2376 /* Update the DFA state and return with failure if the DFA found
2377 recource conflicts. */
2378 if (state_transition (curr_state, insn) >= 0)
2379 return true;
2380
2381 if (targetm.sched.variable_issue)
2382 can_issue_more =
2383 targetm.sched.variable_issue (sched_dump, sched_verbose,
2384 insn, can_issue_more);
2385 /* A naked CLOBBER or USE generates no instruction, so don't
2386 let them consume issue slots. */
2387 else if (GET_CODE (PATTERN (insn)) != USE
2388 && GET_CODE (PATTERN (insn)) != CLOBBER)
2389 can_issue_more--;
2390 }
2391
2392 /* Advance the DFA to the next cycle. */
2393 advance_one_cycle ();
2394 }
2395 return false;
2396 }
2397
2398 /* Checks if the given node causes resource conflicts when added to PS at
2399 cycle C. If not the node is added to PS and returned; otherwise zero
2400 is returned. Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
2401 cuid N must be come before/after (respectively) the node pointed to by
2402 PS_I when scheduled in the same cycle. */
2403 ps_insn_ptr
2404 ps_add_node_check_conflicts (partial_schedule_ptr ps, ddg_node_ptr n,
2405 int c, sbitmap must_precede,
2406 sbitmap must_follow)
2407 {
2408 int has_conflicts = 0;
2409 ps_insn_ptr ps_i;
2410
2411 /* First add the node to the PS, if this succeeds check for
2412 conflicts, trying different issue slots in the same row. */
2413 if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
2414 return NULL; /* Failed to insert the node at the given cycle. */
2415
2416 has_conflicts = ps_has_conflicts (ps, c, c)
2417 || (ps->history > 0
2418 && ps_has_conflicts (ps,
2419 c - ps->history,
2420 c + ps->history));
2421
2422 /* Try different issue slots to find one that the given node can be
2423 scheduled in without conflicts. */
2424 while (has_conflicts)
2425 {
2426 if (! ps_insn_advance_column (ps, ps_i, must_follow))
2427 break;
2428 has_conflicts = ps_has_conflicts (ps, c, c)
2429 || (ps->history > 0
2430 && ps_has_conflicts (ps,
2431 c - ps->history,
2432 c + ps->history));
2433 }
2434
2435 if (has_conflicts)
2436 {
2437 remove_node_from_ps (ps, ps_i);
2438 return NULL;
2439 }
2440
2441 ps->min_cycle = MIN (ps->min_cycle, c);
2442 ps->max_cycle = MAX (ps->max_cycle, c);
2443 return ps_i;
2444 }
2445
2446 /* Rotate the rows of PS such that insns scheduled at time
2447 START_CYCLE will appear in row 0. Updates max/min_cycles. */
2448 void
2449 rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
2450 {
2451 int i, row, backward_rotates;
2452 int last_row = ps->ii - 1;
2453
2454 if (start_cycle == 0)
2455 return;
2456
2457 backward_rotates = SMODULO (start_cycle, ps->ii);
2458
2459 /* Revisit later and optimize this into a single loop. */
2460 for (i = 0; i < backward_rotates; i++)
2461 {
2462 ps_insn_ptr first_row = ps->rows[0];
2463
2464 for (row = 0; row < last_row; row++)
2465 ps->rows[row] = ps->rows[row+1];
2466
2467 ps->rows[last_row] = first_row;
2468 }
2469
2470 ps->max_cycle -= start_cycle;
2471 ps->min_cycle -= start_cycle;
2472 }
2473
2474 /* Remove the node N from the partial schedule PS; because we restart the DFA
2475 each time we want to check for resource conflicts; this is equivalent to
2476 unscheduling the node N. */
2477 static bool
2478 ps_unschedule_node (partial_schedule_ptr ps, ddg_node_ptr n)
2479 {
2480 ps_insn_ptr ps_i;
2481 int row = SMODULO (SCHED_TIME (n), ps->ii);
2482
2483 if (row < 0 || row > ps->ii)
2484 return false;
2485
2486 for (ps_i = ps->rows[row];
2487 ps_i && ps_i->node != n;
2488 ps_i = ps_i->next_in_row);
2489 if (!ps_i)
2490 return false;
2491
2492 return remove_node_from_ps (ps, ps_i);
2493 }
2494 #endif /* INSN_SCHEDULING */
2495 \f
2496 static bool
2497 gate_handle_sms (void)
2498 {
2499 return (optimize > 0 && flag_modulo_sched);
2500 }
2501
2502
/* Entry point of the SMS pass: perform swing modulo scheduling on the
   function's loops, then rebuild the register-life information that
   the newly created pseudos and register moves invalidated.
   Always returns 0.  */
static unsigned int
rest_of_handle_sms (void)
{
#ifdef INSN_SCHEDULING
  basic_block bb;

  /* We want to be able to create new pseudos.  */
  no_new_pseudos = 0;
  /* Collect loop information to be used in SMS.  */
  cfg_layout_initialize (CLEANUP_UPDATE_LIFE);
  sms_schedule ();

  /* Update the life information, because we add pseudos.  */
  max_regno = max_reg_num ();
  allocate_reg_info (max_regno, FALSE, FALSE);
  update_life_info (NULL, UPDATE_LIFE_GLOBAL_RM_NOTES,
		    (PROP_DEATH_NOTES
		     | PROP_REG_INFO
		     | PROP_KILL_DEAD_CODE
		     | PROP_SCAN_DEAD_CODE));

  no_new_pseudos = 1;

  /* Finalize layout changes: chain the blocks in their current layout
     order through bb->aux before leaving cfglayout mode.  */
  FOR_EACH_BB (bb)
    if (bb->next_bb != EXIT_BLOCK_PTR)
      bb->aux = bb->next_bb;
  cfg_layout_finalize ();
  free_dominance_info (CDI_DOMINATORS);
#endif /* INSN_SCHEDULING */
  return 0;
}
2537
/* Pass descriptor for swing modulo scheduling; gated by
   gate_handle_sms (optimize && -fmodulo-sched).  */
struct tree_opt_pass pass_sms =
{
  "sms",                                /* name */
  gate_handle_sms,                      /* gate */
  rest_of_handle_sms,                   /* execute */
  NULL,                                 /* sub */
  NULL,                                 /* next */
  0,                                    /* static_pass_number */
  TV_SMS,                               /* tv_id */
  0,                                    /* properties_required */
  0,                                    /* properties_provided */
  0,                                    /* properties_destroyed */
  0,                                    /* todo_flags_start */
  TODO_dump_func |
  TODO_ggc_collect,                     /* todo_flags_finish */
  'm'                                   /* letter */
};
2555