vc4: Add separate write-after-read dependency tracking for pairing.
[mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_schedule.c
/*
 * Copyright © 2010 Intel Corporation
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file vc4_qpu_schedule.c
 *
 * The basic model of the list scheduler is to take a basic block, compute a
 * DAG of the dependencies, and make a list of the DAG heads. Heuristically
 * pick a DAG head, then put all the children that are now DAG heads into the
 * list of things to schedule.
 *
 * The goal of scheduling here is to pack pairs of operations together in a
 * single QPU instruction.
 */
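
/* Background: a QPU instruction word carries both an add-pipe ALU op and a
 * mul-pipe ALU op, which is what makes it possible to pack two independent
 * operations into one 64-bit instruction. qpu_merge_inst() below is what
 * decides whether two candidate instructions are actually compatible.
 */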

#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"

static bool debug;

struct schedule_node_child;

struct schedule_node {
        struct simple_node link;
        struct queued_qpu_inst *inst;
        struct schedule_node_child *children;
        uint32_t child_count;
        uint32_t child_array_size;
        uint32_t parent_count;
        uint32_t delay;
};

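/* A DAG edge from a node to one of its children. write_after_read marks
 * edges where the child only has to wait for the parent's *read* to happen,
 * not for its result: such children can be released as soon as the parent is
 * picked, which is what lets them become candidates for pairing into the
 * same instruction (see mark_instruction_scheduled()).
 */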
struct schedule_node_child {
        struct schedule_node *node;
        bool write_after_read;
};

/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 */
enum direction { F, R };

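/* Per-pass bookkeeping while building the DAG: the most recent node to have
 * touched each accumulator (r0-r5), each physical regfile A/B register, the
 * condition flags, and the various ordered resources (VPM, uniforms, TMU,
 * TLB). New instructions get edges against these and then replace them.
 */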
struct schedule_state {
        struct schedule_node *last_r[6];
        struct schedule_node *last_ra[32];
        struct schedule_node *last_rb[32];
        struct schedule_node *last_sf;
        struct schedule_node *last_vpm_read;
        struct schedule_node *last_unif_read;
        struct schedule_node *last_tmu_write;
        struct schedule_node *last_tlb;
        struct schedule_node *last_vpm;
        enum direction dir;
};

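/* Records a dependency edge from "before" to "after" (swapped when walking
 * in reverse). For illustration, given a sequence like:
 *
 *     write ra0   (A)
 *     read ra0    (B)
 *     write ra0   (C)
 *
 * the forward pass adds A->B (read-after-write) and A->C (write-after-write),
 * and the reverse pass adds B->C flagged as write_after_read.
 */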
static void
add_dep(struct schedule_state *state,
        struct schedule_node *before,
        struct schedule_node *after,
        bool write)
{
        bool write_after_read = !write && state->dir == R;

        if (!before || !after)
                return;

        assert(before != after);

        if (state->dir == R) {
                struct schedule_node *t = before;
                before = after;
                after = t;
        }

        for (int i = 0; i < before->child_count; i++) {
                if (before->children[i].node == after &&
                    (before->children[i].write_after_read == write_after_read)) {
                        return;
                }
        }

        if (before->child_array_size <= before->child_count) {
                before->child_array_size = MAX2(before->child_array_size * 2, 16);
                before->children = reralloc(before, before->children,
                                            struct schedule_node_child,
                                            before->child_array_size);
        }

        before->children[before->child_count].node = after;
        before->children[before->child_count].write_after_read =
                write_after_read;
        before->child_count++;
        after->parent_count++;
}

static void
add_read_dep(struct schedule_state *state,
             struct schedule_node *before,
             struct schedule_node *after)
{
        add_dep(state, before, after, false);
}

static void
add_write_dep(struct schedule_state *state,
              struct schedule_node **before,
              struct schedule_node *after)
{
        add_dep(state, *before, after, true);
        *before = after;
}

static bool
qpu_writes_r4(uint64_t inst)
{
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        switch (sig) {
        case QPU_SIG_COLOR_LOAD:
        case QPU_SIG_LOAD_TMU0:
        case QPU_SIG_LOAD_TMU1:
        case QPU_SIG_ALPHA_MASK_LOAD:
                return true;
        default:
                return false;
        }
}

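/* Adds the dependencies implied by the two raddr fields. VPM and uniform
 * reads consume from ordered streams, so each such read is tracked with
 * add_write_dep() to keep those reads in program order relative to each
 * other; varying reads are serialized through last_r[5], presumably because
 * reading a varying also affects r5 on this hardware.
 */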
static void
process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t raddr, bool is_a)
{
        switch (raddr) {
        case QPU_R_VARY:
                add_write_dep(state, &state->last_r[5], n);
                break;

        case QPU_R_VPM:
                add_write_dep(state, &state->last_vpm_read, n);
                break;

        case QPU_R_UNIF:
                add_write_dep(state, &state->last_unif_read, n);
                break;

        case QPU_R_NOP:
        case QPU_R_ELEM_QPU:
        case QPU_R_XY_PIXEL_COORD:
        case QPU_R_MS_REV_FLAGS:
                break;

        default:
                if (raddr < 32) {
                        if (is_a)
                                add_read_dep(state, state->last_ra[raddr], n);
                        else
                                add_read_dep(state, state->last_rb[raddr], n);
                } else {
                        fprintf(stderr, "unknown raddr %d\n", raddr);
                        abort();
                }
                break;
        }
}

static bool
is_tmu_write(uint32_t waddr)
{
        switch (waddr) {
        case QPU_W_TMU0_S:
        case QPU_W_TMU0_T:
        case QPU_W_TMU0_R:
        case QPU_W_TMU0_B:
        case QPU_W_TMU1_S:
        case QPU_W_TMU1_T:
        case QPU_W_TMU1_R:
        case QPU_W_TMU1_B:
                return true;
        default:
                return false;
        }
}

static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
                 uint32_t mux)
{
        if (mux != QPU_MUX_A && mux != QPU_MUX_B)
                add_read_dep(state, state->last_r[mux], n);
}


static bool
is_direct_tmu_read(uint64_t inst)
{
        /* If it's a direct read, we happen to structure the code such that
         * there's an explicit uniform read in the instruction (for kernel
         * texture reloc processing).
         */
        return (QPU_GET_FIELD(inst, QPU_RADDR_A) == QPU_R_UNIF ||
                QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_UNIF);
}

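/* Adds the dependencies implied by a waddr field. Which physical regfile a
 * result lands in depends on both the unit (add vs. mul) and the WS (write
 * swap) bit, hence the is_add ^ QPU_WS computation below.
 */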
static void
process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t waddr, bool is_add)
{
        uint64_t inst = n->inst->inst;
        bool is_a = is_add ^ ((inst & QPU_WS) != 0);

        if (waddr < 32) {
                if (is_a) {
                        add_write_dep(state, &state->last_ra[waddr], n);
                } else {
                        add_write_dep(state, &state->last_rb[waddr], n);
                }
        } else if (is_tmu_write(waddr)) {
                add_write_dep(state, &state->last_tmu_write, n);

                /* There is an implicit uniform read in texture ops in
                 * hardware, unless this is a direct-addressed uniform read,
                 * so we need to keep it in the same order as the other
                 * uniforms.
                 */
                if (!is_direct_tmu_read(n->inst->inst))
                        add_write_dep(state, &state->last_unif_read, n);
        } else if (qpu_waddr_is_tlb(waddr)) {
                add_write_dep(state, &state->last_tlb, n);
        } else {
                switch (waddr) {
                case QPU_W_ACC0:
                case QPU_W_ACC1:
                case QPU_W_ACC2:
                case QPU_W_ACC3:
                case QPU_W_ACC5:
                        add_write_dep(state, &state->last_r[waddr - QPU_W_ACC0],
                                      n);
                        break;

                case QPU_W_VPM:
                case QPU_W_VPMVCD_SETUP:
                        add_write_dep(state, &state->last_vpm, n);
                        break;

                case QPU_W_SFU_RECIP:
                case QPU_W_SFU_RECIPSQRT:
                case QPU_W_SFU_EXP:
                case QPU_W_SFU_LOG:
                        add_write_dep(state, &state->last_r[4], n);
                        break;

                case QPU_W_TLB_STENCIL_SETUP:
                        /* This isn't a TLB operation that does things like
                         * implicitly lock the scoreboard, but it does have to
                         * appear before TLB_Z, and each of the TLB_STENCILs
                         * have to schedule in the same order relative to each
                         * other.
                         */
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case QPU_W_NOP:
                        break;

                default:
                        fprintf(stderr, "Unknown waddr %d\n", waddr);
                        abort();
                }
        }
}

static void
process_cond_deps(struct schedule_state *state, struct schedule_node *n,
                  uint32_t cond)
{
        switch (cond) {
        case QPU_COND_NEVER:
        case QPU_COND_ALWAYS:
                break;
        default:
                add_read_dep(state, state->last_sf, n);
                break;
        }
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
calculate_deps(struct schedule_state *state, struct schedule_node *n)
{
        uint64_t inst = n->inst->inst;
        uint32_t add_op = QPU_GET_FIELD(inst, QPU_OP_ADD);
        uint32_t mul_op = QPU_GET_FIELD(inst, QPU_OP_MUL);
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
        uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
        uint32_t mul_a = QPU_GET_FIELD(inst, QPU_MUL_A);
        uint32_t mul_b = QPU_GET_FIELD(inst, QPU_MUL_B);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        process_raddr_deps(state, n, raddr_a, true);
        process_raddr_deps(state, n, raddr_b, false);
        if (add_op != QPU_A_NOP) {
                process_mux_deps(state, n, add_a);
                process_mux_deps(state, n, add_b);
        }
        if (mul_op != QPU_M_NOP) {
                process_mux_deps(state, n, mul_a);
                process_mux_deps(state, n, mul_b);
        }

        process_waddr_deps(state, n, waddr_add, true);
        process_waddr_deps(state, n, waddr_mul, false);
        if (qpu_writes_r4(inst))
                add_write_dep(state, &state->last_r[4], n);

        switch (sig) {
        case QPU_SIG_SW_BREAKPOINT:
        case QPU_SIG_NONE:
        case QPU_SIG_THREAD_SWITCH:
        case QPU_SIG_LAST_THREAD_SWITCH:
        case QPU_SIG_SMALL_IMM:
        case QPU_SIG_LOAD_IMM:
                break;

        case QPU_SIG_LOAD_TMU0:
        case QPU_SIG_LOAD_TMU1:
                /* TMU loads are coming from a FIFO, so ordering is important.
                 */
                add_write_dep(state, &state->last_tmu_write, n);
                break;

        case QPU_SIG_COLOR_LOAD:
                add_read_dep(state, state->last_tlb, n);
                break;

        case QPU_SIG_PROG_END:
        case QPU_SIG_WAIT_FOR_SCOREBOARD:
        case QPU_SIG_SCOREBOARD_UNLOCK:
        case QPU_SIG_COVERAGE_LOAD:
        case QPU_SIG_COLOR_LOAD_END:
        case QPU_SIG_ALPHA_MASK_LOAD:
        case QPU_SIG_BRANCH:
                fprintf(stderr, "Unhandled signal bits %d\n", sig);
                abort();
        }

        process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_ADD));
        process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_MUL));
        if (inst & QPU_SF)
                add_write_dep(state, &state->last_sf, n);
}

static void
calculate_forward_deps(struct vc4_compile *c, struct simple_node *schedule_list)
{
        struct simple_node *node;
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dir = F;

        foreach(node, schedule_list)
                calculate_deps(&state, (struct schedule_node *)node);
}

static void
calculate_reverse_deps(struct vc4_compile *c, struct simple_node *schedule_list)
{
        struct simple_node *node;
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dir = R;

        for (node = schedule_list->prev; schedule_list != node; node = node->prev) {
                calculate_deps(&state, (struct schedule_node *)node);
        }
}

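/* State tracked while emitting instructions, used to enforce per-instruction
 * hazards: which regfile A/B locations the previous instruction wrote (they
 * can't be read back immediately), and how recently an SFU op was issued
 * (its result doesn't show up in r4 until a couple of instructions later).
 */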
struct choose_scoreboard {
        int tick;
        int last_sfu_write_tick;
        uint32_t last_waddr_a, last_waddr_b;
};

static bool
reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst)
{
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        uint32_t src_muxes[] = {
                QPU_GET_FIELD(inst, QPU_ADD_A),
                QPU_GET_FIELD(inst, QPU_ADD_B),
                QPU_GET_FIELD(inst, QPU_MUL_A),
                QPU_GET_FIELD(inst, QPU_MUL_B),
        };
        for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
                if ((src_muxes[i] == QPU_MUX_A &&
                     raddr_a < 32 &&
                     scoreboard->last_waddr_a == raddr_a) ||
                    (src_muxes[i] == QPU_MUX_B &&
                     raddr_b < 32 &&
                     scoreboard->last_waddr_b == raddr_b)) {
                        return true;
                }

                if (src_muxes[i] == QPU_MUX_R4) {
                        if (scoreboard->tick -
                            scoreboard->last_sfu_write_tick <= 2) {
                                return true;
                        }
                }
        }

        return false;
}

static bool
pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, uint64_t inst)
{
        return (scoreboard->tick < 2 && qpu_inst_is_tlb(inst));
}

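/* Ranks an instruction for the greedy choice below. Priorities, from lowest
 * (deferred) to highest (preferred): TLB accesses, TMU result loads,
 * everything else, TMU setup writes. Higher-priority candidates win in
 * choose_instruction_to_schedule().
 */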
static int
get_instruction_priority(uint64_t inst)
{
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
        uint32_t baseline_score;
        uint32_t next_score = 0;

        /* Schedule TLB operations as late as possible, to get more
         * parallelism between shaders.
         */
        if (qpu_inst_is_tlb(inst))
                return next_score;
        next_score++;

        /* Schedule texture read results collection late to hide latency. */
        if (sig == QPU_SIG_LOAD_TMU0 || sig == QPU_SIG_LOAD_TMU1)
                return next_score;
        next_score++;

        /* Default score for things that aren't otherwise special. */
        baseline_score = next_score;
        next_score++;

        /* Schedule texture read setup early to hide their latency better. */
        if (is_tmu_write(waddr_add) || is_tmu_write(waddr_mul))
                return next_score;
        next_score++;

        return baseline_score;
}

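/* Picks the best currently-ready instruction. When prev_inst is nonzero we
 * are looking for a partner to pair with an already-chosen instruction, so
 * candidates are additionally filtered through qpu_merge_inst().
 */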
static struct schedule_node *
choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
                               struct simple_node *schedule_list,
                               uint64_t prev_inst)
{
        struct schedule_node *chosen = NULL;
        struct simple_node *node;
        int chosen_prio = 0;

        foreach(node, schedule_list) {
                struct schedule_node *n = (struct schedule_node *)node;
                uint64_t inst = n->inst->inst;

                /* "An instruction must not read from a location in physical
                 * regfile A or B that was written to by the previous
                 * instruction."
                 */
                if (reads_too_soon_after_write(scoreboard, inst))
                        continue;

                /* "A scoreboard wait must not occur in the first two
                 * instructions of a fragment shader. This is either the
                 * explicit Wait for Scoreboard signal or an implicit wait
                 * with the first tile-buffer read or write instruction."
                 */
                if (pixel_scoreboard_too_soon(scoreboard, inst))
                        continue;

                /* If we're trying to pair with another instruction, check
                 * that they're compatible.
                 */
                if (prev_inst != 0) {
                        inst = qpu_merge_inst(prev_inst, inst);
                        if (!inst)
                                continue;
                }

                int prio = get_instruction_priority(inst);

                /* Found a valid instruction. If nothing better comes along,
                 * this one works.
                 */
                if (!chosen) {
                        chosen = n;
                        chosen_prio = prio;
                        continue;
                }

                if (prio > chosen_prio) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (prio < chosen_prio) {
                        continue;
                }
        }

        return chosen;
}

static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
                             uint64_t inst)
{
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);

        if (!(inst & QPU_WS)) {
                scoreboard->last_waddr_a = waddr_add;
                scoreboard->last_waddr_b = waddr_mul;
        } else {
                scoreboard->last_waddr_b = waddr_add;
                scoreboard->last_waddr_a = waddr_mul;
        }

        if ((waddr_add >= QPU_W_SFU_RECIP && waddr_add <= QPU_W_SFU_LOG) ||
            (waddr_mul >= QPU_W_SFU_RECIP && waddr_mul <= QPU_W_SFU_LOG)) {
                scoreboard->last_sfu_write_tick = scoreboard->tick;
        }
}

static void
dump_state(struct simple_node *schedule_list)
{
        struct simple_node *node;

        uint32_t i = 0;
        foreach(node, schedule_list) {
                struct schedule_node *n = (struct schedule_node *)node;

                fprintf(stderr, "%3d: ", i++);
                vc4_qpu_disasm(&n->inst->inst, 1);
                fprintf(stderr, "\n");

                for (int i = 0; i < n->child_count; i++) {
                        struct schedule_node *child = n->children[i].node;
                        if (!child)
                                continue;

                        fprintf(stderr, " - ");
                        vc4_qpu_disasm(&child->inst->inst, 1);
                        fprintf(stderr, " (%d parents, %c)\n",
                                child->parent_count,
                                n->children[i].write_after_read ? 'w' : 'r');
                }
        }
}

/** Recursive computation of the delay member of a node. */
static void
compute_delay(struct schedule_node *n)
{
        if (!n->child_count) {
                n->delay = 1;
        } else {
                for (int i = 0; i < n->child_count; i++) {
                        if (!n->children[i].node->delay)
                                compute_delay(n->children[i].node);
                        n->delay = MAX2(n->delay,
                                        n->children[i].node->delay + 1);
                }
        }
}

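/* Releases the children of a node that was just picked or emitted. The
 * war_only pass runs right after an instruction is chosen but before we look
 * for a pairing partner: write-after-read children only needed this
 * instruction's read to happen, so they can become ready immediately and can
 * even end up merged into the same instruction as the read. The remaining
 * children are released once the instruction is actually emitted.
 */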
static void
mark_instruction_scheduled(struct simple_node *schedule_list,
                           struct schedule_node *node,
                           bool war_only)
{
        if (!node)
                return;

        for (int i = node->child_count - 1; i >= 0; i--) {
                struct schedule_node *child =
                        node->children[i].node;

                if (!child)
                        continue;

                if (war_only && !node->children[i].write_after_read)
                        continue;

                child->parent_count--;
                if (child->parent_count == 0)
                        insert_at_head(schedule_list, &child->link);

                node->children[i].node = NULL;
        }
}

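/* Main scheduling loop, roughly:
 *
 *     while (ready list not empty):
 *         chosen = best ready instruction (or a NOP if none can go yet)
 *         release chosen's write-after-read children
 *         merge  = best ready instruction that pairs with chosen
 *         emit the (possibly merged) instruction
 *         release the remaining children of chosen and merge
 */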
static void
schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
{
        struct simple_node *node, *t;
        struct choose_scoreboard scoreboard;

        memset(&scoreboard, 0, sizeof(scoreboard));
        scoreboard.last_waddr_a = ~0;
        scoreboard.last_waddr_b = ~0;
        scoreboard.last_sfu_write_tick = -10;

        if (debug) {
                fprintf(stderr, "initial deps:\n");
                dump_state(schedule_list);
                fprintf(stderr, "\n");
        }

        /* Remove non-DAG heads from the list. */
        foreach_s(node, t, schedule_list) {
                struct schedule_node *n = (struct schedule_node *)node;

                if (n->parent_count != 0)
                        remove_from_list(&n->link);
        }

        while (!is_empty_list(schedule_list)) {
                struct schedule_node *chosen =
                        choose_instruction_to_schedule(&scoreboard,
                                                       schedule_list,
                                                       0);
                struct schedule_node *merge = NULL;

                /* If there are no valid instructions to schedule, drop a NOP
                 * in.
                 */
                uint64_t inst = chosen ? chosen->inst->inst : qpu_NOP();

                if (debug) {
                        fprintf(stderr, "current list:\n");
                        dump_state(schedule_list);
                        fprintf(stderr, "chose: ");
                        vc4_qpu_disasm(&inst, 1);
                        fprintf(stderr, "\n");
                }

                /* Schedule this instruction onto the QPU list. Also try to
                 * find an instruction to pair with it.
                 */
                if (chosen) {
                        remove_from_list(&chosen->link);
                        mark_instruction_scheduled(schedule_list, chosen, true);

                        merge = choose_instruction_to_schedule(&scoreboard,
                                                               schedule_list,
                                                               inst);
                        if (merge) {
                                remove_from_list(&merge->link);
                                inst = qpu_merge_inst(inst, merge->inst->inst);
                                assert(inst != 0);

                                if (debug) {
                                        fprintf(stderr, "merging: ");
                                        vc4_qpu_disasm(&merge->inst->inst, 1);
                                        fprintf(stderr, "\n");
                                        fprintf(stderr, "resulting in: ");
                                        vc4_qpu_disasm(&inst, 1);
                                        fprintf(stderr, "\n");
                                }
                        }
                }

                if (debug) {
                        fprintf(stderr, "\n");
                }

                qpu_serialize_one_inst(c, inst);

                update_scoreboard_for_chosen(&scoreboard, inst);

                /* Now that we've scheduled a new instruction, some of its
                 * children can be promoted to the list of instructions ready
                 * to be scheduled.
                 */
                mark_instruction_scheduled(schedule_list, chosen, false);
                mark_instruction_scheduled(schedule_list, merge, false);

                scoreboard.tick++;
        }
}

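/* Entry point: wraps each queued instruction in a schedule_node, builds the
 * dependency DAG in a forward and a reverse pass, computes per-node critical
 * path lengths, and then emits the rescheduled (and paired) instructions.
 */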
void
qpu_schedule_instructions(struct vc4_compile *c)
{
        void *mem_ctx = ralloc_context(NULL);
        struct simple_node schedule_list;
        struct simple_node *node;

        make_empty_list(&schedule_list);

        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
                foreach(node, &c->qpu_inst_list) {
                        struct queued_qpu_inst *q =
                                (struct queued_qpu_inst *)node;
                        vc4_qpu_disasm(&q->inst, 1);
                        fprintf(stderr, "\n");
                }
                fprintf(stderr, "\n");
        }

        /* Wrap each instruction in a scheduler structure. */
        while (!is_empty_list(&c->qpu_inst_list)) {
                struct queued_qpu_inst *inst =
                        (struct queued_qpu_inst *)c->qpu_inst_list.next;
                struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);

                n->inst = inst;
                remove_from_list(&inst->link);
                insert_at_tail(&schedule_list, &n->link);
        }

        calculate_forward_deps(c, &schedule_list);
        calculate_reverse_deps(c, &schedule_list);

        foreach(node, &schedule_list) {
                struct schedule_node *n = (struct schedule_node *)node;
                compute_delay(n);
        }

        schedule_instructions(c, &schedule_list);

        if (debug) {
                fprintf(stderr, "Post-schedule instructions\n");
                vc4_qpu_disasm(c->qpu_insts, c->qpu_inst_count);
                fprintf(stderr, "\n");
        }

        ralloc_free(mem_ctx);
}