vc4: Skip raddr dependencies for 32-bit immediate loads.
[mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_schedule.c
1 /*
2 * Copyright © 2010 Intel Corporation
3 * Copyright © 2014 Broadcom
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25 /**
26 * @file vc4_qpu_schedule.c
27 *
28 * The basic model of the list scheduler is to take a basic block, compute a
29 * DAG of the dependencies, and make a list of the DAG heads. Heuristically
30 * pick a DAG head and schedule it, then add any children that have just
31 * become DAG heads to the list of things to schedule.
32 *
33 * The goal of scheduling here is to pack pairs of operations together in a
34 * single QPU instruction.
35 */
36
37 #include "vc4_qir.h"
38 #include "vc4_qpu.h"
39 #include "util/ralloc.h"
40
41 static bool debug;
42
43 struct schedule_node_child;
44
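/* One node in the scheduling DAG: wraps a queued QPU instruction together
 * with its dependency edges (children), a count of unscheduled parents, and
 * the longest-path delay to the end of the block (see compute_delay()).
 */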
45 struct schedule_node {
46 struct simple_node link;
47 struct queued_qpu_inst *inst;
48 struct schedule_node_child *children;
49 uint32_t child_count;
50 uint32_t child_array_size;
51 uint32_t parent_count;
52 uint32_t delay;
53 };
54
55 struct schedule_node_child {
56 struct schedule_node *node;
57 bool write_after_read;
58 };
59
60 /* When walking the instructions in reverse, we need to swap before/after in
61 * add_dep().
62 */
63 enum direction { F, R };
64
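/* Tracks, while walking the instruction list, the most recent instruction
 * to touch each accumulator, regfile entry, the flags, and each shared unit
 * (VPM, uniform stream, TMU, TLB), so calculate_deps() can add edges
 * against it.
 */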
65 struct schedule_state {
66 struct schedule_node *last_r[6];
67 struct schedule_node *last_ra[32];
68 struct schedule_node *last_rb[32];
69 struct schedule_node *last_sf;
70 struct schedule_node *last_vpm_read;
71 struct schedule_node *last_unif_read;
72 struct schedule_node *last_tmu_write;
73 struct schedule_node *last_tlb;
74 struct schedule_node *last_vpm;
75 enum direction dir;
76 };
77
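/* Records a dependency edge from "before" to "after" (swapping the two when
 * walking in reverse), skipping edges that are already present.
 */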
78 static void
79 add_dep(struct schedule_state *state,
80 struct schedule_node *before,
81 struct schedule_node *after,
82 bool write)
83 {
84 bool write_after_read = !write && state->dir == R;
85
86 if (!before || !after)
87 return;
88
89 assert(before != after);
90
91 if (state->dir == R) {
92 struct schedule_node *t = before;
93 before = after;
94 after = t;
95 }
96
97 for (int i = 0; i < before->child_count; i++) {
98 if (before->children[i].node == after &&
99 (before->children[i].write_after_read == write_after_read)) {
100 return;
101 }
102 }
103
104 if (before->child_array_size <= before->child_count) {
105 before->child_array_size = MAX2(before->child_array_size * 2, 16);
106 before->children = reralloc(before, before->children,
107 struct schedule_node_child,
108 before->child_array_size);
109 }
110
111 before->children[before->child_count].node = after;
112 before->children[before->child_count].write_after_read =
113 write_after_read;
114 before->child_count++;
115 after->parent_count++;
116 }
117
118 static void
119 add_read_dep(struct schedule_state *state,
120 struct schedule_node *before,
121 struct schedule_node *after)
122 {
123 add_dep(state, before, after, false);
124 }
125
126 static void
127 add_write_dep(struct schedule_state *state,
128 struct schedule_node **before,
129 struct schedule_node *after)
130 {
131 add_dep(state, *before, after, true);
132 *before = after;
133 }
134
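/* Returns whether the instruction's signal implicitly loads a result into
 * the r4 accumulator.
 */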
135 static bool
136 qpu_writes_r4(uint64_t inst)
137 {
138 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
139
140 switch (sig) {
141 case QPU_SIG_COLOR_LOAD:
142 case QPU_SIG_LOAD_TMU0:
143 case QPU_SIG_LOAD_TMU1:
144 case QPU_SIG_ALPHA_MASK_LOAD:
145 return true;
146 default:
147 return false;
148 }
149 }
150
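/* Adds the dependencies implied by a regfile A/B read address: varying
 * reads (which also write r5), the VPM and uniform read streams that must
 * stay in order, and plain regfile reads.
 */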
151 static void
152 process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
153 uint32_t raddr, bool is_a)
154 {
155 switch (raddr) {
156 case QPU_R_VARY:
157 add_write_dep(state, &state->last_r[5], n);
158 break;
159
160 case QPU_R_VPM:
161 add_write_dep(state, &state->last_vpm_read, n);
162 break;
163
164 case QPU_R_UNIF:
165 add_write_dep(state, &state->last_unif_read, n);
166 break;
167
168 case QPU_R_NOP:
169 case QPU_R_ELEM_QPU:
170 case QPU_R_XY_PIXEL_COORD:
171 case QPU_R_MS_REV_FLAGS:
172 break;
173
174 default:
175 if (raddr < 32) {
176 if (is_a)
177 add_read_dep(state, state->last_ra[raddr], n);
178 else
179 add_read_dep(state, state->last_rb[raddr], n);
180 } else {
181 fprintf(stderr, "Unknown raddr %d\n", raddr);
182 abort();
183 }
184 break;
185 }
186 }
187
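/* Returns whether the write address targets one of the TMU0/TMU1
 * coordinate registers (S/T/R/B).
 */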
188 static bool
189 is_tmu_write(uint32_t waddr)
190 {
191 switch (waddr) {
192 case QPU_W_TMU0_S:
193 case QPU_W_TMU0_T:
194 case QPU_W_TMU0_R:
195 case QPU_W_TMU0_B:
196 case QPU_W_TMU1_S:
197 case QPU_W_TMU1_T:
198 case QPU_W_TMU1_R:
199 case QPU_W_TMU1_B:
200 return true;
201 default:
202 return false;
203 }
204 }
205
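/* Adds a read dependency on the accumulator selected by an ALU input mux
 * (regfile A/B reads are handled through the raddr fields instead).
 */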
206 static void
207 process_mux_deps(struct schedule_state *state, struct schedule_node *n,
208 uint32_t mux)
209 {
210 if (mux != QPU_MUX_A && mux != QPU_MUX_B)
211 add_read_dep(state, state->last_r[mux], n);
212 }
213
214
215 static bool
216 is_direct_tmu_read(uint64_t inst)
217 {
218 /* If it's a direct read, we happen to structure the code such that
219 * there's an explicit uniform read in the instruction (for kernel
220 * texture reloc processing).
221 */
222 return (QPU_GET_FIELD(inst, QPU_RADDR_A) == QPU_R_UNIF ||
223 QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_UNIF);
224 }
225
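/* Adds the dependencies implied by the add/mul write address: regfile
 * writes, TMU and SFU requests, VPM and TLB accesses, and accumulator
 * writes.
 */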
226 static void
227 process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
228 uint32_t waddr, bool is_add)
229 {
230 uint64_t inst = n->inst->inst;
231 bool is_a = is_add ^ ((inst & QPU_WS) != 0);
232
233 if (waddr < 32) {
234 if (is_a) {
235 add_write_dep(state, &state->last_ra[waddr], n);
236 } else {
237 add_write_dep(state, &state->last_rb[waddr], n);
238 }
239 } else if (is_tmu_write(waddr)) {
240 add_write_dep(state, &state->last_tmu_write, n);
241
242 /* There is an implicit uniform read in texture ops in
243 * hardware, unless this is a direct-addressed uniform read,
244 * so this instruction needs to stay ordered against the
245 * other uniform reads.
246 */
247 if (!is_direct_tmu_read(n->inst->inst))
248 add_write_dep(state, &state->last_unif_read, n);
249 } else if (qpu_waddr_is_tlb(waddr)) {
250 add_write_dep(state, &state->last_tlb, n);
251 } else {
252 switch (waddr) {
253 case QPU_W_ACC0:
254 case QPU_W_ACC1:
255 case QPU_W_ACC2:
256 case QPU_W_ACC3:
257 case QPU_W_ACC5:
258 add_write_dep(state, &state->last_r[waddr - QPU_W_ACC0],
259 n);
260 break;
261
262 case QPU_W_VPM:
263 add_write_dep(state, &state->last_vpm, n);
264 break;
265
266 case QPU_W_VPMVCD_SETUP:
267 if (is_a)
268 add_write_dep(state, &state->last_vpm_read, n);
269 else
270 add_write_dep(state, &state->last_vpm, n);
271 break;
272
273 case QPU_W_SFU_RECIP:
274 case QPU_W_SFU_RECIPSQRT:
275 case QPU_W_SFU_EXP:
276 case QPU_W_SFU_LOG:
277 add_write_dep(state, &state->last_r[4], n);
278 break;
279
280 case QPU_W_TLB_STENCIL_SETUP:
281 /* This isn't a TLB operation that does things like
282 * implicitly lock the scoreboard, but it does have to
283 * appear before TLB_Z, and each of the TLB_STENCIL_SETUP
284 * writes has to be scheduled in the same order relative to
285 * the others.
286 */
287 add_write_dep(state, &state->last_tlb, n);
288 break;
289
290 case QPU_W_NOP:
291 break;
292
293 default:
294 fprintf(stderr, "Unknown waddr %d\n", waddr);
295 abort();
296 }
297 }
298 }
299
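/* A conditionally executed ALU op depends on the flags, so order it against
 * the last instruction that set them.
 */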
300 static void
301 process_cond_deps(struct schedule_state *state, struct schedule_node *n,
302 uint32_t cond)
303 {
304 switch (cond) {
305 case QPU_COND_NEVER:
306 case QPU_COND_ALWAYS:
307 break;
308 default:
309 add_read_dep(state, state->last_sf, n);
310 break;
311 }
312 }
313
314 /**
315 * Common code for dependencies that need to be tracked both forward and
316 * backward.
317 *
318 * This is for things like "all reads of r4 have to happen between the r4
319 * writes that surround them".
320 */
321 static void
322 calculate_deps(struct schedule_state *state, struct schedule_node *n)
323 {
324 uint64_t inst = n->inst->inst;
325 uint32_t add_op = QPU_GET_FIELD(inst, QPU_OP_ADD);
326 uint32_t mul_op = QPU_GET_FIELD(inst, QPU_OP_MUL);
327 uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
328 uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
329 uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
330 uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
331 uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
332 uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
333 uint32_t mul_a = QPU_GET_FIELD(inst, QPU_MUL_A);
334 uint32_t mul_b = QPU_GET_FIELD(inst, QPU_MUL_B);
335 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
336
337 if (sig != QPU_SIG_LOAD_IMM) {
338 process_raddr_deps(state, n, raddr_a, true);
339 process_raddr_deps(state, n, raddr_b, false);
340 }
341
342 if (add_op != QPU_A_NOP) {
343 process_mux_deps(state, n, add_a);
344 process_mux_deps(state, n, add_b);
345 }
346 if (mul_op != QPU_M_NOP) {
347 process_mux_deps(state, n, mul_a);
348 process_mux_deps(state, n, mul_b);
349 }
350
351 process_waddr_deps(state, n, waddr_add, true);
352 process_waddr_deps(state, n, waddr_mul, false);
353 if (qpu_writes_r4(inst))
354 add_write_dep(state, &state->last_r[4], n);
355
356 switch (sig) {
357 case QPU_SIG_SW_BREAKPOINT:
358 case QPU_SIG_NONE:
359 case QPU_SIG_THREAD_SWITCH:
360 case QPU_SIG_LAST_THREAD_SWITCH:
361 case QPU_SIG_SMALL_IMM:
362 case QPU_SIG_LOAD_IMM:
363 break;
364
365 case QPU_SIG_LOAD_TMU0:
366 case QPU_SIG_LOAD_TMU1:
367 /* TMU loads are coming from a FIFO, so ordering is important.
368 */
369 add_write_dep(state, &state->last_tmu_write, n);
370 break;
371
372 case QPU_SIG_COLOR_LOAD:
373 add_read_dep(state, state->last_tlb, n);
374 break;
375
376 case QPU_SIG_PROG_END:
377 case QPU_SIG_WAIT_FOR_SCOREBOARD:
378 case QPU_SIG_SCOREBOARD_UNLOCK:
379 case QPU_SIG_COVERAGE_LOAD:
380 case QPU_SIG_COLOR_LOAD_END:
381 case QPU_SIG_ALPHA_MASK_LOAD:
382 case QPU_SIG_BRANCH:
383 fprintf(stderr, "Unhandled signal bits %d\n", sig);
384 abort();
385 }
386
387 process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_ADD));
388 process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_MUL));
389 if (inst & QPU_SF)
390 add_write_dep(state, &state->last_sf, n);
391 }
392
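/* The forward pass adds the read-after-write and write-after-write edges;
 * the reverse pass below adds the matching write-after-read edges.
 */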
393 static void
394 calculate_forward_deps(struct vc4_compile *c, struct simple_node *schedule_list)
395 {
396 struct simple_node *node;
397 struct schedule_state state;
398
399 memset(&state, 0, sizeof(state));
400 state.dir = F;
401
402 foreach(node, schedule_list)
403 calculate_deps(&state, (struct schedule_node *)node);
404 }
405
406 static void
407 calculate_reverse_deps(struct vc4_compile *c, struct simple_node *schedule_list)
408 {
409 struct simple_node *node;
410 struct schedule_state state;
411
412 memset(&state, 0, sizeof(state));
413 state.dir = R;
414
415 for (node = schedule_list->prev; schedule_list != node; node = node->prev) {
416 calculate_deps(&state, (struct schedule_node *)node);
417 }
418 }
419
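/* Per-cycle state used to reject candidate instructions that would violate
 * the QPU's regfile read-after-write, r4, and scoreboard-wait timing rules.
 */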
420 struct choose_scoreboard {
421 int tick;
422 int last_sfu_write_tick;
423 uint32_t last_waddr_a, last_waddr_b;
424 };
425
426 static bool
427 reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst)
428 {
429 uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
430 uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
431 uint32_t src_muxes[] = {
432 QPU_GET_FIELD(inst, QPU_ADD_A),
433 QPU_GET_FIELD(inst, QPU_ADD_B),
434 QPU_GET_FIELD(inst, QPU_MUL_A),
435 QPU_GET_FIELD(inst, QPU_MUL_B),
436 };
437 for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
438 if ((src_muxes[i] == QPU_MUX_A &&
439 raddr_a < 32 &&
440 scoreboard->last_waddr_a == raddr_a) ||
441 (src_muxes[i] == QPU_MUX_B &&
442 raddr_b < 32 &&
443 scoreboard->last_waddr_b == raddr_b)) {
444 return true;
445 }
446
447 if (src_muxes[i] == QPU_MUX_R4) {
448 if (scoreboard->tick -
449 scoreboard->last_sfu_write_tick <= 2) {
450 return true;
451 }
452 }
453 }
454
455 return false;
456 }
457
458 static bool
459 pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, uint64_t inst)
460 {
461 return (scoreboard->tick < 2 && qpu_inst_is_tlb(inst));
462 }
463
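/* Ranks candidate instructions: TLB accesses and TMU result loads get the
 * lowest scores so they drift late, TMU coordinate writes the highest so
 * texture fetches start early, and everything else falls in between.
 */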
464 static int
465 get_instruction_priority(uint64_t inst)
466 {
467 uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
468 uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
469 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
470 uint32_t baseline_score;
471 uint32_t next_score = 0;
472
473 /* Schedule TLB operations as late as possible, to get more
474 * parallelism between shaders.
475 */
476 if (qpu_inst_is_tlb(inst))
477 return next_score;
478 next_score++;
479
480 /* Schedule texture read results collection late to hide latency. */
481 if (sig == QPU_SIG_LOAD_TMU0 || sig == QPU_SIG_LOAD_TMU1)
482 return next_score;
483 next_score++;
484
485 /* Default score for things that aren't otherwise special. */
486 baseline_score = next_score;
487 next_score++;
488
489 /* Schedule texture read setup early to hide their latency better. */
490 if (is_tmu_write(waddr_add) || is_tmu_write(waddr_mul))
491 return next_score;
492 next_score++;
493
494 return baseline_score;
495 }
496
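/* Picks the highest-priority DAG head that satisfies the hardware's timing
 * rules and, when prev_inst is non-zero, can be merged with it into a
 * single dual-issue instruction.
 */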
497 static struct schedule_node *
498 choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
499 struct simple_node *schedule_list,
500 uint64_t prev_inst)
501 {
502 struct schedule_node *chosen = NULL;
503 struct simple_node *node;
504 int chosen_prio = 0;
505
506 foreach(node, schedule_list) {
507 struct schedule_node *n = (struct schedule_node *)node;
508 uint64_t inst = n->inst->inst;
509
510 /* "An instruction must not read from a location in physical
511 * regfile A or B that was written to by the previous
512 * instruction."
513 */
514 if (reads_too_soon_after_write(scoreboard, inst))
515 continue;
516
517 /* "A scoreboard wait must not occur in the first two
518 * instructions of a fragment shader. This is either the
519 * explicit Wait for Scoreboard signal or an implicit wait
520 * with the first tile-buffer read or write instruction."
521 */
522 if (pixel_scoreboard_too_soon(scoreboard, inst))
523 continue;
524
525 /* If we're trying to pair with another instruction, check
526 * that they're compatible.
527 */
528 if (prev_inst != 0) {
529 inst = qpu_merge_inst(prev_inst, inst);
530 if (!inst)
531 continue;
532 }
533
534 int prio = get_instruction_priority(inst);
535
536 /* Found a valid instruction. If nothing better comes along,
537 * this one works.
538 */
539 if (!chosen) {
540 chosen = n;
541 chosen_prio = prio;
542 continue;
543 }
544
545 if (prio > chosen_prio) {
546 chosen = n;
547 chosen_prio = prio;
548 } else if (prio < chosen_prio) {
549 continue;
550 }
551 }
552
553 return chosen;
554 }
555
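/* Records which regfile locations the chosen instruction wrote (honoring
 * the WS swap bit) and when it last kicked off an SFU operation, for the
 * timing checks above.
 */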
556 static void
557 update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
558 uint64_t inst)
559 {
560 uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
561 uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
562
563 if (!(inst & QPU_WS)) {
564 scoreboard->last_waddr_a = waddr_add;
565 scoreboard->last_waddr_b = waddr_mul;
566 } else {
567 scoreboard->last_waddr_b = waddr_add;
568 scoreboard->last_waddr_a = waddr_mul;
569 }
570
571 if ((waddr_add >= QPU_W_SFU_RECIP && waddr_add <= QPU_W_SFU_LOG) ||
572 (waddr_mul >= QPU_W_SFU_RECIP && waddr_mul <= QPU_W_SFU_LOG)) {
573 scoreboard->last_sfu_write_tick = scoreboard->tick;
574 }
575 }
576
577 static void
578 dump_state(struct simple_node *schedule_list)
579 {
580 struct simple_node *node;
581
582 uint32_t i = 0;
583 foreach(node, schedule_list) {
584 struct schedule_node *n = (struct schedule_node *)node;
585
586 fprintf(stderr, "%3d: ", i++);
587 vc4_qpu_disasm(&n->inst->inst, 1);
588 fprintf(stderr, "\n");
589
590 for (int i = 0; i < n->child_count; i++) {
591 struct schedule_node *child = n->children[i].node;
592 if (!child)
593 continue;
594
595 fprintf(stderr, " - ");
596 vc4_qpu_disasm(&child->inst->inst, 1);
597 fprintf(stderr, " (%d parents, %c)\n",
598 child->parent_count,
599 n->children[i].write_after_read ? 'w' : 'r');
600 }
601 }
602 }
603
604 /** Recursive computation of the delay member of a node. */
605 static void
606 compute_delay(struct schedule_node *n)
607 {
608 if (!n->child_count) {
609 n->delay = 1;
610 } else {
611 for (int i = 0; i < n->child_count; i++) {
612 if (!n->children[i].node->delay)
613 compute_delay(n->children[i].node);
614 n->delay = MAX2(n->delay,
615 n->children[i].node->delay + 1);
616 }
617 }
618 }
619
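/* Removes the edges out of a newly scheduled node, promoting children with
 * no remaining parents to the ready list. With war_only set, only
 * write-after-read edges are released, so those children can still be
 * considered for pairing with the instruction that was just chosen.
 */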
620 static void
621 mark_instruction_scheduled(struct simple_node *schedule_list,
622 struct schedule_node *node,
623 bool war_only)
624 {
625 if (!node)
626 return;
627
628 for (int i = node->child_count - 1; i >= 0; i--) {
629 struct schedule_node *child =
630 node->children[i].node;
631
632 if (!child)
633 continue;
634
635 if (war_only && !node->children[i].write_after_read)
636 continue;
637
638 child->parent_count--;
639 if (child->parent_count == 0)
640 insert_at_head(schedule_list, &child->link);
641
642 node->children[i].node = NULL;
643 }
644 }
645
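/* Core list-scheduling loop: repeatedly pick a ready instruction (or emit a
 * NOP if nothing qualifies), try to pair a second instruction with it,
 * serialize the result, and release the scheduled nodes' children.
 */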
646 static void
647 schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
648 {
649 struct simple_node *node, *t;
650 struct choose_scoreboard scoreboard;
651
652 memset(&scoreboard, 0, sizeof(scoreboard));
653 scoreboard.last_waddr_a = ~0;
654 scoreboard.last_waddr_b = ~0;
655 scoreboard.last_sfu_write_tick = -10;
656
657 if (debug) {
658 fprintf(stderr, "initial deps:\n");
659 dump_state(schedule_list);
660 fprintf(stderr, "\n");
661 }
662
663 /* Remove non-DAG heads from the list. */
664 foreach_s(node, t, schedule_list) {
665 struct schedule_node *n = (struct schedule_node *)node;
666
667 if (n->parent_count != 0)
668 remove_from_list(&n->link);
669 }
670
671 while (!is_empty_list(schedule_list)) {
672 struct schedule_node *chosen =
673 choose_instruction_to_schedule(&scoreboard,
674 schedule_list,
675 0);
676 struct schedule_node *merge = NULL;
677
678 /* If there are no valid instructions to schedule, drop a NOP
679 * in.
680 */
681 uint64_t inst = chosen ? chosen->inst->inst : qpu_NOP();
682
683 if (debug) {
684 fprintf(stderr, "current list:\n");
685 dump_state(schedule_list);
686 fprintf(stderr, "chose: ");
687 vc4_qpu_disasm(&inst, 1);
688 fprintf(stderr, "\n");
689 }
690
691 /* Schedule this instruction onto the QPU list. Also try to
692 * find an instruction to pair with it.
693 */
694 if (chosen) {
695 remove_from_list(&chosen->link);
696 mark_instruction_scheduled(schedule_list, chosen, true);
697
698 merge = choose_instruction_to_schedule(&scoreboard,
699 schedule_list,
700 inst);
701 if (merge) {
702 remove_from_list(&merge->link);
703 inst = qpu_merge_inst(inst, merge->inst->inst);
704 assert(inst != 0);
705
706 if (debug) {
707 fprintf(stderr, "merging: ");
708 vc4_qpu_disasm(&merge->inst->inst, 1);
709 fprintf(stderr, "\n");
710 fprintf(stderr, "resulting in: ");
711 vc4_qpu_disasm(&inst, 1);
712 fprintf(stderr, "\n");
713 }
714 }
715 }
716
717 if (debug) {
718 fprintf(stderr, "\n");
719 }
720
721 qpu_serialize_one_inst(c, inst);
722
723 update_scoreboard_for_chosen(&scoreboard, inst);
724
725 /* Now that we've scheduled a new instruction, some of its
726 * children may have had their last parent scheduled, so they
727 * can be promoted to the list of instructions ready to be
728 * scheduled.
729 */
730 mark_instruction_scheduled(schedule_list, chosen, false);
731 mark_instruction_scheduled(schedule_list, merge, false);
732
733 scoreboard.tick++;
734 }
735 }
736
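/* Entry point: wraps each queued instruction in a scheduling node, builds
 * the dependency DAG over c->qpu_inst_list, and emits the scheduled (and
 * possibly pair-merged) instruction stream through qpu_serialize_one_inst().
 */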
737 void
738 qpu_schedule_instructions(struct vc4_compile *c)
739 {
740 void *mem_ctx = ralloc_context(NULL);
741 struct simple_node schedule_list;
742 struct simple_node *node;
743
744 make_empty_list(&schedule_list);
745
746 if (debug) {
747 fprintf(stderr, "Pre-schedule instructions\n");
748 foreach(node, &c->qpu_inst_list) {
749 struct queued_qpu_inst *q =
750 (struct queued_qpu_inst *)node;
751 vc4_qpu_disasm(&q->inst, 1);
752 fprintf(stderr, "\n");
753 }
754 fprintf(stderr, "\n");
755 }
756
757 /* Wrap each instruction in a scheduler structure. */
758 while (!is_empty_list(&c->qpu_inst_list)) {
759 struct queued_qpu_inst *inst =
760 (struct queued_qpu_inst *)c->qpu_inst_list.next;
761 struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);
762
763 n->inst = inst;
764 remove_from_list(&inst->link);
765 insert_at_tail(&schedule_list, &n->link);
766 }
767
768 calculate_forward_deps(c, &schedule_list);
769 calculate_reverse_deps(c, &schedule_list);
770
771 foreach(node, &schedule_list) {
772 struct schedule_node *n = (struct schedule_node *)node;
773 compute_delay(n);
774 }
775
776 schedule_instructions(c, &schedule_list);
777
778 if (debug) {
779 fprintf(stderr, "Post-schedule instructions\n");
780 vc4_qpu_disasm(c->qpu_insts, c->qpu_inst_count);
781 fprintf(stderr, "\n");
782 }
783
784 ralloc_free(mem_ctx);
785 }