vc4: Populate the delay field better, and schedule high delay first.
[mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_schedule.c
/*
 * Copyright © 2010 Intel Corporation
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file vc4_qpu_schedule.c
 *
 * The basic model of the list scheduler is to take a basic block, compute a
 * DAG of the dependencies, and make a list of the DAG heads. Heuristically
 * pick a DAG head, then put all the children that are now DAG heads into the
 * list of things to schedule.
 *
 * The goal of scheduling here is to pack pairs of operations together in a
 * single QPU instruction.
 */

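/* As an illustrative example of that goal (a sketch, not output of this
 * file): two independent ALU operations, such as an fadd on the add pipe and
 * an fmul on the mul pipe, can be packed by qpu_merge_inst() into one 64-bit
 * QPU instruction, provided their signals and register accesses don't
 * conflict (qpu_merge_inst() returns 0 when they can't be combined).
 */
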
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"

static bool debug;

struct schedule_node_child;

struct schedule_node {
        struct simple_node link;
        struct queued_qpu_inst *inst;
        struct schedule_node_child *children;
        uint32_t child_count;
        uint32_t child_array_size;
        uint32_t parent_count;

        /**
         * Minimum number of cycles from scheduling this instruction until the
         * end of the program, based on the slowest dependency chain through
         * the children.
         */
        uint32_t delay;

        /**
         * Cycles between this instruction being scheduled and when its result
         * can be consumed.
         */
        uint32_t latency;
};

struct schedule_node_child {
        struct schedule_node *node;
        bool write_after_read;
};

/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 */
enum direction { F, R };

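/**
 * Tracks, per register file entry, accumulator, and shared hardware resource,
 * the last schedule_node to access it, so that calculate_deps() can add the
 * corresponding read/write dependency edges for each new instruction.
 */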
struct schedule_state {
        struct schedule_node *last_r[6];
        struct schedule_node *last_ra[32];
        struct schedule_node *last_rb[32];
        struct schedule_node *last_sf;
        struct schedule_node *last_vpm_read;
        struct schedule_node *last_unif_read;
        struct schedule_node *last_tmu_write;
        struct schedule_node *last_tlb;
        struct schedule_node *last_vpm;
        enum direction dir;
};

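/**
 * Adds a dependency edge from @before to @after in the DAG (the two are
 * swapped when walking the block in reverse), unless an identical edge is
 * already recorded.
 */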
static void
add_dep(struct schedule_state *state,
        struct schedule_node *before,
        struct schedule_node *after,
        bool write)
{
        bool write_after_read = !write && state->dir == R;

        if (!before || !after)
                return;

        assert(before != after);

        if (state->dir == R) {
                struct schedule_node *t = before;
                before = after;
                after = t;
        }

        for (int i = 0; i < before->child_count; i++) {
                if (before->children[i].node == after &&
                    (before->children[i].write_after_read == write_after_read)) {
                        return;
                }
        }

        if (before->child_array_size <= before->child_count) {
                before->child_array_size = MAX2(before->child_array_size * 2, 16);
                before->children = reralloc(before, before->children,
                                            struct schedule_node_child,
                                            before->child_array_size);
        }

        before->children[before->child_count].node = after;
        before->children[before->child_count].write_after_read =
                write_after_read;
        before->child_count++;
        after->parent_count++;
}

static void
add_read_dep(struct schedule_state *state,
             struct schedule_node *before,
             struct schedule_node *after)
{
        add_dep(state, before, after, false);
}

static void
add_write_dep(struct schedule_state *state,
              struct schedule_node **before,
              struct schedule_node *after)
{
        add_dep(state, *before, after, true);
        *before = after;
}

static bool
qpu_writes_r4(uint64_t inst)
{
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        switch(sig) {
        case QPU_SIG_COLOR_LOAD:
        case QPU_SIG_LOAD_TMU0:
        case QPU_SIG_LOAD_TMU1:
        case QPU_SIG_ALPHA_MASK_LOAD:
                return true;
        default:
                return false;
        }
}

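/**
 * Adds dependencies for the raddr_a or raddr_b field of an instruction:
 * reads of the physical register files, plus the special read addresses
 * (varyings, VPM reads, and uniform reads) whose ordering has to be
 * maintained.
 */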
static void
process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t raddr, bool is_a)
{
        switch (raddr) {
        case QPU_R_VARY:
                add_write_dep(state, &state->last_r[5], n);
                break;

        case QPU_R_VPM:
                add_write_dep(state, &state->last_vpm_read, n);
                break;

        case QPU_R_UNIF:
                add_write_dep(state, &state->last_unif_read, n);
                break;

        case QPU_R_NOP:
        case QPU_R_ELEM_QPU:
        case QPU_R_XY_PIXEL_COORD:
        case QPU_R_MS_REV_FLAGS:
                break;

        default:
                if (raddr < 32) {
                        if (is_a)
                                add_read_dep(state, state->last_ra[raddr], n);
                        else
                                add_read_dep(state, state->last_rb[raddr], n);
                } else {
                        fprintf(stderr, "unknown raddr %d\n", raddr);
                        abort();
                }
                break;
        }
}

static bool
is_tmu_write(uint32_t waddr)
{
        switch (waddr) {
        case QPU_W_TMU0_S:
        case QPU_W_TMU0_T:
        case QPU_W_TMU0_R:
        case QPU_W_TMU0_B:
        case QPU_W_TMU1_S:
        case QPU_W_TMU1_T:
        case QPU_W_TMU1_R:
        case QPU_W_TMU1_B:
                return true;
        default:
                return false;
        }
}

static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
                 uint32_t mux)
{
        if (mux != QPU_MUX_A && mux != QPU_MUX_B)
                add_read_dep(state, state->last_r[mux], n);
}


static bool
is_direct_tmu_read(uint64_t inst)
{
        /* If it's a direct read, we happen to structure the code such that
         * there's an explicit uniform read in the instruction (for kernel
         * texture reloc processing).
         */
        return (QPU_GET_FIELD(inst, QPU_RADDR_A) == QPU_R_UNIF ||
                QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_UNIF);
}

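/**
 * Adds dependencies for the add- or mul-pipe write address of an
 * instruction: writes to the physical register files, the accumulators, the
 * SFU, the VPM, the TLB, and the TMUs, each of which has its own ordering
 * requirements.
 */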
static void
process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t waddr, bool is_add)
{
        uint64_t inst = n->inst->inst;
        bool is_a = is_add ^ ((inst & QPU_WS) != 0);

        if (waddr < 32) {
                if (is_a) {
                        add_write_dep(state, &state->last_ra[waddr], n);
                } else {
                        add_write_dep(state, &state->last_rb[waddr], n);
                }
        } else if (is_tmu_write(waddr)) {
                add_write_dep(state, &state->last_tmu_write, n);

                /* There is an implicit uniform read in texture ops in
                 * hardware, unless this is a direct-addressed uniform read,
                 * so we need to keep it in the same order as the other
                 * uniforms.
                 */
                if (!is_direct_tmu_read(n->inst->inst))
                        add_write_dep(state, &state->last_unif_read, n);
        } else if (qpu_waddr_is_tlb(waddr)) {
                add_write_dep(state, &state->last_tlb, n);
        } else {
                switch (waddr) {
                case QPU_W_ACC0:
                case QPU_W_ACC1:
                case QPU_W_ACC2:
                case QPU_W_ACC3:
                case QPU_W_ACC5:
                        add_write_dep(state, &state->last_r[waddr - QPU_W_ACC0],
                                      n);
                        break;

                case QPU_W_VPM:
                        add_write_dep(state, &state->last_vpm, n);
                        break;

                case QPU_W_VPMVCD_SETUP:
                        if (is_a)
                                add_write_dep(state, &state->last_vpm_read, n);
                        else
                                add_write_dep(state, &state->last_vpm, n);
                        break;

                case QPU_W_SFU_RECIP:
                case QPU_W_SFU_RECIPSQRT:
                case QPU_W_SFU_EXP:
                case QPU_W_SFU_LOG:
                        add_write_dep(state, &state->last_r[4], n);
                        break;

                case QPU_W_TLB_STENCIL_SETUP:
                        /* This isn't a TLB operation that does things like
                         * implicitly lock the scoreboard, but it does have to
                         * appear before TLB_Z, and each of the TLB_STENCILs
                         * have to schedule in the same order relative to each
                         * other.
                         */
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case QPU_W_NOP:
                        break;

                default:
                        fprintf(stderr, "Unknown waddr %d\n", waddr);
                        abort();
                }
        }
}

static void
process_cond_deps(struct schedule_state *state, struct schedule_node *n,
                  uint32_t cond)
{
        switch (cond) {
        case QPU_COND_NEVER:
        case QPU_COND_ALWAYS:
                break;
        default:
                add_read_dep(state, state->last_sf, n);
                break;
        }
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
calculate_deps(struct schedule_state *state, struct schedule_node *n)
{
        uint64_t inst = n->inst->inst;
        uint32_t add_op = QPU_GET_FIELD(inst, QPU_OP_ADD);
        uint32_t mul_op = QPU_GET_FIELD(inst, QPU_OP_MUL);
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
        uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
        uint32_t mul_a = QPU_GET_FIELD(inst, QPU_MUL_A);
        uint32_t mul_b = QPU_GET_FIELD(inst, QPU_MUL_B);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        if (sig != QPU_SIG_LOAD_IMM) {
                process_raddr_deps(state, n, raddr_a, true);
                process_raddr_deps(state, n, raddr_b, false);
        }

        if (add_op != QPU_A_NOP) {
                process_mux_deps(state, n, add_a);
                process_mux_deps(state, n, add_b);
        }
        if (mul_op != QPU_M_NOP) {
                process_mux_deps(state, n, mul_a);
                process_mux_deps(state, n, mul_b);
        }

        process_waddr_deps(state, n, waddr_add, true);
        process_waddr_deps(state, n, waddr_mul, false);
        if (qpu_writes_r4(inst))
                add_write_dep(state, &state->last_r[4], n);

        switch (sig) {
        case QPU_SIG_SW_BREAKPOINT:
        case QPU_SIG_NONE:
        case QPU_SIG_THREAD_SWITCH:
        case QPU_SIG_LAST_THREAD_SWITCH:
        case QPU_SIG_SMALL_IMM:
        case QPU_SIG_LOAD_IMM:
                break;

        case QPU_SIG_LOAD_TMU0:
        case QPU_SIG_LOAD_TMU1:
                /* TMU loads are coming from a FIFO, so ordering is important.
                 */
                add_write_dep(state, &state->last_tmu_write, n);
                break;

        case QPU_SIG_COLOR_LOAD:
                add_read_dep(state, state->last_tlb, n);
                break;

        case QPU_SIG_PROG_END:
        case QPU_SIG_WAIT_FOR_SCOREBOARD:
        case QPU_SIG_SCOREBOARD_UNLOCK:
        case QPU_SIG_COVERAGE_LOAD:
        case QPU_SIG_COLOR_LOAD_END:
        case QPU_SIG_ALPHA_MASK_LOAD:
        case QPU_SIG_BRANCH:
                fprintf(stderr, "Unhandled signal bits %d\n", sig);
                abort();
        }

        process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_ADD));
        process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_MUL));
        if (inst & QPU_SF)
                add_write_dep(state, &state->last_sf, n);
}

static void
calculate_forward_deps(struct vc4_compile *c, struct simple_node *schedule_list)
{
        struct simple_node *node;
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dir = F;

        foreach(node, schedule_list)
                calculate_deps(&state, (struct schedule_node *)node);
}

static void
calculate_reverse_deps(struct vc4_compile *c, struct simple_node *schedule_list)
{
        struct simple_node *node;
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dir = R;

        for (node = schedule_list->prev; schedule_list != node; node = node->prev) {
                calculate_deps(&state, (struct schedule_node *)node);
        }
}

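/**
 * State tracked while emitting instructions: the current instruction tick,
 * the tick of the last SFU write, and the regfile A/B addresses written by
 * the previous instruction, used to reject candidates that would violate the
 * QPU's timing rules.
 */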
struct choose_scoreboard {
        int tick;
        int last_sfu_write_tick;
        uint32_t last_waddr_a, last_waddr_b;
};

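/**
 * Returns true if the instruction reads a physical register written by the
 * previous instruction, or reads r4 too soon after an SFU write, both of
 * which are forbidden by the QPU's register timing rules.
 */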
static bool
reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst)
{
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        uint32_t src_muxes[] = {
                QPU_GET_FIELD(inst, QPU_ADD_A),
                QPU_GET_FIELD(inst, QPU_ADD_B),
                QPU_GET_FIELD(inst, QPU_MUL_A),
                QPU_GET_FIELD(inst, QPU_MUL_B),
        };
        for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
                if ((src_muxes[i] == QPU_MUX_A &&
                     raddr_a < 32 &&
                     scoreboard->last_waddr_a == raddr_a) ||
                    (src_muxes[i] == QPU_MUX_B &&
                     raddr_b < 32 &&
                     scoreboard->last_waddr_b == raddr_b)) {
                        return true;
                }

                if (src_muxes[i] == QPU_MUX_R4) {
                        if (scoreboard->tick -
                            scoreboard->last_sfu_write_tick <= 2) {
                                return true;
                        }
                }
        }

        return false;
}

static bool
pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, uint64_t inst)
{
        return (scoreboard->tick < 2 && qpu_inst_is_tlb(inst));
}

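/**
 * Ranks instructions for the scheduling heuristic: TMU texture setup writes
 * score highest (issue them early to hide latency), then ordinary
 * instructions, then TMU result loads, with TLB accesses lowest so that they
 * happen as late as possible.
 */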
static int
get_instruction_priority(uint64_t inst)
{
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
        uint32_t baseline_score;
        uint32_t next_score = 0;

        /* Schedule TLB operations as late as possible, to get more
         * parallelism between shaders.
         */
        if (qpu_inst_is_tlb(inst))
                return next_score;
        next_score++;

        /* Schedule texture read results collection late to hide latency. */
        if (sig == QPU_SIG_LOAD_TMU0 || sig == QPU_SIG_LOAD_TMU1)
                return next_score;
        next_score++;

        /* Default score for things that aren't otherwise special. */
        baseline_score = next_score;
        next_score++;

        /* Schedule texture read setup early to hide their latency better. */
        if (is_tmu_write(waddr_add) || is_tmu_write(waddr_mul))
                return next_score;
        next_score++;

        return baseline_score;
}

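/**
 * Picks the next instruction from the list of ready DAG heads, skipping any
 * that would violate the scoreboard rules (or that can't be merged with
 * prev_inst, when pairing), and preferring first higher priority and then
 * larger remaining delay to the end of the program.
 */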
static struct schedule_node *
choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
                               struct simple_node *schedule_list,
                               uint64_t prev_inst)
{
        struct schedule_node *chosen = NULL;
        struct simple_node *node;
        int chosen_prio = 0;

        foreach(node, schedule_list) {
                struct schedule_node *n = (struct schedule_node *)node;
                uint64_t inst = n->inst->inst;

                /* "An instruction must not read from a location in physical
                 * regfile A or B that was written to by the previous
                 * instruction."
                 */
                if (reads_too_soon_after_write(scoreboard, inst))
                        continue;

                /* "A scoreboard wait must not occur in the first two
                 * instructions of a fragment shader. This is either the
                 * explicit Wait for Scoreboard signal or an implicit wait
                 * with the first tile-buffer read or write instruction."
                 */
                if (pixel_scoreboard_too_soon(scoreboard, inst))
                        continue;

                /* If we're trying to pair with another instruction, check
                 * that they're compatible.
                 */
                if (prev_inst != 0) {
                        inst = qpu_merge_inst(prev_inst, inst);
                        if (!inst)
                                continue;
                }

                int prio = get_instruction_priority(inst);

                /* Found a valid instruction. If nothing better comes along,
                 * this one works.
                 */
                if (!chosen) {
                        chosen = n;
                        chosen_prio = prio;
                        continue;
                }

                if (prio > chosen_prio) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (prio < chosen_prio) {
                        continue;
                }

                if (n->delay > chosen->delay) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        return chosen;
}

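/**
 * Records the side effects of the instruction we just emitted: which regfile
 * A/B addresses it wrote (accounting for write-swap), and whether it started
 * an SFU operation, so later candidates can be checked against them.
 */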
static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
                             uint64_t inst)
{
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);

        if (!(inst & QPU_WS)) {
                scoreboard->last_waddr_a = waddr_add;
                scoreboard->last_waddr_b = waddr_mul;
        } else {
                scoreboard->last_waddr_b = waddr_add;
                scoreboard->last_waddr_a = waddr_mul;
        }

        if ((waddr_add >= QPU_W_SFU_RECIP && waddr_add <= QPU_W_SFU_LOG) ||
            (waddr_mul >= QPU_W_SFU_RECIP && waddr_mul <= QPU_W_SFU_LOG)) {
                scoreboard->last_sfu_write_tick = scoreboard->tick;
        }
}

static void
dump_state(struct simple_node *schedule_list)
{
        struct simple_node *node;

        uint32_t i = 0;
        foreach(node, schedule_list) {
                struct schedule_node *n = (struct schedule_node *)node;

                fprintf(stderr, "%3d: ", i++);
                vc4_qpu_disasm(&n->inst->inst, 1);
                fprintf(stderr, "\n");

                for (int i = 0; i < n->child_count; i++) {
                        struct schedule_node *child = n->children[i].node;
                        if (!child)
                                continue;

                        fprintf(stderr, " - ");
                        vc4_qpu_disasm(&child->inst->inst, 1);
                        fprintf(stderr, " (%d parents, %c)\n",
                                child->parent_count,
                                n->children[i].write_after_read ? 'w' : 'r');
                }
        }
}

/** Recursive computation of the delay member of a node. */
static void
compute_delay(struct schedule_node *n)
{
        if (!n->child_count) {
                n->delay = 1;
        } else {
                for (int i = 0; i < n->child_count; i++) {
                        if (!n->children[i].node->delay)
                                compute_delay(n->children[i].node);
                        n->delay = MAX2(n->delay,
                                        n->children[i].node->delay + n->latency);
                }
        }
}

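/**
 * Removes the DAG edges from a newly scheduled node to its children. With
 * war_only set, only write-after-read edges are dropped, which lets an
 * instruction that overwrites one of the chosen instruction's source
 * registers become ready for pairing. Children whose parent count reaches
 * zero are put back on the schedule list as new DAG heads.
 */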
static void
mark_instruction_scheduled(struct simple_node *schedule_list,
                           struct schedule_node *node,
                           bool war_only)
{
        if (!node)
                return;

        for (int i = node->child_count - 1; i >= 0; i--) {
                struct schedule_node *child =
                        node->children[i].node;

                if (!child)
                        continue;

                if (war_only && !node->children[i].write_after_read)
                        continue;

                child->parent_count--;
                if (child->parent_count == 0)
                        insert_at_head(schedule_list, &child->link);

                node->children[i].node = NULL;
        }
}

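/**
 * Main scheduling loop: repeatedly choose a ready instruction (or a NOP if
 * nothing can legally issue), try to pair a second instruction with it,
 * serialize the result, and update the scoreboard and the set of ready DAG
 * heads.
 */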
static void
schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
{
        struct simple_node *node, *t;
        struct choose_scoreboard scoreboard;

        memset(&scoreboard, 0, sizeof(scoreboard));
        scoreboard.last_waddr_a = ~0;
        scoreboard.last_waddr_b = ~0;
        scoreboard.last_sfu_write_tick = -10;

        if (debug) {
                fprintf(stderr, "initial deps:\n");
                dump_state(schedule_list);
                fprintf(stderr, "\n");
        }

        /* Remove non-DAG heads from the list. */
        foreach_s(node, t, schedule_list) {
                struct schedule_node *n = (struct schedule_node *)node;

                if (n->parent_count != 0)
                        remove_from_list(&n->link);
        }

        while (!is_empty_list(schedule_list)) {
                struct schedule_node *chosen =
                        choose_instruction_to_schedule(&scoreboard,
                                                       schedule_list,
                                                       0);
                struct schedule_node *merge = NULL;

                /* If there are no valid instructions to schedule, drop a NOP
                 * in.
                 */
                uint64_t inst = chosen ? chosen->inst->inst : qpu_NOP();

                if (debug) {
                        fprintf(stderr, "current list:\n");
                        dump_state(schedule_list);
                        fprintf(stderr, "chose: ");
                        vc4_qpu_disasm(&inst, 1);
                        fprintf(stderr, "\n");
                }

                /* Schedule this instruction onto the QPU list. Also try to
                 * find an instruction to pair with it.
                 */
                if (chosen) {
                        remove_from_list(&chosen->link);
                        mark_instruction_scheduled(schedule_list, chosen, true);

                        merge = choose_instruction_to_schedule(&scoreboard,
                                                               schedule_list,
                                                               inst);
                        if (merge) {
                                remove_from_list(&merge->link);
                                inst = qpu_merge_inst(inst, merge->inst->inst);
                                assert(inst != 0);

                                if (debug) {
                                        fprintf(stderr, "merging: ");
                                        vc4_qpu_disasm(&merge->inst->inst, 1);
                                        fprintf(stderr, "\n");
                                        fprintf(stderr, "resulting in: ");
                                        vc4_qpu_disasm(&inst, 1);
                                        fprintf(stderr, "\n");
                                }
                        }
                }

                if (debug) {
                        fprintf(stderr, "\n");
                }

                qpu_serialize_one_inst(c, inst);

                update_scoreboard_for_chosen(&scoreboard, inst);

                /* Now that we've scheduled a new instruction, some of its
                 * children can be promoted to the list of instructions ready to
                 * be scheduled. Update the children's unblocked time for this
                 * DAG edge as we do so.
                 */
                mark_instruction_scheduled(schedule_list, chosen, false);
                mark_instruction_scheduled(schedule_list, merge, false);

                scoreboard.tick++;
        }
}

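/**
 * Approximate number of cycles between writing the given waddr and when its
 * result can be used: 2 for the physical register files, 3 for the SFU, a
 * large value for the TMUs, and 1 otherwise.
 */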
static uint32_t waddr_latency(uint32_t waddr)
{
        if (waddr < 32)
                return 2;

        /* Some huge number, really. */
        if (waddr >= QPU_W_TMU0_S && waddr <= QPU_W_TMU1_B)
                return 10;

        switch(waddr) {
        case QPU_W_SFU_RECIP:
        case QPU_W_SFU_RECIPSQRT:
        case QPU_W_SFU_EXP:
        case QPU_W_SFU_LOG:
                return 3;
        default:
                return 1;
        }
}

static uint32_t
instruction_latency(uint64_t inst)
{
        return MAX2(waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_ADD)),
                    waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_MUL)));
}

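/**
 * Top-level entry point: wraps each queued QPU instruction in a
 * schedule_node, builds the dependency DAG in both directions, computes each
 * node's delay, and then emits the scheduled (and, where possible, paired)
 * instructions into c->qpu_insts.
 */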
void
qpu_schedule_instructions(struct vc4_compile *c)
{
        void *mem_ctx = ralloc_context(NULL);
        struct simple_node schedule_list;
        struct simple_node *node;

        make_empty_list(&schedule_list);

        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
                foreach(node, &c->qpu_inst_list) {
                        struct queued_qpu_inst *q =
                                (struct queued_qpu_inst *)node;
                        vc4_qpu_disasm(&q->inst, 1);
                        fprintf(stderr, "\n");
                }
                fprintf(stderr, "\n");
        }

        /* Wrap each instruction in a scheduler structure. */
        while (!is_empty_list(&c->qpu_inst_list)) {
                struct queued_qpu_inst *inst =
                        (struct queued_qpu_inst *)c->qpu_inst_list.next;
                struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);

                n->inst = inst;
                n->latency = instruction_latency(inst->inst);

                remove_from_list(&inst->link);
                insert_at_tail(&schedule_list, &n->link);
        }

        calculate_forward_deps(c, &schedule_list);
        calculate_reverse_deps(c, &schedule_list);

        foreach(node, &schedule_list) {
                struct schedule_node *n = (struct schedule_node *)node;
                compute_delay(n);
        }

        schedule_instructions(c, &schedule_list);

        if (debug) {
                fprintf(stderr, "Post-schedule instructions\n");
                vc4_qpu_disasm(c->qpu_insts, c->qpu_inst_count);
                fprintf(stderr, "\n");
        }

        ralloc_free(mem_ctx);
}