vc4: Add support for scheduling of branch instructions.
[mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_schedule.c
1 /*
2 * Copyright © 2010 Intel Corporation
3 * Copyright © 2014 Broadcom
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25 /**
26 * @file vc4_qpu_schedule.c
27 *
28 * The basic model of the list scheduler is to take a basic block, compute a
29 * DAG of the dependencies, and make a list of the DAG heads. Heuristically
30 * pick a DAG head, then put all the children that are now DAG heads into the
31 * list of things to schedule.
32 *
33 * The goal of scheduling here is to pack pairs of operations together in a
34 * single QPU instruction.
35 */
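/*
 * Concretely, the flow implemented in this file (a structural sketch, not
 * additional behavior) is:
 *
 *   qpu_schedule_instructions()
 *     -> qpu_schedule_instructions_block() for each block:
 *          - wrap each queued_qpu_inst in a schedule_node
 *          - calculate_forward_deps() / calculate_reverse_deps()
 *          - compute_delay() for the critical-path tie-breaker
 *          - schedule_instructions(): repeatedly pick a DAG head with
 *            choose_instruction_to_schedule(), try to pair a second
 *            instruction into the same QPU instruction with
 *            qpu_merge_inst(), and emit the result.
 */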
36
37 #include "vc4_qir.h"
38 #include "vc4_qpu.h"
39 #include "util/ralloc.h"
40
41 static bool debug;
42
43 struct schedule_node_child;
44
45 struct schedule_node {
46 struct list_head link;
47 struct queued_qpu_inst *inst;
48 struct schedule_node_child *children;
49 uint32_t child_count;
50 uint32_t child_array_size;
51 uint32_t parent_count;
52
53         /* Latest (parent schedule time + instruction_latency()) over all parents. */
54 uint32_t unblocked_time;
55
56 /**
57 * Minimum number of cycles from scheduling this instruction until the
58 * end of the program, based on the slowest dependency chain through
59 * the children.
60 */
61 uint32_t delay;
62
63 /**
64          * Cycles between this instruction being scheduled and when its result
65 * can be consumed.
66 */
67 uint32_t latency;
68
69 /**
70 * Which uniform from uniform_data[] this instruction read, or -1 if
71 * not reading a uniform.
72 */
73 int uniform;
74 };
75
76 struct schedule_node_child {
77 struct schedule_node *node;
78 bool write_after_read;
79 };
80
81 /* When walking the instructions in reverse, we need to swap before/after in
82 * add_dep().
83 */
84 enum direction { F, R };
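/* Example: given a read of ra3 followed in program order by a write of ra3,
 * the forward (F) walk adds no edge between them (its edges always start at
 * a write), while the reverse (R) walk adds a read -> write edge flagged
 * write_after_read, so the write can't be hoisted above the read.
 */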
85
86 struct schedule_state {
87 struct schedule_node *last_r[6];
88 struct schedule_node *last_ra[32];
89 struct schedule_node *last_rb[32];
90 struct schedule_node *last_sf;
91 struct schedule_node *last_vpm_read;
92 struct schedule_node *last_tmu_write;
93 struct schedule_node *last_tlb;
94 struct schedule_node *last_vpm;
95 enum direction dir;
96 /* Estimated cycle when the current instruction would start. */
97 uint32_t time;
98 };
99
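/* Records a scheduling edge meaning "before must issue before after".
 * Duplicate edges (same child with the same write_after_read flag) are
 * dropped, and the children array grows geometrically via reralloc().
 */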
100 static void
101 add_dep(struct schedule_state *state,
102 struct schedule_node *before,
103 struct schedule_node *after,
104 bool write)
105 {
106 bool write_after_read = !write && state->dir == R;
107
108 if (!before || !after)
109 return;
110
111 assert(before != after);
112
113 if (state->dir == R) {
114 struct schedule_node *t = before;
115 before = after;
116 after = t;
117 }
118
119 for (int i = 0; i < before->child_count; i++) {
120 if (before->children[i].node == after &&
121 (before->children[i].write_after_read == write_after_read)) {
122 return;
123 }
124 }
125
126 if (before->child_array_size <= before->child_count) {
127 before->child_array_size = MAX2(before->child_array_size * 2, 16);
128 before->children = reralloc(before, before->children,
129 struct schedule_node_child,
130 before->child_array_size);
131 }
132
133 before->children[before->child_count].node = after;
134 before->children[before->child_count].write_after_read =
135 write_after_read;
136 before->child_count++;
137 after->parent_count++;
138 }
139
140 static void
141 add_read_dep(struct schedule_state *state,
142 struct schedule_node *before,
143 struct schedule_node *after)
144 {
145 add_dep(state, before, after, false);
146 }
147
148 static void
149 add_write_dep(struct schedule_state *state,
150 struct schedule_node **before,
151 struct schedule_node *after)
152 {
153 add_dep(state, *before, after, true);
154 *before = after;
155 }
156
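/* r4 has no writable register-file address; it is loaded implicitly by the
 * signaling bits checked below (TMU/color/alpha-mask loads).  SFU results
 * also land in r4, but those requests go through WADDR and are handled in
 * process_waddr_deps().
 */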
157 static bool
158 qpu_writes_r4(uint64_t inst)
159 {
160 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
161
162 switch(sig) {
163 case QPU_SIG_COLOR_LOAD:
164 case QPU_SIG_LOAD_TMU0:
165 case QPU_SIG_LOAD_TMU1:
166 case QPU_SIG_ALPHA_MASK_LOAD:
167 return true;
168 default:
169 return false;
170 }
171 }
172
173 static void
174 process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
175 uint32_t raddr, bool is_a)
176 {
177 switch (raddr) {
178 case QPU_R_VARY:
179 add_write_dep(state, &state->last_r[5], n);
180 break;
181
182 case QPU_R_VPM:
183 add_write_dep(state, &state->last_vpm_read, n);
184 break;
185
186 case QPU_R_UNIF:
187 case QPU_R_NOP:
188 case QPU_R_ELEM_QPU:
189 case QPU_R_XY_PIXEL_COORD:
190 case QPU_R_MS_REV_FLAGS:
191 break;
192
193 default:
194 if (raddr < 32) {
195 if (is_a)
196 add_read_dep(state, state->last_ra[raddr], n);
197 else
198 add_read_dep(state, state->last_rb[raddr], n);
199 } else {
200 fprintf(stderr, "unknown raddr %d\n", raddr);
201 abort();
202 }
203 break;
204 }
205 }
206
207 static bool
208 is_tmu_write(uint32_t waddr)
209 {
210 switch (waddr) {
211 case QPU_W_TMU0_S:
212 case QPU_W_TMU0_T:
213 case QPU_W_TMU0_R:
214 case QPU_W_TMU0_B:
215 case QPU_W_TMU1_S:
216 case QPU_W_TMU1_T:
217 case QPU_W_TMU1_R:
218 case QPU_W_TMU1_B:
219 return true;
220 default:
221 return false;
222 }
223 }
224
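/* Returns whether this instruction consumes an entry from the uniform
 * stream.  LOAD_IMM reuses the raddr fields as immediate payload and
 * SMALL_IMM reuses raddr_b, so those don't count as UNIF reads.  TMU writes
 * are counted as well: the scheduler treats a texture request as a uniform
 * read because it also pulls its texture-config parameters from the uniform
 * stream.
 */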
225 static bool
226 reads_uniform(uint64_t inst)
227 {
228 if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)
229 return false;
230
231 return (QPU_GET_FIELD(inst, QPU_RADDR_A) == QPU_R_UNIF ||
232 (QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_UNIF &&
233 QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM) ||
234 is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_ADD)) ||
235 is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_MUL)));
236 }
237
238 static void
239 process_mux_deps(struct schedule_state *state, struct schedule_node *n,
240 uint32_t mux)
241 {
242 if (mux != QPU_MUX_A && mux != QPU_MUX_B)
243 add_read_dep(state, state->last_r[mux], n);
244 }
245
246
247 static void
248 process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
249 uint32_t waddr, bool is_add)
250 {
251 uint64_t inst = n->inst->inst;
252 bool is_a = is_add ^ ((inst & QPU_WS) != 0);
253
254 if (waddr < 32) {
255 if (is_a) {
256 add_write_dep(state, &state->last_ra[waddr], n);
257 } else {
258 add_write_dep(state, &state->last_rb[waddr], n);
259 }
260 } else if (is_tmu_write(waddr)) {
261 add_write_dep(state, &state->last_tmu_write, n);
262 } else if (qpu_waddr_is_tlb(waddr) ||
263 waddr == QPU_W_MS_FLAGS) {
264 add_write_dep(state, &state->last_tlb, n);
265 } else {
266 switch (waddr) {
267 case QPU_W_ACC0:
268 case QPU_W_ACC1:
269 case QPU_W_ACC2:
270 case QPU_W_ACC3:
271 case QPU_W_ACC5:
272 add_write_dep(state, &state->last_r[waddr - QPU_W_ACC0],
273 n);
274 break;
275
276 case QPU_W_VPM:
277 add_write_dep(state, &state->last_vpm, n);
278 break;
279
280 case QPU_W_VPMVCD_SETUP:
281 if (is_a)
282 add_write_dep(state, &state->last_vpm_read, n);
283 else
284 add_write_dep(state, &state->last_vpm, n);
285 break;
286
287 case QPU_W_SFU_RECIP:
288 case QPU_W_SFU_RECIPSQRT:
289 case QPU_W_SFU_EXP:
290 case QPU_W_SFU_LOG:
291 add_write_dep(state, &state->last_r[4], n);
292 break;
293
294 case QPU_W_TLB_STENCIL_SETUP:
295                 /* This isn't a TLB operation that does things like
296                  * implicitly lock the scoreboard, but it does have to
297                  * appear before the TLB_Z write, and the TLB_STENCIL
298                  * setup writes have to be scheduled in the same order
299                  * relative to each other.
300                  */
301 add_write_dep(state, &state->last_tlb, n);
302 break;
303
304 case QPU_W_MS_FLAGS:
305 add_write_dep(state, &state->last_tlb, n);
306 break;
307
308 case QPU_W_NOP:
309 break;
310
311 default:
312 fprintf(stderr, "Unknown waddr %d\n", waddr);
313 abort();
314 }
315 }
316 }
317
318 static void
319 process_cond_deps(struct schedule_state *state, struct schedule_node *n,
320 uint32_t cond)
321 {
322 switch (cond) {
323 case QPU_COND_NEVER:
324 case QPU_COND_ALWAYS:
325 break;
326 default:
327 add_read_dep(state, state->last_sf, n);
328 break;
329 }
330 }
331
332 /**
333 * Common code for dependencies that need to be tracked both forward and
334 * backward.
335 *
336 * This is for things like "all reads of r4 have to happen between the r4
337 * writes that surround them".
338 */
339 static void
340 calculate_deps(struct schedule_state *state, struct schedule_node *n)
341 {
342 uint64_t inst = n->inst->inst;
343 uint32_t add_op = QPU_GET_FIELD(inst, QPU_OP_ADD);
344 uint32_t mul_op = QPU_GET_FIELD(inst, QPU_OP_MUL);
345 uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
346 uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
347 uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
348 uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
349 uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
350 uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
351 uint32_t mul_a = QPU_GET_FIELD(inst, QPU_MUL_A);
352 uint32_t mul_b = QPU_GET_FIELD(inst, QPU_MUL_B);
353 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
354
355 if (sig != QPU_SIG_LOAD_IMM) {
356 process_raddr_deps(state, n, raddr_a, true);
357 if (sig != QPU_SIG_SMALL_IMM &&
358 sig != QPU_SIG_BRANCH)
359 process_raddr_deps(state, n, raddr_b, false);
360 }
361
362 if (add_op != QPU_A_NOP) {
363 process_mux_deps(state, n, add_a);
364 process_mux_deps(state, n, add_b);
365 }
366 if (mul_op != QPU_M_NOP) {
367 process_mux_deps(state, n, mul_a);
368 process_mux_deps(state, n, mul_b);
369 }
370
371 process_waddr_deps(state, n, waddr_add, true);
372 process_waddr_deps(state, n, waddr_mul, false);
373 if (qpu_writes_r4(inst))
374 add_write_dep(state, &state->last_r[4], n);
375
376 switch (sig) {
377 case QPU_SIG_SW_BREAKPOINT:
378 case QPU_SIG_NONE:
379 case QPU_SIG_THREAD_SWITCH:
380 case QPU_SIG_LAST_THREAD_SWITCH:
381 case QPU_SIG_SMALL_IMM:
382 case QPU_SIG_LOAD_IMM:
383 break;
384
385 case QPU_SIG_LOAD_TMU0:
386 case QPU_SIG_LOAD_TMU1:
387                 /* TMU loads come back through a FIFO, so ordering is important.
388 */
389 add_write_dep(state, &state->last_tmu_write, n);
390 break;
391
392 case QPU_SIG_COLOR_LOAD:
393 add_read_dep(state, state->last_tlb, n);
394 break;
395
396 case QPU_SIG_BRANCH:
397 add_read_dep(state, state->last_sf, n);
398 break;
399
400 case QPU_SIG_PROG_END:
401 case QPU_SIG_WAIT_FOR_SCOREBOARD:
402 case QPU_SIG_SCOREBOARD_UNLOCK:
403 case QPU_SIG_COVERAGE_LOAD:
404 case QPU_SIG_COLOR_LOAD_END:
405 case QPU_SIG_ALPHA_MASK_LOAD:
406 fprintf(stderr, "Unhandled signal bits %d\n", sig);
407 abort();
408 }
409
410 process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_ADD));
411 process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_MUL));
412 if ((inst & QPU_SF) && sig != QPU_SIG_BRANCH)
413 add_write_dep(state, &state->last_sf, n);
414 }
415
416 static void
417 calculate_forward_deps(struct vc4_compile *c, struct list_head *schedule_list)
418 {
419 struct schedule_state state;
420
421 memset(&state, 0, sizeof(state));
422 state.dir = F;
423
424 list_for_each_entry(struct schedule_node, node, schedule_list, link)
425 calculate_deps(&state, node);
426 }
427
428 static void
429 calculate_reverse_deps(struct vc4_compile *c, struct list_head *schedule_list)
430 {
431 struct list_head *node;
432 struct schedule_state state;
433
434 memset(&state, 0, sizeof(state));
435 state.dir = R;
436
437 for (node = schedule_list->prev; schedule_list != node; node = node->prev) {
438 calculate_deps(&state, (struct schedule_node *)node);
439 }
440 }
441
442 struct choose_scoreboard {
443 int tick;
444 int last_sfu_write_tick;
445 uint32_t last_waddr_a, last_waddr_b;
446 };
447
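/* Checks two hazards: the rule quoted in choose_instruction_to_schedule()
 * that a physical regfile A/B location can't be read by the instruction
 * immediately after the one that wrote it, and that r4 can't be read until
 * more than two instructions after an SFU write.
 */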
448 static bool
449 reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst)
450 {
451 uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
452 uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
453 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
454 uint32_t src_muxes[] = {
455 QPU_GET_FIELD(inst, QPU_ADD_A),
456 QPU_GET_FIELD(inst, QPU_ADD_B),
457 QPU_GET_FIELD(inst, QPU_MUL_A),
458 QPU_GET_FIELD(inst, QPU_MUL_B),
459 };
460 for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
461 if ((src_muxes[i] == QPU_MUX_A &&
462 raddr_a < 32 &&
463 scoreboard->last_waddr_a == raddr_a) ||
464 (src_muxes[i] == QPU_MUX_B &&
465 sig != QPU_SIG_SMALL_IMM &&
466 raddr_b < 32 &&
467 scoreboard->last_waddr_b == raddr_b)) {
468 return true;
469 }
470
471 if (src_muxes[i] == QPU_MUX_R4) {
472 if (scoreboard->tick -
473 scoreboard->last_sfu_write_tick <= 2) {
474 return true;
475 }
476 }
477 }
478
479 return false;
480 }
481
482 static bool
483 pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, uint64_t inst)
484 {
485 return (scoreboard->tick < 2 && qpu_inst_is_tlb(inst));
486 }
487
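/* Priority for the greedy choice in choose_instruction_to_schedule();
 * higher scores are scheduled first.  With the checks below this works out
 * to: TMU setup writes (3) > ordinary instructions (2) > TMU result
 * loads (1) > TLB accesses (0).
 */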
488 static int
489 get_instruction_priority(uint64_t inst)
490 {
491 uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
492 uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
493 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
494 uint32_t baseline_score;
495 uint32_t next_score = 0;
496
497 /* Schedule TLB operations as late as possible, to get more
498 * parallelism between shaders.
499 */
500 if (qpu_inst_is_tlb(inst))
501 return next_score;
502 next_score++;
503
504         /* Schedule collection of texture read results late, to hide latency. */
505 if (sig == QPU_SIG_LOAD_TMU0 || sig == QPU_SIG_LOAD_TMU1)
506 return next_score;
507 next_score++;
508
509 /* Default score for things that aren't otherwise special. */
510 baseline_score = next_score;
511 next_score++;
512
513         /* Schedule texture read setup writes early, to better hide their latency. */
514 if (is_tmu_write(waddr_add) || is_tmu_write(waddr_mul))
515 return next_score;
516 next_score++;
517
518 return baseline_score;
519 }
520
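/* Picks the highest-priority DAG head that is legal at the current tick.
 * When prev_inst is non-NULL we're looking for a partner to pair into the
 * same QPU instruction, so the candidate is test-merged with
 * qpu_merge_inst() and at most one of the pair may read a uniform.
 */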
521 static struct schedule_node *
522 choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
523 struct list_head *schedule_list,
524 struct schedule_node *prev_inst)
525 {
526 struct schedule_node *chosen = NULL;
527 int chosen_prio = 0;
528
529 list_for_each_entry(struct schedule_node, n, schedule_list, link) {
530 uint64_t inst = n->inst->inst;
531
532 /* Don't choose the branch instruction until it's the last one
533 * left. XXX: We could potentially choose it before it's the
534 * last one, if the remaining instructions fit in the delay
535 * slots.
536 */
537 if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH &&
538 !list_is_singular(schedule_list)) {
539 continue;
540 }
541
542 /* "An instruction must not read from a location in physical
543 * regfile A or B that was written to by the previous
544 * instruction."
545 */
546 if (reads_too_soon_after_write(scoreboard, inst))
547 continue;
548
549 /* "A scoreboard wait must not occur in the first two
550 * instructions of a fragment shader. This is either the
551 * explicit Wait for Scoreboard signal or an implicit wait
552 * with the first tile-buffer read or write instruction."
553 */
554 if (pixel_scoreboard_too_soon(scoreboard, inst))
555 continue;
556
557 /* If we're trying to pair with another instruction, check
558 * that they're compatible.
559 */
560 if (prev_inst) {
561 if (prev_inst->uniform != -1 && n->uniform != -1)
562 continue;
563
564 inst = qpu_merge_inst(prev_inst->inst->inst, inst);
565 if (!inst)
566 continue;
567 }
568
569 int prio = get_instruction_priority(inst);
570
571 /* Found a valid instruction. If nothing better comes along,
572 * this one works.
573 */
574 if (!chosen) {
575 chosen = n;
576 chosen_prio = prio;
577 continue;
578 }
579
580 if (prio > chosen_prio) {
581 chosen = n;
582 chosen_prio = prio;
583 } else if (prio < chosen_prio) {
584 continue;
585 }
586
587 if (n->delay > chosen->delay) {
588 chosen = n;
589 chosen_prio = prio;
590 } else if (n->delay < chosen->delay) {
591 continue;
592 }
593 }
594
595 return chosen;
596 }
597
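/* Records which physical regfile A/B registers the emitted instruction
 * wrote (QPU_WS swaps whether the add or mul result lands in A or B) and
 * the tick of the last SFU write, for the hazard checks in
 * reads_too_soon_after_write().
 */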
598 static void
599 update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
600 uint64_t inst)
601 {
602 uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
603 uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
604
605 if (!(inst & QPU_WS)) {
606 scoreboard->last_waddr_a = waddr_add;
607 scoreboard->last_waddr_b = waddr_mul;
608 } else {
609 scoreboard->last_waddr_b = waddr_add;
610 scoreboard->last_waddr_a = waddr_mul;
611 }
612
613 if ((waddr_add >= QPU_W_SFU_RECIP && waddr_add <= QPU_W_SFU_LOG) ||
614 (waddr_mul >= QPU_W_SFU_RECIP && waddr_mul <= QPU_W_SFU_LOG)) {
615 scoreboard->last_sfu_write_tick = scoreboard->tick;
616 }
617 }
618
619 static void
620 dump_state(struct list_head *schedule_list)
621 {
622 list_for_each_entry(struct schedule_node, n, schedule_list, link) {
623 fprintf(stderr, " t=%4d: ", n->unblocked_time);
624 vc4_qpu_disasm(&n->inst->inst, 1);
625 fprintf(stderr, "\n");
626
627 for (int i = 0; i < n->child_count; i++) {
628 struct schedule_node *child = n->children[i].node;
629 if (!child)
630 continue;
631
632 fprintf(stderr, " - ");
633 vc4_qpu_disasm(&child->inst->inst, 1);
634 fprintf(stderr, " (%d parents, %c)\n",
635 child->parent_count,
636 n->children[i].write_after_read ? 'w' : 'r');
637 }
638 }
639 }
640
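/* Estimated result latency, keyed off the written address: regfile A/B
 * writes have latency 2 (vs. the default of 1), SFU writes 3, and a TMU
 * fetch followed by its matching LOAD_TMU signal gets an artificially huge
 * 100 so that the request and the result collection end up scheduled as far
 * apart as possible.
 */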
641 static uint32_t waddr_latency(uint32_t waddr, uint64_t after)
642 {
643 if (waddr < 32)
644 return 2;
645
646 /* Apply some huge latency between texture fetch requests and getting
647 * their results back.
648 */
649 if (waddr == QPU_W_TMU0_S) {
650 if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU0)
651 return 100;
652 }
653 if (waddr == QPU_W_TMU1_S) {
654 if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU1)
655 return 100;
656 }
657
658 switch(waddr) {
659 case QPU_W_SFU_RECIP:
660 case QPU_W_SFU_RECIPSQRT:
661 case QPU_W_SFU_EXP:
662 case QPU_W_SFU_LOG:
663 return 3;
664 default:
665 return 1;
666 }
667 }
668
669 static uint32_t
670 instruction_latency(struct schedule_node *before, struct schedule_node *after)
671 {
672 uint64_t before_inst = before->inst->inst;
673 uint64_t after_inst = after->inst->inst;
674
675 return MAX2(waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_ADD),
676 after_inst),
677 waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_MUL),
678 after_inst));
679 }
680
681 /** Recursive computation of the delay member of a node. */
682 static void
683 compute_delay(struct schedule_node *n)
684 {
685 if (!n->child_count) {
686 n->delay = 1;
687 } else {
688 for (int i = 0; i < n->child_count; i++) {
689 if (!n->children[i].node->delay)
690 compute_delay(n->children[i].node);
691 n->delay = MAX2(n->delay,
692 n->children[i].node->delay +
693 instruction_latency(n, n->children[i].node));
694 }
695 }
696 }
697
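/* Unblocks the children of a node that has just been scheduled.  This is
 * called twice per node: once with war_only=true before looking for a pair
 * partner (a write-after-read child may issue in the same cycle as, or even
 * paired with, the read it depends on), and once with war_only=false after
 * emission, applying instruction_latency() to true dependencies.
 */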
698 static void
699 mark_instruction_scheduled(struct list_head *schedule_list,
700 uint32_t time,
701 struct schedule_node *node,
702 bool war_only)
703 {
704 if (!node)
705 return;
706
707 for (int i = node->child_count - 1; i >= 0; i--) {
708 struct schedule_node *child =
709 node->children[i].node;
710
711 if (!child)
712 continue;
713
714 if (war_only && !node->children[i].write_after_read)
715 continue;
716
717 /* If the requirement is only that the node not appear before
718 * the last read of its destination, then it can be scheduled
719 * immediately after (or paired with!) the thing reading the
720 * destination.
721 */
722 uint32_t latency = 0;
723 if (!war_only) {
724 latency = instruction_latency(node,
725 node->children[i].node);
726 }
727
728 child->unblocked_time = MAX2(child->unblocked_time,
729 time + latency);
730 child->parent_count--;
731 if (child->parent_count == 0)
732 list_add(&child->link, schedule_list);
733
734 node->children[i].node = NULL;
735 }
736 }
737
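/* Greedy emission loop for one basic block: pick a ready instruction,
 * optionally pair a second one into the same 64-bit QPU instruction, emit
 * it, then unblock children.  "time" is the estimated cycle count (advanced
 * past the unblocked_time of whatever we pick), while scoreboard->tick just
 * counts emitted instructions.
 */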
738 static uint32_t
739 schedule_instructions(struct vc4_compile *c,
740 struct choose_scoreboard *scoreboard,
741 struct qblock *block,
742 struct list_head *schedule_list,
743 enum quniform_contents *orig_uniform_contents,
744 uint32_t *orig_uniform_data,
745 uint32_t *next_uniform)
746 {
747 uint32_t time = 0;
748
749 if (debug) {
750 fprintf(stderr, "initial deps:\n");
751 dump_state(schedule_list);
752 fprintf(stderr, "\n");
753 }
754
755 /* Remove non-DAG heads from the list. */
756 list_for_each_entry_safe(struct schedule_node, n, schedule_list, link) {
757 if (n->parent_count != 0)
758 list_del(&n->link);
759 }
760
761 while (!list_empty(schedule_list)) {
762 struct schedule_node *chosen =
763 choose_instruction_to_schedule(scoreboard,
764 schedule_list,
765 NULL);
766 struct schedule_node *merge = NULL;
767
768 /* If there are no valid instructions to schedule, drop a NOP
769 * in.
770 */
771 uint64_t inst = chosen ? chosen->inst->inst : qpu_NOP();
772
773 if (debug) {
774 fprintf(stderr, "t=%4d: current list:\n",
775 time);
776 dump_state(schedule_list);
777 fprintf(stderr, "t=%4d: chose: ", time);
778 vc4_qpu_disasm(&inst, 1);
779 fprintf(stderr, "\n");
780 }
781
782 /* Schedule this instruction onto the QPU list. Also try to
783 * find an instruction to pair with it.
784 */
785 if (chosen) {
786 time = MAX2(chosen->unblocked_time, time);
787 list_del(&chosen->link);
788 mark_instruction_scheduled(schedule_list, time,
789 chosen, true);
790 if (chosen->uniform != -1) {
791 c->uniform_data[*next_uniform] =
792 orig_uniform_data[chosen->uniform];
793 c->uniform_contents[*next_uniform] =
794 orig_uniform_contents[chosen->uniform];
795 (*next_uniform)++;
796 }
797
798 merge = choose_instruction_to_schedule(scoreboard,
799 schedule_list,
800 chosen);
801 if (merge) {
802 time = MAX2(merge->unblocked_time, time);
803 list_del(&merge->link);
804 inst = qpu_merge_inst(inst, merge->inst->inst);
805 assert(inst != 0);
806 if (merge->uniform != -1) {
807 c->uniform_data[*next_uniform] =
808 orig_uniform_data[merge->uniform];
809 c->uniform_contents[*next_uniform] =
810 orig_uniform_contents[merge->uniform];
811 (*next_uniform)++;
812 }
813
814 if (debug) {
815 fprintf(stderr, "t=%4d: merging: ",
816 time);
817 vc4_qpu_disasm(&merge->inst->inst, 1);
818 fprintf(stderr, "\n");
819 fprintf(stderr, " resulting in: ");
820 vc4_qpu_disasm(&inst, 1);
821 fprintf(stderr, "\n");
822 }
823 }
824 }
825
826 if (debug) {
827 fprintf(stderr, "\n");
828 }
829
830 qpu_serialize_one_inst(c, inst);
831
832 update_scoreboard_for_chosen(scoreboard, inst);
833
834 /* Now that we've scheduled a new instruction, some of its
835 * children can be promoted to the list of instructions ready to
836 * be scheduled. Update the children's unblocked time for this
837 * DAG edge as we do so.
838 */
839 mark_instruction_scheduled(schedule_list, time, chosen, false);
840 mark_instruction_scheduled(schedule_list, time, merge, false);
841
842 scoreboard->tick++;
843 time++;
844
845 if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH) {
846 block->branch_qpu_ip = c->qpu_inst_count - 1;
847                         /* Fill the three branch delay slots with NOPs.
848                          *
849                          * Ideally these would be filled with useful
850                          * instructions instead, but that will probably
851                          * have to happen after this pass, once the leading
852                          * instructions of the successor blocks are known
853                          * (so that A/B register file write latency can be
854                          * handled).
855                          */
855 inst = qpu_NOP();
856 update_scoreboard_for_chosen(scoreboard, inst);
857 qpu_serialize_one_inst(c, inst);
858 qpu_serialize_one_inst(c, inst);
859 qpu_serialize_one_inst(c, inst);
860 }
861 }
862
863 return time;
864 }
865
866 static uint32_t
867 qpu_schedule_instructions_block(struct vc4_compile *c,
868 struct choose_scoreboard *scoreboard,
869 struct qblock *block,
870 enum quniform_contents *orig_uniform_contents,
871 uint32_t *orig_uniform_data,
872 uint32_t *next_uniform)
873 {
874 void *mem_ctx = ralloc_context(NULL);
875 struct list_head schedule_list;
876
877 list_inithead(&schedule_list);
878
879 /* Wrap each instruction in a scheduler structure. */
880 uint32_t next_sched_uniform = *next_uniform;
881 while (!list_empty(&block->qpu_inst_list)) {
882 struct queued_qpu_inst *inst =
883 (struct queued_qpu_inst *)block->qpu_inst_list.next;
884 struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);
885
886 n->inst = inst;
887
888 if (reads_uniform(inst->inst)) {
889 n->uniform = next_sched_uniform++;
890 } else {
891 n->uniform = -1;
892 }
893 list_del(&inst->link);
894 list_addtail(&n->link, &schedule_list);
895 }
896
897 calculate_forward_deps(c, &schedule_list);
898 calculate_reverse_deps(c, &schedule_list);
899
900 list_for_each_entry(struct schedule_node, n, &schedule_list, link) {
901 compute_delay(n);
902 }
903
904 uint32_t cycles = schedule_instructions(c, scoreboard, block,
905 &schedule_list,
906 orig_uniform_contents,
907 orig_uniform_data,
908 next_uniform);
909
910 ralloc_free(mem_ctx);
911
912 return cycles;
913 }
914
915 static void
916 qpu_set_branch_targets(struct vc4_compile *c)
917 {
918 qir_for_each_block(block, c) {
919 /* The end block of the program has no branch. */
920 if (!block->successors[0])
921 continue;
922
923 /* If there was no branch instruction, then the successor
924 * block must follow immediately after this one.
925 */
926 if (block->branch_qpu_ip == ~0) {
927 assert(block->end_qpu_ip + 1 ==
928 block->successors[0]->start_qpu_ip);
929 continue;
930 }
931
932 /* Set the branch target for the block that doesn't follow
933 * immediately after ours.
934 */
935 uint64_t *branch_inst = &c->qpu_insts[block->branch_qpu_ip];
936 assert(QPU_GET_FIELD(*branch_inst, QPU_SIG) == QPU_SIG_BRANCH);
937 assert(QPU_GET_FIELD(*branch_inst, QPU_BRANCH_TARGET) == 0);
938
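                /* The branch offset is applied relative to the instruction
                 * after the three delay slots (branch_qpu_ip + 4, matching
                 * the three NOPs emitted by the scheduler) and is measured
                 * in bytes, hence the * sizeof(uint64_t).  For example, a
                 * successor starting 6 instructions after the branch encodes
                 * as (6 - 4) * 8 = 16 bytes.
                 */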
939 uint32_t branch_target =
940 (block->successors[0]->start_qpu_ip -
941 (block->branch_qpu_ip + 4)) * sizeof(uint64_t);
942 *branch_inst = (*branch_inst |
943 QPU_SET_FIELD(branch_target, QPU_BRANCH_TARGET));
944
945 /* Make sure that the if-we-don't-jump successor was scheduled
946 * just after the delay slots.
947 */
948 if (block->successors[1]) {
949 assert(block->successors[1]->start_qpu_ip ==
950 block->branch_qpu_ip + 4);
951 }
952 }
953 }
954
955 uint32_t
956 qpu_schedule_instructions(struct vc4_compile *c)
957 {
958 /* We reorder the uniforms as we schedule instructions, so save the
959 * old data off and replace it.
960 */
961 uint32_t *uniform_data = c->uniform_data;
962 enum quniform_contents *uniform_contents = c->uniform_contents;
963 c->uniform_contents = ralloc_array(c, enum quniform_contents,
964 c->num_uniforms);
965 c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
966 c->uniform_array_size = c->num_uniforms;
967 uint32_t next_uniform = 0;
968
969 struct choose_scoreboard scoreboard;
970 memset(&scoreboard, 0, sizeof(scoreboard));
971 scoreboard.last_waddr_a = ~0;
972 scoreboard.last_waddr_b = ~0;
973 scoreboard.last_sfu_write_tick = -10;
974
975 if (debug) {
976 fprintf(stderr, "Pre-schedule instructions\n");
977 qir_for_each_block(block, c) {
978 fprintf(stderr, "BLOCK %d\n", block->index);
979 list_for_each_entry(struct queued_qpu_inst, q,
980 &block->qpu_inst_list, link) {
981 vc4_qpu_disasm(&q->inst, 1);
982 fprintf(stderr, "\n");
983 }
984 }
985 fprintf(stderr, "\n");
986 }
987
988 uint32_t cycles = 0;
989 qir_for_each_block(block, c) {
990 block->start_qpu_ip = c->qpu_inst_count;
991 block->branch_qpu_ip = ~0;
992
993 cycles += qpu_schedule_instructions_block(c,
994 &scoreboard,
995 block,
996 uniform_contents,
997 uniform_data,
998 &next_uniform);
999
1000 block->end_qpu_ip = c->qpu_inst_count - 1;
1001 }
1002
1003 qpu_set_branch_targets(c);
1004
1005 assert(next_uniform == c->num_uniforms);
1006
1007 if (debug) {
1008 fprintf(stderr, "Post-schedule instructions\n");
1009 vc4_qpu_disasm(c->qpu_insts, c->qpu_inst_count);
1010 fprintf(stderr, "\n");
1011 }
1012
1013 return cycles;
1014 }