broadcom/vc5: Don't pair VPMSETUP with other peripheral access.
[mesa.git] / src / broadcom / compiler / qpu_schedule.c
1 /*
2 * Copyright © 2010 Intel Corporation
3 * Copyright © 2014-2017 Broadcom
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25 /**
26 * @file
27 *
28 * The basic model of the list scheduler is to take a basic block, compute a
29 * DAG of the dependencies, and make a list of the DAG heads. Heuristically
30 * pick a DAG head, then put all the children that are now DAG heads into the
31 * list of things to schedule.
32 *
33 * The goal of scheduling here is to pack pairs of operations together in a
34 * single QPU instruction.
35 */
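/* Illustrative example (not taken from this file): the V3D QPU executes an
 * add-ALU op and a mul-ALU op in the same instruction, so two independent
 * operations such as an FADD and an FMUL can end up sharing one instruction,
 * conceptually "fadd ... ; fmul ...", when their operand and write ports
 * don't conflict.  That pairing is what qpu_merge_inst() below attempts.
 */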
36
37 #include "qpu/qpu_disasm.h"
38 #include "v3d_compiler.h"
39 #include "util/ralloc.h"
40
41 static bool debug;
42
43 struct schedule_node_child;
44
45 struct schedule_node {
46 struct list_head link;
47 struct qinst *inst;
48 struct schedule_node_child *children;
49 uint32_t child_count;
50 uint32_t child_array_size;
51 uint32_t parent_count;
52
53 /* Latest time at which any parent unblocks this node: the parent's schedule time plus instruction_latency(), maximized over all parents. */
54 uint32_t unblocked_time;
55
56 /**
57 * Minimum number of cycles from scheduling this instruction until the
58 * end of the program, based on the slowest dependency chain through
59 * the children.
60 */
61 uint32_t delay;
62
63 /**
64 * Cycles between this instruction being scheduled and when its result
65 * can be consumed.
66 */
67 uint32_t latency;
68 };
69
70 struct schedule_node_child {
71 struct schedule_node *node;
72 bool write_after_read;
73 };
74
75 /* When walking the instructions in reverse, we need to swap before/after in
76 * add_dep().
77 */
78 enum direction { F, R };
79
80 struct schedule_state {
81 struct schedule_node *last_r[6];
82 struct schedule_node *last_rf[64];
83 struct schedule_node *last_sf;
84 struct schedule_node *last_vpm_read;
85 struct schedule_node *last_tmu_write;
86 struct schedule_node *last_tlb;
87 struct schedule_node *last_vpm;
88 struct schedule_node *last_unif;
89 struct schedule_node *last_rtop;
90 enum direction dir;
91 /* Estimated cycle when the current instruction would start. */
92 uint32_t time;
93 };
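/* Each last_* pointer above tracks the most recent node to touch the
 * corresponding register, flag set, or peripheral, so that add_read_dep()
 * and add_write_dep() can serialize later accesses against it.  The same
 * dependency walk is run forward and then in reverse (dir == F or R, see
 * the comment above enum direction) so that both read-after-write and
 * write-after-read orderings are captured.
 */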
94
95 static void
96 add_dep(struct schedule_state *state,
97 struct schedule_node *before,
98 struct schedule_node *after,
99 bool write)
100 {
101 bool write_after_read = !write && state->dir == R;
102
103 if (!before || !after)
104 return;
105
106 assert(before != after);
107
108 if (state->dir == R) {
109 struct schedule_node *t = before;
110 before = after;
111 after = t;
112 }
113
114 for (int i = 0; i < before->child_count; i++) {
115 if (before->children[i].node == after &&
116 (before->children[i].write_after_read == write_after_read)) {
117 return;
118 }
119 }
120
121 if (before->child_array_size <= before->child_count) {
122 before->child_array_size = MAX2(before->child_array_size * 2, 16);
123 before->children = reralloc(before, before->children,
124 struct schedule_node_child,
125 before->child_array_size);
126 }
127
128 before->children[before->child_count].node = after;
129 before->children[before->child_count].write_after_read =
130 write_after_read;
131 before->child_count++;
132 after->parent_count++;
133 }
134
135 static void
136 add_read_dep(struct schedule_state *state,
137 struct schedule_node *before,
138 struct schedule_node *after)
139 {
140 add_dep(state, before, after, false);
141 }
142
143 static void
144 add_write_dep(struct schedule_state *state,
145 struct schedule_node **before,
146 struct schedule_node *after)
147 {
148 add_dep(state, *before, after, true);
149 *before = after;
150 }
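/* Example (illustrative): in the forward pass, a write to rf3 followed by a
 * read of rf3 produces a read-after-write edge, because the reader calls
 * add_read_dep() against last_rf[3]; a second write to rf3 would instead go
 * through add_write_dep(), adding a write-after-write edge and making the
 * new writer the tracked "last" node.
 */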
151
152 static bool
153 qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
154 {
155 if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
156 return false;
157
158 if (inst->alu.add.magic_write &&
159 (inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
160 inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
161 return true;
162
163 if (inst->alu.mul.magic_write &&
164 (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
165 inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
166 return true;
167
168 return false;
169 }
170
171 static void
172 process_mux_deps(struct schedule_state *state, struct schedule_node *n,
173 enum v3d_qpu_mux mux)
174 {
175 switch (mux) {
176 case V3D_QPU_MUX_A:
177 add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
178 break;
179 case V3D_QPU_MUX_B:
180 add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n);
181 break;
182 default:
183 add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
184 break;
185 }
186 }
187
188
189 static void
190 process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
191 uint32_t waddr, bool magic)
192 {
193 if (!magic) {
194 add_write_dep(state, &state->last_rf[waddr], n);
195 } else if (v3d_qpu_magic_waddr_is_tmu(waddr)) {
196 add_write_dep(state, &state->last_tmu_write, n);
197 } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
198 /* Handled by v3d_qpu_writes_r4() check. */
199 } else {
200 switch (waddr) {
201 case V3D_QPU_WADDR_R0:
202 case V3D_QPU_WADDR_R1:
203 case V3D_QPU_WADDR_R2:
204 case V3D_QPU_WADDR_R3:
205 case V3D_QPU_WADDR_R4:
206 case V3D_QPU_WADDR_R5:
207 add_write_dep(state,
208 &state->last_r[waddr - V3D_QPU_WADDR_R0],
209 n);
210 break;
211
212 case V3D_QPU_WADDR_VPM:
213 case V3D_QPU_WADDR_VPMU:
214 add_write_dep(state, &state->last_vpm, n);
215 break;
216
217 case V3D_QPU_WADDR_TLB:
218 case V3D_QPU_WADDR_TLBU:
219 add_write_dep(state, &state->last_tlb, n);
220 break;
221
222 case V3D_QPU_WADDR_NOP:
223 break;
224
225 default:
226 fprintf(stderr, "Unknown waddr %d\n", waddr);
227 abort();
228 }
229 }
230 }
231
232 static void
233 process_cond_deps(struct schedule_state *state, struct schedule_node *n,
234 enum v3d_qpu_cond cond)
235 {
236 if (cond != V3D_QPU_COND_NONE)
237 add_read_dep(state, state->last_sf, n);
238 }
239
240 static void
241 process_pf_deps(struct schedule_state *state, struct schedule_node *n,
242 enum v3d_qpu_pf pf)
243 {
244 if (pf != V3D_QPU_PF_NONE)
245 add_write_dep(state, &state->last_sf, n);
246 }
247
248 static void
249 process_uf_deps(struct schedule_state *state, struct schedule_node *n,
250 enum v3d_qpu_uf uf)
251 {
252 if (uf != V3D_QPU_UF_NONE)
253 add_write_dep(state, &state->last_sf, n);
254 }
255
256 /**
257 * Common code for dependencies that need to be tracked both forward and
258 * backward.
259 *
260 * This is for things like "all reads of r4 have to happen between the r4
261 * writes that surround them".
262 */
263 static void
264 calculate_deps(struct schedule_state *state, struct schedule_node *n)
265 {
266 struct qinst *qinst = n->inst;
267 struct v3d_qpu_instr *inst = &qinst->qpu;
268
269 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
270 if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS)
271 add_read_dep(state, state->last_sf, n);
272
273 /* XXX: BDI */
274 /* XXX: BDU */
275 /* XXX: ub */
276 /* XXX: raddr_a */
277
278 add_write_dep(state, &state->last_unif, n);
279 return;
280 }
281
282 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
283
284 /* XXX: LOAD_IMM */
285
286 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
287 process_mux_deps(state, n, inst->alu.add.a);
288 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
289 process_mux_deps(state, n, inst->alu.add.b);
290
291 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
292 process_mux_deps(state, n, inst->alu.mul.a);
293 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
294 process_mux_deps(state, n, inst->alu.mul.b);
295
296 switch (inst->alu.add.op) {
297 case V3D_QPU_A_VPMSETUP:
298 /* Could distinguish read/write by unpacking the uniform. */
299 add_write_dep(state, &state->last_vpm, n);
300 add_write_dep(state, &state->last_vpm_read, n);
301 break;
302
303 case V3D_QPU_A_STVPMV:
304 case V3D_QPU_A_STVPMD:
305 case V3D_QPU_A_STVPMP:
306 add_write_dep(state, &state->last_vpm, n);
307 break;
308
309 case V3D_QPU_A_MSF:
310 add_read_dep(state, state->last_tlb, n);
311 break;
312
313 case V3D_QPU_A_SETMSF:
314 case V3D_QPU_A_SETREVF:
315 add_write_dep(state, &state->last_tlb, n);
316 break;
317
318 case V3D_QPU_A_FLAPUSH:
319 case V3D_QPU_A_FLBPUSH:
320 case V3D_QPU_A_VFLA:
321 case V3D_QPU_A_VFLNA:
322 case V3D_QPU_A_VFLB:
323 case V3D_QPU_A_VFLNB:
324 add_read_dep(state, state->last_sf, n);
325 break;
326
327 case V3D_QPU_A_FLBPOP:
328 add_write_dep(state, &state->last_sf, n);
329 break;
330
331 default:
332 break;
333 }
334
335 switch (inst->alu.mul.op) {
336 case V3D_QPU_M_MULTOP:
337 case V3D_QPU_M_UMUL24:
338 /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and
339 * resets it to 0. We could possibly reorder umul24s relative
340 * to each other, but for now just keep all the MUL parts in
341 * order.
342 */
343 add_write_dep(state, &state->last_rtop, n);
344 break;
345 default:
346 break;
347 }
348
349 if (inst->alu.add.op != V3D_QPU_A_NOP) {
350 process_waddr_deps(state, n, inst->alu.add.waddr,
351 inst->alu.add.magic_write);
352 }
353 if (inst->alu.mul.op != V3D_QPU_M_NOP) {
354 process_waddr_deps(state, n, inst->alu.mul.waddr,
355 inst->alu.mul.magic_write);
356 }
357
358 if (v3d_qpu_writes_r3(inst))
359 add_write_dep(state, &state->last_r[3], n);
360 if (v3d_qpu_writes_r4(inst))
361 add_write_dep(state, &state->last_r[4], n);
362 if (v3d_qpu_writes_r5(inst))
363 add_write_dep(state, &state->last_r[5], n);
364
365 if (inst->sig.thrsw) {
366 /* All accumulator contents and flags are undefined after the
367 * switch.
368 */
369 for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
370 add_write_dep(state, &state->last_r[i], n);
371 add_write_dep(state, &state->last_sf, n);
372
373 /* Scoreboard-locking operations have to stay after the last
374 * thread switch.
375 */
376 add_write_dep(state, &state->last_tlb, n);
377
378 add_write_dep(state, &state->last_tmu_write, n);
379 }
380
381 if (inst->sig.ldtmu) {
382 /* TMU loads come from a FIFO, so ordering is important.
383 */
384 add_write_dep(state, &state->last_tmu_write, n);
385 }
386
387 if (inst->sig.ldtlb || inst->sig.ldtlbu)
388 add_read_dep(state, state->last_tlb, n);
389
390 if (inst->sig.ldvpm)
391 add_write_dep(state, &state->last_vpm_read, n);
392
393 /* inst->sig.ldunif or sideband uniform read */
394 if (qinst->uniform != ~0)
395 add_write_dep(state, &state->last_unif, n);
396
397 process_cond_deps(state, n, inst->flags.ac);
398 process_cond_deps(state, n, inst->flags.mc);
399 process_pf_deps(state, n, inst->flags.apf);
400 process_pf_deps(state, n, inst->flags.mpf);
401 process_uf_deps(state, n, inst->flags.auf);
402 process_uf_deps(state, n, inst->flags.muf);
403 }
404
405 static void
406 calculate_forward_deps(struct v3d_compile *c, struct list_head *schedule_list)
407 {
408 struct schedule_state state;
409
410 memset(&state, 0, sizeof(state));
411 state.dir = F;
412
413 list_for_each_entry(struct schedule_node, node, schedule_list, link)
414 calculate_deps(&state, node);
415 }
416
417 static void
418 calculate_reverse_deps(struct v3d_compile *c, struct list_head *schedule_list)
419 {
420 struct list_head *node;
421 struct schedule_state state;
422
423 memset(&state, 0, sizeof(state));
424 state.dir = R;
425
426 for (node = schedule_list->prev; schedule_list != node; node = node->prev) {
427 calculate_deps(&state, (struct schedule_node *)node);
428 }
429 }
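/* Note: the (struct schedule_node *) cast above relies on the link member
 * being the first field of struct schedule_node, so a list_head pointer and
 * a node pointer are interchangeable here.
 */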
430
431 struct choose_scoreboard {
432 int tick;
433 int last_sfu_write_tick;
434 int last_ldvary_tick;
435 int last_uniforms_reset_tick;
436 uint32_t last_waddr_add, last_waddr_mul;
437 bool tlb_locked;
438 };
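/* The scoreboard tracks recent hardware hazards rather than data
 * dependencies: which physical regfile addresses were written by the
 * previously scheduled instruction (raddr-after-waddr restriction), when
 * the last SFU write and ldvary happened (r4/r5 result latency), and
 * whether a TLB access has already locked the scoreboard in this shader.
 */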
439
440 static bool
441 mux_reads_too_soon(struct choose_scoreboard *scoreboard,
442 const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
443 {
444 switch (mux) {
445 case V3D_QPU_MUX_A:
446 if (scoreboard->last_waddr_add == inst->raddr_a ||
447 scoreboard->last_waddr_mul == inst->raddr_a) {
448 return true;
449 }
450 break;
451
452 case V3D_QPU_MUX_B:
453 if (scoreboard->last_waddr_add == inst->raddr_b ||
454 scoreboard->last_waddr_mul == inst->raddr_b) {
455 return true;
456 }
457 break;
458
459 case V3D_QPU_MUX_R4:
460 if (scoreboard->tick - scoreboard->last_sfu_write_tick <= 2)
461 return true;
462 break;
463
464 case V3D_QPU_MUX_R5:
465 if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
466 return true;
467 break;
468 default:
469 break;
470 }
471
472 return false;
473 }
474
475 static bool
476 reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
477 struct qinst *qinst)
478 {
479 const struct v3d_qpu_instr *inst = &qinst->qpu;
480
481 /* XXX: Branching off of raddr. */
482 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
483 return false;
484
485 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
486
487 if (inst->alu.add.op != V3D_QPU_A_NOP) {
488 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
489 mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
490 return true;
491 }
492 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
493 mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
494 return true;
495 }
496 }
497
498 if (inst->alu.mul.op != V3D_QPU_M_NOP) {
499 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
500 mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
501 return true;
502 }
503 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
504 mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
505 return true;
506 }
507 }
508
509 /* XXX: imm */
510
511 return false;
512 }
513
514 static bool
515 writes_too_soon_after_write(struct choose_scoreboard *scoreboard,
516 struct qinst *qinst)
517 {
518 const struct v3d_qpu_instr *inst = &qinst->qpu;
519
520 /* Don't schedule any other r4 write too soon after an SFU write.
521 * This would normally be prevented by dependency tracking, but might
522 * occur if a dead SFU computation makes it to scheduling.
523 */
524 if (scoreboard->tick - scoreboard->last_sfu_write_tick < 2 &&
525 v3d_qpu_writes_r4(inst))
526 return true;
527
528 return false;
529 }
530
531 static bool
532 pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard,
533 const struct v3d_qpu_instr *inst)
534 {
535 return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst));
536 }
537
538 static int
539 get_instruction_priority(const struct v3d_qpu_instr *inst)
540 {
541 uint32_t baseline_score;
542 uint32_t next_score = 0;
543
544 /* Schedule TLB operations as late as possible, to get more
545 * parallelism between shaders.
546 */
547 if (qpu_inst_is_tlb(inst))
548 return next_score;
549 next_score++;
550
551 /* Schedule collection of texture read results late to hide latency. */
552 if (inst->sig.ldtmu)
553 return next_score;
554 next_score++;
555
556 /* Default score for things that aren't otherwise special. */
557 baseline_score = next_score;
558 next_score++;
559
560 /* Schedule texture read setup early to better hide texture fetch latency. */
561 if (inst->type == V3D_QPU_INSTR_TYPE_ALU &&
562 ((inst->alu.add.magic_write &&
563 v3d_qpu_magic_waddr_is_tmu(inst->alu.add.waddr)) ||
564 (inst->alu.mul.magic_write &&
565 v3d_qpu_magic_waddr_is_tmu(inst->alu.mul.waddr)))) {
566 return next_score;
567 }
568 next_score++;
569
570 return baseline_score;
571 }
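/* Resulting priority order (higher schedules earlier): TMU setup writes get
 * the highest score, ordinary instructions the baseline, ldtmu result
 * collection lower than baseline, and TLB accesses the lowest so they land
 * as late as possible.
 */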
572
573 static bool
574 qpu_magic_waddr_is_periph(enum v3d_qpu_waddr waddr)
575 {
576 return (v3d_qpu_magic_waddr_is_tmu(waddr) ||
577 v3d_qpu_magic_waddr_is_sfu(waddr) ||
578 v3d_qpu_magic_waddr_is_tlb(waddr) ||
579 v3d_qpu_magic_waddr_is_vpm(waddr) ||
580 v3d_qpu_magic_waddr_is_tsy(waddr));
581 }
582
583 static bool
584 qpu_accesses_peripheral(const struct v3d_qpu_instr *inst)
585 {
586 if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
587 if (inst->alu.add.op != V3D_QPU_A_NOP &&
588 inst->alu.add.magic_write &&
589 qpu_magic_waddr_is_periph(inst->alu.add.waddr)) {
590 return true;
591 }
592
593 if (inst->alu.add.op == V3D_QPU_A_VPMSETUP)
594 return true;
595
596 if (inst->alu.mul.op != V3D_QPU_M_NOP &&
597 inst->alu.mul.magic_write &&
598 qpu_magic_waddr_is_periph(inst->alu.mul.waddr)) {
599 return true;
600 }
601 }
602
603 return (inst->sig.ldvpm ||
604 inst->sig.ldtmu ||
605 inst->sig.ldtlb ||
606 inst->sig.ldtlbu);
607 }
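/* VPMSETUP is counted as a peripheral access even though it is encoded as a
 * plain add-ALU op rather than a magic waddr write; this keeps
 * qpu_merge_inst() from pairing it with another peripheral access (see the
 * single-peripheral-access check below).
 */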
608
609 static bool
610 qpu_merge_inst(const struct v3d_device_info *devinfo,
611 struct v3d_qpu_instr *result,
612 const struct v3d_qpu_instr *a,
613 const struct v3d_qpu_instr *b)
614 {
615 if (a->type != V3D_QPU_INSTR_TYPE_ALU ||
616 b->type != V3D_QPU_INSTR_TYPE_ALU) {
617 return false;
618 }
619
620 /* Can't do more than one peripheral access in an instruction. */
621 if (qpu_accesses_peripheral(a) && qpu_accesses_peripheral(b))
622 return false;
623
624 struct v3d_qpu_instr merge = *a;
625
626 if (b->alu.add.op != V3D_QPU_A_NOP) {
627 if (a->alu.add.op != V3D_QPU_A_NOP)
628 return false;
629 merge.alu.add = b->alu.add;
630
631 merge.flags.ac = b->flags.ac;
632 merge.flags.apf = b->flags.apf;
633 merge.flags.auf = b->flags.auf;
634 }
635
636 if (b->alu.mul.op != V3D_QPU_M_NOP) {
637 if (a->alu.mul.op != V3D_QPU_M_NOP)
638 return false;
639 merge.alu.mul = b->alu.mul;
640
641 merge.flags.mc = b->flags.mc;
642 merge.flags.mpf = b->flags.mpf;
643 merge.flags.muf = b->flags.muf;
644 }
645
646 if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) {
647 if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A) &&
648 a->raddr_a != b->raddr_a) {
649 return false;
650 }
651 merge.raddr_a = b->raddr_a;
652 }
653
654 if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) {
655 if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_B) &&
656 a->raddr_b != b->raddr_b) {
657 return false;
658 }
659 merge.raddr_b = b->raddr_b;
660 }
661
662 merge.sig.thrsw |= b->sig.thrsw;
663 merge.sig.ldunif |= b->sig.ldunif;
664 merge.sig.ldtmu |= b->sig.ldtmu;
665 merge.sig.ldvary |= b->sig.ldvary;
666 merge.sig.ldvpm |= b->sig.ldvpm;
667 merge.sig.small_imm |= b->sig.small_imm;
668 merge.sig.ldtlb |= b->sig.ldtlb;
669 merge.sig.ldtlbu |= b->sig.ldtlbu;
670 merge.sig.ucb |= b->sig.ucb;
671 merge.sig.rotate |= b->sig.rotate;
672 merge.sig.wrtmuc |= b->sig.wrtmuc;
673
674 uint64_t packed;
675 bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed);
676
677 *result = merge;
678 /* No modifying the real instructions on failure. */
679 assert(ok || (a != result && b != result));
680
681 return ok;
682 }
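/* Typical successful merge (illustrative): an add-only instruction paired
 * with a mul-only instruction whose raddr_a/raddr_b requirements don't
 * conflict; the signal bits are OR'd together, and v3d_qpu_instr_pack() has
 * the final say on whether the combination is encodable.
 */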
683
684 static struct schedule_node *
685 choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
686 struct choose_scoreboard *scoreboard,
687 struct list_head *schedule_list,
688 struct schedule_node *prev_inst)
689 {
690 struct schedule_node *chosen = NULL;
691 int chosen_prio = 0;
692
693 /* Don't pair up anything with a thread switch signal -- emit_thrsw()
694 * will handle pairing it along with filling the delay slots.
695 */
696 if (prev_inst) {
697 if (prev_inst->inst->qpu.sig.thrsw)
698 return NULL;
699 }
700
701 list_for_each_entry(struct schedule_node, n, schedule_list, link) {
702 const struct v3d_qpu_instr *inst = &n->inst->qpu;
703
704 /* Don't choose the branch instruction until it's the last one
705 * left. We'll move it up to fit its delay slots after we
706 * choose it.
707 */
708 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
709 !list_is_singular(schedule_list)) {
710 continue;
711 }
712
713 /* "An instruction must not read from a location in physical
714 * regfile A or B that was written to by the previous
715 * instruction."
716 */
717 if (reads_too_soon_after_write(scoreboard, n->inst))
718 continue;
719
720 if (writes_too_soon_after_write(scoreboard, n->inst))
721 continue;
722
723 /* "A scoreboard wait must not occur in the first two
724 * instructions of a fragment shader. This is either the
725 * explicit Wait for Scoreboard signal or an implicit wait
726 * with the first tile-buffer read or write instruction."
727 */
728 if (pixel_scoreboard_too_soon(scoreboard, inst))
729 continue;
730
731 /* ldunif and ldvary both write r5, but ldunif does so a tick
732 * sooner. If the ldvary's r5 wasn't used, then ldunif might
733 * otherwise get scheduled so ldunif and ldvary try to update
734 * r5 in the same tick.
735 */
736 if (inst->sig.ldunif &&
737 scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
738 continue;
739 }
740
741 /* If we're trying to pair with another instruction, check
742 * that they're compatible.
743 */
744 if (prev_inst) {
745 /* Don't pair up a thread switch signal -- we'll
746 * handle pairing it when we pick it on its own.
747 */
748 if (inst->sig.thrsw)
749 continue;
750
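/* Each instruction carries at most one uniform-stream read (tracked in
 * qinst->uniform), so don't pair two instructions that both need one.
 */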
751 if (prev_inst->inst->uniform != -1 &&
752 n->inst->uniform != -1)
753 continue;
754
755 /* Don't merge in something that will lock the TLB.
756 * Hopefully what we have in inst will release some
757 * other instructions, allowing us to delay the
758 * TLB-locking instruction until later.
759 */
760 if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
761 continue;
762
763 struct v3d_qpu_instr merged_inst;
764 if (!qpu_merge_inst(devinfo, &merged_inst,
765 &prev_inst->inst->qpu, inst)) {
766 continue;
767 }
768 }
769
770 int prio = get_instruction_priority(inst);
771
772 /* Found a valid instruction. If nothing better comes along,
773 * this one works.
774 */
775 if (!chosen) {
776 chosen = n;
777 chosen_prio = prio;
778 continue;
779 }
780
781 if (prio > chosen_prio) {
782 chosen = n;
783 chosen_prio = prio;
784 } else if (prio < chosen_prio) {
785 continue;
786 }
787
788 if (n->delay > chosen->delay) {
789 chosen = n;
790 chosen_prio = prio;
791 } else if (n->delay < chosen->delay) {
792 continue;
793 }
794 }
795
796 return chosen;
797 }
798
799 static void
800 update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
801 enum v3d_qpu_waddr waddr)
802 {
803 if (v3d_qpu_magic_waddr_is_sfu(waddr))
804 scoreboard->last_sfu_write_tick = scoreboard->tick;
805 }
806
807 static void
808 update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
809 const struct v3d_qpu_instr *inst)
810 {
811 scoreboard->last_waddr_add = ~0;
812 scoreboard->last_waddr_mul = ~0;
813
814 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
815 return;
816
817 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
818
819 if (inst->alu.add.op != V3D_QPU_A_NOP) {
820 if (inst->alu.add.magic_write) {
821 update_scoreboard_for_magic_waddr(scoreboard,
822 inst->alu.add.waddr);
823 } else {
824 scoreboard->last_waddr_add = inst->alu.add.waddr;
825 }
826 }
827
828 if (inst->alu.mul.op != V3D_QPU_M_NOP) {
829 if (inst->alu.mul.magic_write) {
830 update_scoreboard_for_magic_waddr(scoreboard,
831 inst->alu.mul.waddr);
832 } else {
833 scoreboard->last_waddr_mul = inst->alu.mul.waddr;
834 }
835 }
836
837 if (inst->sig.ldvary)
838 scoreboard->last_ldvary_tick = scoreboard->tick;
839
840 if (qpu_inst_is_tlb(inst))
841 scoreboard->tlb_locked = true;
842 }
843
844 static void
845 dump_state(const struct v3d_device_info *devinfo,
846 struct list_head *schedule_list)
847 {
848 list_for_each_entry(struct schedule_node, n, schedule_list, link) {
849 fprintf(stderr, " t=%4d: ", n->unblocked_time);
850 v3d_qpu_dump(devinfo, &n->inst->qpu);
851 fprintf(stderr, "\n");
852
853 for (int i = 0; i < n->child_count; i++) {
854 struct schedule_node *child = n->children[i].node;
855 if (!child)
856 continue;
857
858 fprintf(stderr, " - ");
859 v3d_qpu_dump(devinfo, &child->inst->qpu);
860 fprintf(stderr, " (%d parents, %c)\n",
861 child->parent_count,
862 n->children[i].write_after_read ? 'w' : 'r');
863 }
864 }
865 }
866
867 static uint32_t magic_waddr_latency(enum v3d_qpu_waddr waddr,
868 const struct v3d_qpu_instr *after)
869 {
870 /* Apply some huge latency between texture fetch requests and getting
871 * their results back.
872 *
873 * FIXME: This is actually pretty bogus. If we do:
874 *
875 * mov tmu0_s, a
876 * <a bit of math>
877 * mov tmu0_s, b
878 * load_tmu0
879 * <more math>
880 * load_tmu0
881 *
882 * we count that as worse than
883 *
884 * mov tmu0_s, a
885 * mov tmu0_s, b
886 * <lots of math>
887 * load_tmu0
888 * <more math>
889 * load_tmu0
890 *
891 * because we associate the first load_tmu0 with the *second* tmu0_s.
892 */
893 if (v3d_qpu_magic_waddr_is_tmu(waddr) && after->sig.ldtmu)
894 return 100;
895
896 /* Assume that anything depending on us is consuming the SFU result. */
897 if (v3d_qpu_magic_waddr_is_sfu(waddr))
898 return 3;
899
900 return 1;
901 }
902
903 static uint32_t
904 instruction_latency(struct schedule_node *before, struct schedule_node *after)
905 {
906 const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
907 const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
908 uint32_t latency = 1;
909
910 if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU ||
911 after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
912 return latency;
913
914 if (before_inst->alu.add.magic_write) {
915 latency = MAX2(latency,
916 magic_waddr_latency(before_inst->alu.add.waddr,
917 after_inst));
918 }
919
920 if (before_inst->alu.mul.magic_write) {
921 latency = MAX2(latency,
922 magic_waddr_latency(before_inst->alu.mul.waddr,
923 after_inst));
924 }
925
926 return latency;
927 }
928
929 /** Recursive computation of the delay member of a node. */
930 static void
931 compute_delay(struct schedule_node *n)
932 {
933 if (!n->child_count) {
934 n->delay = 1;
935 } else {
936 for (int i = 0; i < n->child_count; i++) {
937 if (!n->children[i].node->delay)
938 compute_delay(n->children[i].node);
939 n->delay = MAX2(n->delay,
940 n->children[i].node->delay +
941 instruction_latency(n, n->children[i].node));
942 }
943 }
944 }
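/* compute_delay() is effectively a depth-first walk from the DAG leaves: a
 * node's delay is always at least 1, so a zero delay doubles as the
 * "not yet visited" marker used above.
 */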
945
946 static void
947 mark_instruction_scheduled(struct list_head *schedule_list,
948 uint32_t time,
949 struct schedule_node *node,
950 bool war_only)
951 {
952 if (!node)
953 return;
954
955 for (int i = node->child_count - 1; i >= 0; i--) {
956 struct schedule_node *child =
957 node->children[i].node;
958
959 if (!child)
960 continue;
961
962 if (war_only && !node->children[i].write_after_read)
963 continue;
964
965 /* If the requirement is only that the node not appear before
966 * the last read of its destination, then it can be scheduled
967 * immediately after (or paired with!) the thing reading the
968 * destination.
969 */
970 uint32_t latency = 0;
971 if (!war_only) {
972 latency = instruction_latency(node,
973 node->children[i].node);
974 }
975
976 child->unblocked_time = MAX2(child->unblocked_time,
977 time + latency);
978 child->parent_count--;
979 if (child->parent_count == 0)
980 list_add(&child->link, schedule_list);
981
982 node->children[i].node = NULL;
983 }
984 }
985
986 static struct qinst *
987 vir_nop(void)
988 {
989 struct qreg undef = { QFILE_NULL, 0 };
990 struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);
991
992 return qinst;
993 }
994
995 #if 0
996 static struct qinst *
997 nop_after(struct qinst *inst)
998 {
999 struct qinst *q = vir_nop();
1000
1001 list_add(&q->link, &inst->link);
1002
1003 return q;
1004 }
1005
1006 /**
1007 * Emits a THRSW/LTHRSW signal in the stream, trying to move it up to pair
1008 * with another instruction.
1009 */
1010 static void
1011 emit_thrsw(struct v3d_compile *c,
1012 struct choose_scoreboard *scoreboard,
1013 const struct v3d_qpu_instr *inst)
1014 {
1015 /* There should be nothing in a thrsw inst being scheduled other than
1016 * the signal bits.
1017 */
1018 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
1019 assert(inst->alu.add.op == V3D_QPU_A_NOP);
1020 assert(inst->alu.mul.op == V3D_QPU_M_NOP);
1021
1022 /* Try to find an earlier scheduled instruction that we can merge the
1023 * thrsw into.
1024 */
1025 int thrsw_ip = c->qpu_inst_count;
1026 for (int i = 1; i <= MIN2(c->qpu_inst_count, 3); i++) {
1027 uint64_t prev_instr = c->qpu_insts[c->qpu_inst_count - i];
1028 uint32_t prev_sig = QPU_GET_FIELD(prev_instr, QPU_SIG);
1029
1030 if (prev_sig == QPU_SIG_NONE)
1031 thrsw_ip = c->qpu_inst_count - i;
1032 }
1033
1034 if (thrsw_ip != c->qpu_inst_count) {
1035 /* Merge the thrsw into the existing instruction. */
1036 c->qpu_insts[thrsw_ip] =
1037 QPU_UPDATE_FIELD(c->qpu_insts[thrsw_ip], sig, QPU_SIG);
1038 } else {
1039 qpu_serialize_one_inst(c, inst);
1040 update_scoreboard_for_chosen(scoreboard, inst);
1041 }
1042
1043 /* Fill the delay slots. */
1044 while (c->qpu_inst_count < thrsw_ip + 3) {
1045 update_scoreboard_for_chosen(scoreboard, v3d_qpu_nop());
1046 qpu_serialize_one_inst(c, v3d_qpu_nop());
1047 }
1048 }
1049 #endif
1050
1051 static uint32_t
1052 schedule_instructions(struct v3d_compile *c,
1053 struct choose_scoreboard *scoreboard,
1054 struct qblock *block,
1055 struct list_head *schedule_list,
1056 enum quniform_contents *orig_uniform_contents,
1057 uint32_t *orig_uniform_data,
1058 uint32_t *next_uniform)
1059 {
1060 const struct v3d_device_info *devinfo = c->devinfo;
1061 uint32_t time = 0;
1062
1063 if (debug) {
1064 fprintf(stderr, "initial deps:\n");
1065 dump_state(devinfo, schedule_list);
1066 fprintf(stderr, "\n");
1067 }
1068
1069 /* Remove non-DAG heads from the list. */
1070 list_for_each_entry_safe(struct schedule_node, n, schedule_list, link) {
1071 if (n->parent_count != 0)
1072 list_del(&n->link);
1073 }
1074
1075 while (!list_empty(schedule_list)) {
1076 struct schedule_node *chosen =
1077 choose_instruction_to_schedule(devinfo,
1078 scoreboard,
1079 schedule_list,
1080 NULL);
1081 struct schedule_node *merge = NULL;
1082
1083 /* If there are no valid instructions to schedule, drop a NOP
1084 * in.
1085 */
1086 struct qinst *qinst = chosen ? chosen->inst : vir_nop();
1087 struct v3d_qpu_instr *inst = &qinst->qpu;
1088
1089 if (debug) {
1090 fprintf(stderr, "t=%4d: current list:\n",
1091 time);
1092 dump_state(devinfo, schedule_list);
1093 fprintf(stderr, "t=%4d: chose: ", time);
1094 v3d_qpu_dump(devinfo, inst);
1095 fprintf(stderr, "\n");
1096 }
1097
1098 /* Schedule this instruction onto the QPU list. Also try to
1099 * find an instruction to pair with it.
1100 */
1101 if (chosen) {
1102 time = MAX2(chosen->unblocked_time, time);
1103 list_del(&chosen->link);
1104 mark_instruction_scheduled(schedule_list, time,
1105 chosen, true);
1106
1107 merge = choose_instruction_to_schedule(devinfo,
1108 scoreboard,
1109 schedule_list,
1110 chosen);
1111 if (merge) {
1112 time = MAX2(merge->unblocked_time, time);
1113 list_del(&merge->link);
1114 (void)qpu_merge_inst(devinfo, inst,
1115 inst, &merge->inst->qpu);
1116 if (merge->inst->uniform != -1) {
1117 chosen->inst->uniform =
1118 merge->inst->uniform;
1119 }
1120
1121 if (debug) {
1122 fprintf(stderr, "t=%4d: merging: ",
1123 time);
1124 v3d_qpu_dump(devinfo, &merge->inst->qpu);
1125 fprintf(stderr, "\n");
1126 fprintf(stderr, " result: ");
1127 v3d_qpu_dump(devinfo, inst);
1128 fprintf(stderr, "\n");
1129 }
1130 }
1131 }
1132
1133 /* Update the uniform index for the rewritten location --
1134 * branch target updating will still need to change
1135 * c->uniform_data[] using this index.
1136 */
1137 if (qinst->uniform != -1) {
1138 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
1139 block->branch_uniform = *next_uniform;
1140
1141 c->uniform_data[*next_uniform] =
1142 orig_uniform_data[qinst->uniform];
1143 c->uniform_contents[*next_uniform] =
1144 orig_uniform_contents[qinst->uniform];
1145 qinst->uniform = *next_uniform;
1146 (*next_uniform)++;
1147 }
1148
1149 if (debug) {
1150 fprintf(stderr, "\n");
1151 }
1152
1153 /* Now that we've scheduled a new instruction, some of its
1154 * children can be promoted to the list of instructions ready to
1155 * be scheduled. Update the children's unblocked time for this
1156 * DAG edge as we do so.
1157 */
1158 mark_instruction_scheduled(schedule_list, time, chosen, false);
1159
1160 if (merge) {
1161 mark_instruction_scheduled(schedule_list, time, merge,
1162 false);
1163
1164 /* The merged VIR instruction doesn't get re-added to the
1165 * block, so free it now.
1166 */
1167 free(merge->inst);
1168 }
1169
1170 if (0 && inst->sig.thrsw) {
1171 /* XXX emit_thrsw(c, scoreboard, qinst); */
1172 } else {
1173 c->qpu_inst_count++;
1174 list_addtail(&qinst->link, &block->instructions);
1175 update_scoreboard_for_chosen(scoreboard, inst);
1176 }
1177
1178 scoreboard->tick++;
1179 time++;
1180
1181 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH ||
1182 inst->sig.thrsw /* XXX */) {
1183 block->branch_qpu_ip = c->qpu_inst_count - 1;
1184 /* Fill the delay slots.
1185 *
1186 * We should fill these with actual instructions instead,
1187 * but that will probably need to be done after this pass,
1188 * once we know what the leading instructions of the
1189 * successors are (so we can handle A/B register file
1190 * write latency).
1191 */
1192 /* XXX: scoreboard */
1193 int slots = (inst->type == V3D_QPU_INSTR_TYPE_BRANCH ?
1194 3 : 2);
1195 for (int i = 0; i < slots; i++) {
1196 struct qinst *nop = vir_nop();
1197 list_addtail(&nop->link, &block->instructions);
1198
1199 update_scoreboard_for_chosen(scoreboard,
1200 &nop->qpu);
1201 c->qpu_inst_count++;
1202 scoreboard->tick++;
1203 time++;
1204 }
1205 }
1206 }
1207
1208 return time;
1209 }
1210
1211 static uint32_t
1212 qpu_schedule_instructions_block(struct v3d_compile *c,
1213 struct choose_scoreboard *scoreboard,
1214 struct qblock *block,
1215 enum quniform_contents *orig_uniform_contents,
1216 uint32_t *orig_uniform_data,
1217 uint32_t *next_uniform)
1218 {
1219 void *mem_ctx = ralloc_context(NULL);
1220 struct list_head schedule_list;
1221
1222 list_inithead(&schedule_list);
1223
1224 /* Wrap each instruction in a scheduler structure. */
1225 while (!list_empty(&block->instructions)) {
1226 struct qinst *qinst = (struct qinst *)block->instructions.next;
1227 struct schedule_node *n =
1228 rzalloc(mem_ctx, struct schedule_node);
1229
1230 n->inst = qinst;
1231
1232 list_del(&qinst->link);
1233 list_addtail(&n->link, &schedule_list);
1234 }
1235
1236 calculate_forward_deps(c, &schedule_list);
1237 calculate_reverse_deps(c, &schedule_list);
1238
1239 list_for_each_entry(struct schedule_node, n, &schedule_list, link) {
1240 compute_delay(n);
1241 }
1242
1243 uint32_t cycles = schedule_instructions(c, scoreboard, block,
1244 &schedule_list,
1245 orig_uniform_contents,
1246 orig_uniform_data,
1247 next_uniform);
1248
1249 ralloc_free(mem_ctx);
1250
1251 return cycles;
1252 }
1253
1254 static void
1255 qpu_set_branch_targets(struct v3d_compile *c)
1256 {
1257 vir_for_each_block(block, c) {
1258 /* The end block of the program has no branch. */
1259 if (!block->successors[0])
1260 continue;
1261
1262 /* If there was no branch instruction, then the successor
1263 * block must follow immediately after this one.
1264 */
1265 if (block->branch_qpu_ip == ~0) {
1266 assert(block->end_qpu_ip + 1 ==
1267 block->successors[0]->start_qpu_ip);
1268 continue;
1269 }
1270
1271 /* Walk back through the delay slots to find the branch
1272 * instr.
1273 */
1274 struct list_head *entry = block->instructions.prev;
1275 for (int i = 0; i < 3; i++)
1276 entry = entry->prev;
1277 struct qinst *branch = container_of(entry, branch, link);
1278 assert(branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
1279
1280 /* Make sure that the if-we-don't-jump
1281 * successor was scheduled just after the
1282 * delay slots.
1283 */
1284 assert(!block->successors[1] ||
1285 block->successors[1]->start_qpu_ip ==
1286 block->branch_qpu_ip + 4);
1287
1288 branch->qpu.branch.offset =
1289 ((block->successors[0]->start_qpu_ip -
1290 (block->branch_qpu_ip + 4)) *
1291 sizeof(uint64_t));
1292
1293 /* Set up the relative offset to jump in the
1294 * uniform stream.
1295 *
1296 * Use a temporary here, because
1297 * uniform_data[inst->uniform] may be shared
1298 * between multiple instructions.
1299 */
1300 assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
1301 c->uniform_data[branch->uniform] =
1302 (block->successors[0]->start_uniform -
1303 (block->branch_uniform + 1)) * 4;
1304 }
1305 }
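/* The branch offsets above are expressed in bytes relative to the
 * instruction following the three delay slots (branch_qpu_ip + 4), which is
 * why the instruction-count difference is scaled by sizeof(uint64_t).
 */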
1306
1307 uint32_t
1308 v3d_qpu_schedule_instructions(struct v3d_compile *c)
1309 {
1310 const struct v3d_device_info *devinfo = c->devinfo;
1311
1312 /* We reorder the uniforms as we schedule instructions, so save the
1313 * old data off and replace it.
1314 */
1315 uint32_t *uniform_data = c->uniform_data;
1316 enum quniform_contents *uniform_contents = c->uniform_contents;
1317 c->uniform_contents = ralloc_array(c, enum quniform_contents,
1318 c->num_uniforms);
1319 c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
1320 c->uniform_array_size = c->num_uniforms;
1321 uint32_t next_uniform = 0;
1322
1323 struct choose_scoreboard scoreboard;
1324 memset(&scoreboard, 0, sizeof(scoreboard));
1325 scoreboard.last_waddr_add = ~0;
1326 scoreboard.last_waddr_mul = ~0;
1327 scoreboard.last_ldvary_tick = -10;
1328 scoreboard.last_sfu_write_tick = -10;
1329 scoreboard.last_uniforms_reset_tick = -10;
1330
1331 if (debug) {
1332 fprintf(stderr, "Pre-schedule instructions\n");
1333 vir_for_each_block(block, c) {
1334 fprintf(stderr, "BLOCK %d\n", block->index);
1335 list_for_each_entry(struct qinst, qinst,
1336 &block->instructions, link) {
1337 v3d_qpu_dump(devinfo, &qinst->qpu);
1338 fprintf(stderr, "\n");
1339 }
1340 }
1341 fprintf(stderr, "\n");
1342 }
1343
1344 uint32_t cycles = 0;
1345 vir_for_each_block(block, c) {
1346 block->start_qpu_ip = c->qpu_inst_count;
1347 block->branch_qpu_ip = ~0;
1348 block->start_uniform = next_uniform;
1349
1350 cycles += qpu_schedule_instructions_block(c,
1351 &scoreboard,
1352 block,
1353 uniform_contents,
1354 uniform_data,
1355 &next_uniform);
1356
1357 block->end_qpu_ip = c->qpu_inst_count - 1;
1358 }
1359
1360 qpu_set_branch_targets(c);
1361
1362 assert(next_uniform == c->num_uniforms);
1363
1364 return cycles;
1365 }