1 /*
2 * Copyright © 2010 Intel Corporation
3 * Copyright © 2014-2017 Broadcom
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25 /**
26 * @file
27 *
28 * The basic model of the list scheduler is to take a basic block, compute a
29 * DAG of the dependencies, and make a list of the DAG heads. Heuristically
30 * pick a DAG head, then put all the children that are now DAG heads into the
31 * list of things to schedule.
32 *
33 * The goal of scheduling here is to pack pairs of operations together in a
34 * single QPU instruction.
35 */
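/*
 * A minimal sketch of that loop (illustrative only, assuming a hypothetical
 * emit() helper -- the real loop in schedule_instructions() below also
 * handles instruction pairing, NOP insertion, uniform rewriting and delay
 * slots):
 *
 *        while (!list_empty(schedule_list)) {
 *                struct schedule_node *chosen =
 *                        choose_instruction_to_schedule(devinfo, scoreboard,
 *                                                       schedule_list, NULL);
 *                list_del(&chosen->link);
 *                emit(chosen->inst);
 *                mark_instruction_scheduled(schedule_list, time, chosen, false);
 *        }
 */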
36
37 #include "qpu/qpu_disasm.h"
38 #include "v3d_compiler.h"
39 #include "util/ralloc.h"
40
41 static bool debug;
42
43 struct schedule_node_child;
44
45 struct schedule_node {
46 struct list_head link;
47 struct qinst *inst;
48 struct schedule_node_child *children;
49 uint32_t child_count;
50 uint32_t child_array_size;
51 uint32_t parent_count;
52
53         /* Latest (schedule time + instruction_latency()) of any parent of this node. */
54 uint32_t unblocked_time;
55
56 /**
57 * Minimum number of cycles from scheduling this instruction until the
58 * end of the program, based on the slowest dependency chain through
59 * the children.
60 */
61 uint32_t delay;
62
63 /**
64          * Cycles between this instruction being scheduled and when its result
65 * can be consumed.
66 */
67 uint32_t latency;
68 };
69
70 struct schedule_node_child {
71 struct schedule_node *node;
72 bool write_after_read;
73 };
74
75 /* When walking the instructions in reverse, we need to swap before/after in
76 * add_dep().
77 */
78 enum direction { F, R };
79
80 struct schedule_state {
81 struct schedule_node *last_r[6];
82 struct schedule_node *last_rf[64];
83 struct schedule_node *last_sf;
84 struct schedule_node *last_vpm_read;
85 struct schedule_node *last_tmu_write;
86 struct schedule_node *last_tlb;
87 struct schedule_node *last_vpm;
88 struct schedule_node *last_unif;
89 struct schedule_node *last_rtop;
90 enum direction dir;
91 /* Estimated cycle when the current instruction would start. */
92 uint32_t time;
93 };
94
95 static void
96 add_dep(struct schedule_state *state,
97 struct schedule_node *before,
98 struct schedule_node *after,
99 bool write)
100 {
101 bool write_after_read = !write && state->dir == R;
102
103 if (!before || !after)
104 return;
105
106 assert(before != after);
107
108 if (state->dir == R) {
109 struct schedule_node *t = before;
110 before = after;
111 after = t;
112 }
113
114 for (int i = 0; i < before->child_count; i++) {
115 if (before->children[i].node == after &&
116 (before->children[i].write_after_read == write_after_read)) {
117 return;
118 }
119 }
120
121 if (before->child_array_size <= before->child_count) {
122 before->child_array_size = MAX2(before->child_array_size * 2, 16);
123 before->children = reralloc(before, before->children,
124 struct schedule_node_child,
125 before->child_array_size);
126 }
127
128 before->children[before->child_count].node = after;
129 before->children[before->child_count].write_after_read =
130 write_after_read;
131 before->child_count++;
132 after->parent_count++;
133 }
134
135 static void
136 add_read_dep(struct schedule_state *state,
137 struct schedule_node *before,
138 struct schedule_node *after)
139 {
140 add_dep(state, before, after, false);
141 }
142
143 static void
144 add_write_dep(struct schedule_state *state,
145 struct schedule_node **before,
146 struct schedule_node *after)
147 {
148 add_dep(state, *before, after, true);
149 *before = after;
150 }
151
152 static bool
153 qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
154 {
155 if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
156 return false;
157
158 if (inst->alu.add.magic_write &&
159 (inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
160 inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
161 return true;
162
163 if (inst->alu.mul.magic_write &&
164 (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
165 inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
166 return true;
167
168 return false;
169 }
170
171 static void
172 process_mux_deps(struct schedule_state *state, struct schedule_node *n,
173 enum v3d_qpu_mux mux)
174 {
175 switch (mux) {
176 case V3D_QPU_MUX_A:
177 add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
178 break;
179 case V3D_QPU_MUX_B:
180 add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n);
181 break;
182 default:
183 add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
184 break;
185 }
186 }
187
188
189 static void
190 process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
191 uint32_t waddr, bool magic)
192 {
193 if (!magic) {
194 add_write_dep(state, &state->last_rf[waddr], n);
195 } else if (v3d_qpu_magic_waddr_is_tmu(waddr)) {
196 add_write_dep(state, &state->last_tmu_write, n);
197 } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
198 /* Handled by v3d_qpu_writes_r4() check. */
199 } else {
200 switch (waddr) {
201 case V3D_QPU_WADDR_R0:
202 case V3D_QPU_WADDR_R1:
203 case V3D_QPU_WADDR_R2:
204 case V3D_QPU_WADDR_R3:
205 case V3D_QPU_WADDR_R4:
206 case V3D_QPU_WADDR_R5:
207 add_write_dep(state,
208 &state->last_r[waddr - V3D_QPU_WADDR_R0],
209 n);
210 break;
211
212 case V3D_QPU_WADDR_VPM:
213 case V3D_QPU_WADDR_VPMU:
214 add_write_dep(state, &state->last_vpm, n);
215 break;
216
217 case V3D_QPU_WADDR_TLB:
218 case V3D_QPU_WADDR_TLBU:
219 add_write_dep(state, &state->last_tlb, n);
220 break;
221
222 case V3D_QPU_WADDR_NOP:
223 break;
224
225 default:
226 fprintf(stderr, "Unknown waddr %d\n", waddr);
227 abort();
228 }
229 }
230 }
231
232 static void
233 process_cond_deps(struct schedule_state *state, struct schedule_node *n,
234 enum v3d_qpu_cond cond)
235 {
236 if (cond != V3D_QPU_COND_NONE)
237 add_read_dep(state, state->last_sf, n);
238 }
239
240 static void
241 process_pf_deps(struct schedule_state *state, struct schedule_node *n,
242 enum v3d_qpu_pf pf)
243 {
244 if (pf != V3D_QPU_PF_NONE)
245 add_write_dep(state, &state->last_sf, n);
246 }
247
248 static void
249 process_uf_deps(struct schedule_state *state, struct schedule_node *n,
250 enum v3d_qpu_uf uf)
251 {
252 if (uf != V3D_QPU_UF_NONE)
253 add_write_dep(state, &state->last_sf, n);
254 }
255
256 /**
257 * Common code for dependencies that need to be tracked both forward and
258 * backward.
259 *
260 * This is for things like "all reads of r4 have to happen between the r4
261 * writes that surround them".
262 */
263 static void
264 calculate_deps(struct schedule_state *state, struct schedule_node *n)
265 {
266 struct qinst *qinst = n->inst;
267 struct v3d_qpu_instr *inst = &qinst->qpu;
268
269 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
270 if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS)
271 add_read_dep(state, state->last_sf, n);
272
273 /* XXX: BDI */
274 /* XXX: BDU */
275 /* XXX: ub */
276 /* XXX: raddr_a */
277
278 add_write_dep(state, &state->last_unif, n);
279 return;
280 }
281
282 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
283
284 /* XXX: LOAD_IMM */
285
286 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
287 process_mux_deps(state, n, inst->alu.add.a);
288 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
289 process_mux_deps(state, n, inst->alu.add.b);
290
291 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
292 process_mux_deps(state, n, inst->alu.mul.a);
293 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
294 process_mux_deps(state, n, inst->alu.mul.b);
295
296 switch (inst->alu.add.op) {
297 case V3D_QPU_A_VPMSETUP:
298 /* Could distinguish read/write by unpacking the uniform. */
299 add_write_dep(state, &state->last_vpm, n);
300 add_write_dep(state, &state->last_vpm_read, n);
301 break;
302
303 case V3D_QPU_A_STVPMV:
304 case V3D_QPU_A_STVPMD:
305 case V3D_QPU_A_STVPMP:
306 add_write_dep(state, &state->last_vpm, n);
307 break;
308
309 case V3D_QPU_A_MSF:
310 add_read_dep(state, state->last_tlb, n);
311 break;
312
313 case V3D_QPU_A_SETMSF:
314 case V3D_QPU_A_SETREVF:
315 add_write_dep(state, &state->last_tlb, n);
316 break;
317
318 case V3D_QPU_A_FLAPUSH:
319 case V3D_QPU_A_FLBPUSH:
320 case V3D_QPU_A_VFLA:
321 case V3D_QPU_A_VFLNA:
322 case V3D_QPU_A_VFLB:
323 case V3D_QPU_A_VFLNB:
324 add_read_dep(state, state->last_sf, n);
325 break;
326
327 case V3D_QPU_A_FLBPOP:
328 add_write_dep(state, &state->last_sf, n);
329 break;
330
331 default:
332 break;
333 }
334
335 switch (inst->alu.mul.op) {
336 case V3D_QPU_M_MULTOP:
337 case V3D_QPU_M_UMUL24:
338 /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and
339 * resets it to 0. We could possibly reorder umul24s relative
340 * to each other, but for now just keep all the MUL parts in
341 * order.
342 */
343 add_write_dep(state, &state->last_rtop, n);
344 break;
345 default:
346 break;
347 }
348
349 if (inst->alu.add.op != V3D_QPU_A_NOP) {
350 process_waddr_deps(state, n, inst->alu.add.waddr,
351 inst->alu.add.magic_write);
352 }
353 if (inst->alu.mul.op != V3D_QPU_M_NOP) {
354 process_waddr_deps(state, n, inst->alu.mul.waddr,
355 inst->alu.mul.magic_write);
356 }
357
358 if (v3d_qpu_writes_r3(inst))
359 add_write_dep(state, &state->last_r[3], n);
360 if (v3d_qpu_writes_r4(inst))
361 add_write_dep(state, &state->last_r[4], n);
362 if (v3d_qpu_writes_r5(inst))
363 add_write_dep(state, &state->last_r[5], n);
364
365 if (inst->sig.thrsw) {
366 /* All accumulator contents and flags are undefined after the
367 * switch.
368 */
369 for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
370 add_write_dep(state, &state->last_r[i], n);
371 add_write_dep(state, &state->last_sf, n);
372
373 /* Scoreboard-locking operations have to stay after the last
374 * thread switch.
375 */
376 add_write_dep(state, &state->last_tlb, n);
377
378 add_write_dep(state, &state->last_tmu_write, n);
379 }
380
381 if (inst->sig.ldtmu) {
382 /* TMU loads are coming from a FIFO, so ordering is important.
383 */
384 add_write_dep(state, &state->last_tmu_write, n);
385 }
386
387         if (inst->sig.ldtlb || inst->sig.ldtlbu)
388 add_read_dep(state, state->last_tlb, n);
389
390 if (inst->sig.ldvpm)
391 add_write_dep(state, &state->last_vpm_read, n);
392
393 /* inst->sig.ldunif or sideband uniform read */
394 if (qinst->uniform != ~0)
395 add_write_dep(state, &state->last_unif, n);
396
397 process_cond_deps(state, n, inst->flags.ac);
398 process_cond_deps(state, n, inst->flags.mc);
399 process_pf_deps(state, n, inst->flags.apf);
400 process_pf_deps(state, n, inst->flags.mpf);
401 process_uf_deps(state, n, inst->flags.auf);
402 process_uf_deps(state, n, inst->flags.muf);
403 }
404
405 static void
406 calculate_forward_deps(struct v3d_compile *c, struct list_head *schedule_list)
407 {
408 struct schedule_state state;
409
410 memset(&state, 0, sizeof(state));
411 state.dir = F;
412
413 list_for_each_entry(struct schedule_node, node, schedule_list, link)
414 calculate_deps(&state, node);
415 }
416
417 static void
418 calculate_reverse_deps(struct v3d_compile *c, struct list_head *schedule_list)
419 {
420 struct list_head *node;
421 struct schedule_state state;
422
423 memset(&state, 0, sizeof(state));
424 state.dir = R;
425
426 for (node = schedule_list->prev; schedule_list != node; node = node->prev) {
427 calculate_deps(&state, (struct schedule_node *)node);
428 }
429 }
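/*
 * A worked example of the two passes above, using a hypothetical sequence:
 *
 *        I0: recip  r4,  rf2         (SFU write of r4)
 *        I1: fmul   rf3, r4, r4      (read of r4)
 *        I2: rsqrt  r4,  rf4         (SFU write of r4)
 *
 * The forward pass adds the read-after-write edge I0 -> I1 and the
 * write-after-write edge I0 -> I2; the reverse pass adds the
 * write-after-read edge I1 -> I2.  Together they pin the r4 read between
 * the two r4 writes that surround it, as described above calculate_deps().
 */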
430
431 struct choose_scoreboard {
432 int tick;
433 int last_sfu_write_tick;
434 int last_ldvary_tick;
435 int last_uniforms_reset_tick;
436 uint32_t last_waddr_add, last_waddr_mul;
437 bool tlb_locked;
438 };
439
440 static bool
441 mux_reads_too_soon(struct choose_scoreboard *scoreboard,
442 const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
443 {
444 switch (mux) {
445 case V3D_QPU_MUX_A:
446 if (scoreboard->last_waddr_add == inst->raddr_a ||
447 scoreboard->last_waddr_mul == inst->raddr_a) {
448 return true;
449 }
450 break;
451
452 case V3D_QPU_MUX_B:
453 if (scoreboard->last_waddr_add == inst->raddr_b ||
454 scoreboard->last_waddr_mul == inst->raddr_b) {
455 return true;
456 }
457 break;
458
459 case V3D_QPU_MUX_R4:
460 if (scoreboard->tick - scoreboard->last_sfu_write_tick <= 2)
461 return true;
462 break;
463
464 case V3D_QPU_MUX_R5:
465 if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
466 return true;
467 break;
468 default:
469 break;
470 }
471
472 return false;
473 }
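/*
 * A worked example of the r4 hazard above, with hypothetical tick numbers:
 * if an SFU write landed at tick 10, an instruction reading r4 through mux
 * R4 is rejected at ticks 11 and 12 and only becomes schedulable at tick 13,
 * once tick - last_sfu_write_tick exceeds 2.
 */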
474
475 static bool
476 reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
477 struct qinst *qinst)
478 {
479 const struct v3d_qpu_instr *inst = &qinst->qpu;
480
481 /* XXX: Branching off of raddr. */
482 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
483 return false;
484
485 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
486
487 if (inst->alu.add.op != V3D_QPU_A_NOP) {
488 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
489 mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
490 return true;
491 }
492 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
493 mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
494 return true;
495 }
496 }
497
498 if (inst->alu.mul.op != V3D_QPU_M_NOP) {
499 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
500 mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
501 return true;
502 }
503 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
504 mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
505 return true;
506 }
507 }
508
509 /* XXX: imm */
510
511 return false;
512 }
513
514 static bool
515 writes_too_soon_after_write(struct choose_scoreboard *scoreboard,
516 struct qinst *qinst)
517 {
518 const struct v3d_qpu_instr *inst = &qinst->qpu;
519
520 /* Don't schedule any other r4 write too soon after an SFU write.
521 * This would normally be prevented by dependency tracking, but might
522 * occur if a dead SFU computation makes it to scheduling.
523 */
524 if (scoreboard->tick - scoreboard->last_sfu_write_tick < 2 &&
525 v3d_qpu_writes_r4(inst))
526 return true;
527
528 return false;
529 }
530
531 static bool
532 pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard,
533 const struct v3d_qpu_instr *inst)
534 {
535 return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst));
536 }
537
538 static int
539 get_instruction_priority(const struct v3d_qpu_instr *inst)
540 {
541 uint32_t baseline_score;
542 uint32_t next_score = 0;
543
544 /* Schedule TLB operations as late as possible, to get more
545 * parallelism between shaders.
546 */
547 if (qpu_inst_is_tlb(inst))
548 return next_score;
549 next_score++;
550
551 /* Schedule texture read results collection late to hide latency. */
552 if (inst->sig.ldtmu)
553 return next_score;
554 next_score++;
555
556 /* Default score for things that aren't otherwise special. */
557 baseline_score = next_score;
558 next_score++;
559
560         /* Schedule texture read setup early to better hide its latency. */
561 if (inst->type == V3D_QPU_INSTR_TYPE_ALU &&
562 ((inst->alu.add.magic_write &&
563 v3d_qpu_magic_waddr_is_tmu(inst->alu.add.waddr)) ||
564 (inst->alu.mul.magic_write &&
565 v3d_qpu_magic_waddr_is_tmu(inst->alu.mul.waddr)))) {
566 return next_score;
567 }
568 next_score++;
569
570 return baseline_score;
571 }
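/*
 * The resulting priority order, lowest to highest: TLB accesses (0), ldtmu
 * result collection (1), everything else (2), and TMU setup writes (3).
 */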
572
573 static bool
574 qpu_magic_waddr_is_periph(enum v3d_qpu_waddr waddr)
575 {
576 return (v3d_qpu_magic_waddr_is_tmu(waddr) ||
577 v3d_qpu_magic_waddr_is_sfu(waddr) ||
578 v3d_qpu_magic_waddr_is_tlb(waddr) ||
579 v3d_qpu_magic_waddr_is_vpm(waddr) ||
580 v3d_qpu_magic_waddr_is_tsy(waddr));
581 }
582
583 static bool
584 qpu_accesses_peripheral(const struct v3d_qpu_instr *inst)
585 {
586 if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
587 if (inst->alu.add.op != V3D_QPU_A_NOP &&
588 inst->alu.add.magic_write &&
589 qpu_magic_waddr_is_periph(inst->alu.add.waddr)) {
590 return true;
591 }
592
593 if (inst->alu.mul.op != V3D_QPU_M_NOP &&
594 inst->alu.mul.magic_write &&
595 qpu_magic_waddr_is_periph(inst->alu.mul.waddr)) {
596 return true;
597 }
598 }
599
600 return (inst->sig.ldvpm ||
601 inst->sig.ldtmu ||
602 inst->sig.ldtlb ||
603 inst->sig.ldtlbu);
604 }
605
606 static bool
607 qpu_merge_inst(const struct v3d_device_info *devinfo,
608 struct v3d_qpu_instr *result,
609 const struct v3d_qpu_instr *a,
610 const struct v3d_qpu_instr *b)
611 {
612 if (a->type != V3D_QPU_INSTR_TYPE_ALU ||
613 b->type != V3D_QPU_INSTR_TYPE_ALU) {
614 return false;
615 }
616
617 /* Can't do more than one peripheral access in an instruction. */
618 if (qpu_accesses_peripheral(a) && qpu_accesses_peripheral(b))
619 return false;
620
621 struct v3d_qpu_instr merge = *a;
622
623 if (b->alu.add.op != V3D_QPU_A_NOP) {
624 if (a->alu.add.op != V3D_QPU_A_NOP)
625 return false;
626 merge.alu.add = b->alu.add;
627
628 merge.flags.ac = b->flags.ac;
629 merge.flags.apf = b->flags.apf;
630 merge.flags.auf = b->flags.auf;
631 }
632
633 if (b->alu.mul.op != V3D_QPU_M_NOP) {
634 if (a->alu.mul.op != V3D_QPU_M_NOP)
635 return false;
636 merge.alu.mul = b->alu.mul;
637
638 merge.flags.mc = b->flags.mc;
639 merge.flags.mpf = b->flags.mpf;
640 merge.flags.muf = b->flags.muf;
641 }
642
643 if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) {
644 if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A) &&
645 a->raddr_a != b->raddr_a) {
646 return false;
647 }
648 merge.raddr_a = b->raddr_a;
649 }
650
651 if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) {
652 if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_B) &&
653 a->raddr_b != b->raddr_b) {
654 return false;
655 }
656 merge.raddr_b = b->raddr_b;
657 }
658
659 merge.sig.thrsw |= b->sig.thrsw;
660 merge.sig.ldunif |= b->sig.ldunif;
661 merge.sig.ldtmu |= b->sig.ldtmu;
662 merge.sig.ldvary |= b->sig.ldvary;
663 merge.sig.ldvpm |= b->sig.ldvpm;
664 merge.sig.small_imm |= b->sig.small_imm;
665 merge.sig.ldtlb |= b->sig.ldtlb;
666 merge.sig.ldtlbu |= b->sig.ldtlbu;
667 merge.sig.ucb |= b->sig.ucb;
668 merge.sig.rotate |= b->sig.rotate;
669 merge.sig.wrtmuc |= b->sig.wrtmuc;
670
671 uint64_t packed;
672 bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed);
673
674 *result = merge;
675 /* No modifying the real instructions on failure. */
676 assert(ok || (a != result && b != result));
677
678 return ok;
679 }
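/*
 * Example of a successful merge, using a hypothetical pair that reads the
 * same two physical registers:
 *
 *        fadd rf10, rf1, rf2
 *        fmul rf11, rf1, rf2
 *
 * raddr_a and raddr_b agree, only one instruction uses each of the add and
 * mul ALUs, and neither touches a peripheral, so the two pack into a single
 * QPU instruction.  The merge is rejected when both use the same ALU, when
 * their raddrs conflict, when both access a peripheral, or when
 * v3d_qpu_instr_pack() can't encode the combined result.
 */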
680
681 static struct schedule_node *
682 choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
683 struct choose_scoreboard *scoreboard,
684 struct list_head *schedule_list,
685 struct schedule_node *prev_inst)
686 {
687 struct schedule_node *chosen = NULL;
688 int chosen_prio = 0;
689
690 /* Don't pair up anything with a thread switch signal -- emit_thrsw()
691 * will handle pairing it along with filling the delay slots.
692 */
693 if (prev_inst) {
694 if (prev_inst->inst->qpu.sig.thrsw)
695 return NULL;
696 }
697
698 list_for_each_entry(struct schedule_node, n, schedule_list, link) {
699 const struct v3d_qpu_instr *inst = &n->inst->qpu;
700
701 /* Don't choose the branch instruction until it's the last one
702 * left. We'll move it up to fit its delay slots after we
703 * choose it.
704 */
705 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
706 !list_is_singular(schedule_list)) {
707 continue;
708 }
709
710 /* "An instruction must not read from a location in physical
711 * regfile A or B that was written to by the previous
712 * instruction."
713 */
714 if (reads_too_soon_after_write(scoreboard, n->inst))
715 continue;
716
717 if (writes_too_soon_after_write(scoreboard, n->inst))
718 continue;
719
720 /* "A scoreboard wait must not occur in the first two
721 * instructions of a fragment shader. This is either the
722 * explicit Wait for Scoreboard signal or an implicit wait
723 * with the first tile-buffer read or write instruction."
724 */
725 if (pixel_scoreboard_too_soon(scoreboard, inst))
726 continue;
727
728                 /* ldunif and ldvary both write r5, but ldunif does so a tick
729                  * sooner.  If the ldvary's r5 result wasn't used, an ldunif
730                  * could otherwise get scheduled right after it, making both
731                  * try to update r5 in the same tick.
732 */
733 if (inst->sig.ldunif &&
734 scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
735 continue;
736 }
737
738 /* If we're trying to pair with another instruction, check
739 * that they're compatible.
740 */
741 if (prev_inst) {
742 /* Don't pair up a thread switch signal -- we'll
743 * handle pairing it when we pick it on its own.
744 */
745 if (inst->sig.thrsw)
746 continue;
747
748 if (prev_inst->inst->uniform != -1 &&
749 n->inst->uniform != -1)
750 continue;
751
752 /* Don't merge in something that will lock the TLB.
753                  * Hopefully what we have in inst will release some
754 * other instructions, allowing us to delay the
755 * TLB-locking instruction until later.
756 */
757 if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
758 continue;
759
760 struct v3d_qpu_instr merged_inst;
761 if (!qpu_merge_inst(devinfo, &merged_inst,
762 &prev_inst->inst->qpu, inst)) {
763 continue;
764 }
765 }
766
767 int prio = get_instruction_priority(inst);
768
769 /* Found a valid instruction. If nothing better comes along,
770 * this one works.
771 */
772 if (!chosen) {
773 chosen = n;
774 chosen_prio = prio;
775 continue;
776 }
777
778 if (prio > chosen_prio) {
779 chosen = n;
780 chosen_prio = prio;
781 } else if (prio < chosen_prio) {
782 continue;
783 }
784
785 if (n->delay > chosen->delay) {
786 chosen = n;
787 chosen_prio = prio;
788 } else if (n->delay < chosen->delay) {
789 continue;
790 }
791 }
792
793 return chosen;
794 }
795
796 static void
797 update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
798 enum v3d_qpu_waddr waddr)
799 {
800 if (v3d_qpu_magic_waddr_is_sfu(waddr))
801 scoreboard->last_sfu_write_tick = scoreboard->tick;
802 }
803
804 static void
805 update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
806 const struct v3d_qpu_instr *inst)
807 {
808 scoreboard->last_waddr_add = ~0;
809 scoreboard->last_waddr_mul = ~0;
810
811 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
812 return;
813
814 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
815
816 if (inst->alu.add.op != V3D_QPU_A_NOP) {
817 if (inst->alu.add.magic_write) {
818 update_scoreboard_for_magic_waddr(scoreboard,
819 inst->alu.add.waddr);
820 } else {
821 scoreboard->last_waddr_add = inst->alu.add.waddr;
822 }
823 }
824
825 if (inst->alu.mul.op != V3D_QPU_M_NOP) {
826 if (inst->alu.mul.magic_write) {
827 update_scoreboard_for_magic_waddr(scoreboard,
828 inst->alu.mul.waddr);
829 } else {
830 scoreboard->last_waddr_mul = inst->alu.mul.waddr;
831 }
832 }
833
834 if (inst->sig.ldvary)
835 scoreboard->last_ldvary_tick = scoreboard->tick;
836
837 if (qpu_inst_is_tlb(inst))
838 scoreboard->tlb_locked = true;
839 }
840
841 static void
842 dump_state(const struct v3d_device_info *devinfo,
843 struct list_head *schedule_list)
844 {
845 list_for_each_entry(struct schedule_node, n, schedule_list, link) {
846 fprintf(stderr, " t=%4d: ", n->unblocked_time);
847 v3d_qpu_dump(devinfo, &n->inst->qpu);
848 fprintf(stderr, "\n");
849
850 for (int i = 0; i < n->child_count; i++) {
851 struct schedule_node *child = n->children[i].node;
852 if (!child)
853 continue;
854
855 fprintf(stderr, " - ");
856 v3d_qpu_dump(devinfo, &child->inst->qpu);
857 fprintf(stderr, " (%d parents, %c)\n",
858 child->parent_count,
859 n->children[i].write_after_read ? 'w' : 'r');
860 }
861 }
862 }
863
864 static uint32_t magic_waddr_latency(enum v3d_qpu_waddr waddr,
865 const struct v3d_qpu_instr *after)
866 {
867 /* Apply some huge latency between texture fetch requests and getting
868 * their results back.
869 *
870 * FIXME: This is actually pretty bogus. If we do:
871 *
872 * mov tmu0_s, a
873 * <a bit of math>
874 * mov tmu0_s, b
875 * load_tmu0
876 * <more math>
877 * load_tmu0
878 *
879 * we count that as worse than
880 *
881 * mov tmu0_s, a
882 * mov tmu0_s, b
883 * <lots of math>
884 * load_tmu0
885 * <more math>
886 * load_tmu0
887 *
888 * because we associate the first load_tmu0 with the *second* tmu0_s.
889 */
890 if (v3d_qpu_magic_waddr_is_tmu(waddr) && after->sig.ldtmu)
891 return 100;
892
893 /* Assume that anything depending on us is consuming the SFU result. */
894 if (v3d_qpu_magic_waddr_is_sfu(waddr))
895 return 3;
896
897 return 1;
898 }
899
900 static uint32_t
901 instruction_latency(struct schedule_node *before, struct schedule_node *after)
902 {
903 const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
904 const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
905 uint32_t latency = 1;
906
907 if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU ||
908 after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
909 return latency;
910
911 if (before_inst->alu.add.magic_write) {
912 latency = MAX2(latency,
913 magic_waddr_latency(before_inst->alu.add.waddr,
914 after_inst));
915 }
916
917 if (before_inst->alu.mul.magic_write) {
918 latency = MAX2(latency,
919 magic_waddr_latency(before_inst->alu.mul.waddr,
920 after_inst));
921 }
922
923 return latency;
924 }
925
926 /** Recursive computation of the delay member of a node. */
927 static void
928 compute_delay(struct schedule_node *n)
929 {
930 if (!n->child_count) {
931 n->delay = 1;
932 } else {
933 for (int i = 0; i < n->child_count; i++) {
934 if (!n->children[i].node->delay)
935 compute_delay(n->children[i].node);
936 n->delay = MAX2(n->delay,
937 n->children[i].node->delay +
938 instruction_latency(n, n->children[i].node));
939 }
940 }
941 }
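/*
 * Example with a hypothetical three-node chain A -> B -> C, where A is a TMU
 * setup write, B collects the result with ldtmu, and C consumes B's output:
 * delay(C) = 1, delay(B) = delay(C) + 1 = 2, and delay(A) = delay(B) + 100 =
 * 102 (per magic_waddr_latency() above), so the scheduler strongly prefers
 * to issue A as early as possible.
 */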
942
943 static void
944 mark_instruction_scheduled(struct list_head *schedule_list,
945 uint32_t time,
946 struct schedule_node *node,
947 bool war_only)
948 {
949 if (!node)
950 return;
951
952 for (int i = node->child_count - 1; i >= 0; i--) {
953 struct schedule_node *child =
954 node->children[i].node;
955
956 if (!child)
957 continue;
958
959 if (war_only && !node->children[i].write_after_read)
960 continue;
961
962 /* If the requirement is only that the node not appear before
963 * the last read of its destination, then it can be scheduled
964 * immediately after (or paired with!) the thing reading the
965 * destination.
966 */
967 uint32_t latency = 0;
968 if (!war_only) {
969 latency = instruction_latency(node,
970 node->children[i].node);
971 }
972
973 child->unblocked_time = MAX2(child->unblocked_time,
974 time + latency);
975 child->parent_count--;
976 if (child->parent_count == 0)
977 list_add(&child->link, schedule_list);
978
979 node->children[i].node = NULL;
980 }
981 }
982
983 static struct qinst *
984 vir_nop(void)
985 {
986 struct qreg undef = { QFILE_NULL, 0 };
987 struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);
988
989 return qinst;
990 }
991
992 #if 0
993 static struct qinst *
994 nop_after(struct qinst *inst)
995 {
996 struct qinst *q = vir_nop();
997
998 list_add(&q->link, &inst->link);
999
1000 return q;
1001 }
1002
1003 /**
1004 * Emits a THRSW/LTHRSW signal in the stream, trying to move it up to pair
1005 * with another instruction.
1006 */
1007 static void
1008 emit_thrsw(struct v3d_compile *c,
1009 struct choose_scoreboard *scoreboard,
1010 const struct v3d_qpu_instr *inst)
1011 {
1012 /* There should be nothing in a thrsw inst being scheduled other than
1013 * the signal bits.
1014 */
1015 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
1016 assert(inst->alu.add.op == V3D_QPU_A_NOP);
1017 assert(inst->alu.mul.op == V3D_QPU_M_NOP);
1018
1019 /* Try to find an earlier scheduled instruction that we can merge the
1020 * thrsw into.
1021 */
1022 int thrsw_ip = c->qpu_inst_count;
1023 for (int i = 1; i <= MIN2(c->qpu_inst_count, 3); i++) {
1024 uint64_t prev_instr = c->qpu_insts[c->qpu_inst_count - i];
1025 uint32_t prev_sig = QPU_GET_FIELD(prev_instr, QPU_SIG);
1026
1027 if (prev_sig == QPU_SIG_NONE)
1028 thrsw_ip = c->qpu_inst_count - i;
1029 }
1030
1031 if (thrsw_ip != c->qpu_inst_count) {
1032 /* Merge the thrsw into the existing instruction. */
1033 c->qpu_insts[thrsw_ip] =
1034 QPU_UPDATE_FIELD(c->qpu_insts[thrsw_ip], sig, QPU_SIG);
1035 } else {
1036 qpu_serialize_one_inst(c, inst);
1037 update_scoreboard_for_chosen(scoreboard, inst);
1038 }
1039
1040 /* Fill the delay slots. */
1041 while (c->qpu_inst_count < thrsw_ip + 3) {
1042 update_scoreboard_for_chosen(scoreboard, v3d_qpu_nop());
1043 qpu_serialize_one_inst(c, v3d_qpu_nop());
1044 }
1045 }
1046 #endif
1047
1048 static uint32_t
1049 schedule_instructions(struct v3d_compile *c,
1050 struct choose_scoreboard *scoreboard,
1051 struct qblock *block,
1052 struct list_head *schedule_list,
1053 enum quniform_contents *orig_uniform_contents,
1054 uint32_t *orig_uniform_data,
1055 uint32_t *next_uniform)
1056 {
1057 const struct v3d_device_info *devinfo = c->devinfo;
1058 uint32_t time = 0;
1059
1060 if (debug) {
1061 fprintf(stderr, "initial deps:\n");
1062 dump_state(devinfo, schedule_list);
1063 fprintf(stderr, "\n");
1064 }
1065
1066 /* Remove non-DAG heads from the list. */
1067 list_for_each_entry_safe(struct schedule_node, n, schedule_list, link) {
1068 if (n->parent_count != 0)
1069 list_del(&n->link);
1070 }
1071
1072 while (!list_empty(schedule_list)) {
1073 struct schedule_node *chosen =
1074 choose_instruction_to_schedule(devinfo,
1075 scoreboard,
1076 schedule_list,
1077 NULL);
1078 struct schedule_node *merge = NULL;
1079
1080 /* If there are no valid instructions to schedule, drop a NOP
1081 * in.
1082 */
1083 struct qinst *qinst = chosen ? chosen->inst : vir_nop();
1084 struct v3d_qpu_instr *inst = &qinst->qpu;
1085
1086 if (debug) {
1087 fprintf(stderr, "t=%4d: current list:\n",
1088 time);
1089 dump_state(devinfo, schedule_list);
1090 fprintf(stderr, "t=%4d: chose: ", time);
1091 v3d_qpu_dump(devinfo, inst);
1092 fprintf(stderr, "\n");
1093 }
1094
1095 /* Schedule this instruction onto the QPU list. Also try to
1096 * find an instruction to pair with it.
1097 */
1098 if (chosen) {
1099 time = MAX2(chosen->unblocked_time, time);
1100 list_del(&chosen->link);
1101 mark_instruction_scheduled(schedule_list, time,
1102 chosen, true);
1103
1104 merge = choose_instruction_to_schedule(devinfo,
1105 scoreboard,
1106 schedule_list,
1107 chosen);
1108 if (merge) {
1109 time = MAX2(merge->unblocked_time, time);
1110 list_del(&merge->link);
1111 (void)qpu_merge_inst(devinfo, inst,
1112 inst, &merge->inst->qpu);
1113 if (merge->inst->uniform != -1) {
1114 chosen->inst->uniform =
1115 merge->inst->uniform;
1116 }
1117
1118 if (debug) {
1119 fprintf(stderr, "t=%4d: merging: ",
1120 time);
1121 v3d_qpu_dump(devinfo, &merge->inst->qpu);
1122 fprintf(stderr, "\n");
1123 fprintf(stderr, " result: ");
1124 v3d_qpu_dump(devinfo, inst);
1125 fprintf(stderr, "\n");
1126 }
1127 }
1128 }
1129
1130 /* Update the uniform index for the rewritten location --
1131 * branch target updating will still need to change
1132 * c->uniform_data[] using this index.
1133 */
1134 if (qinst->uniform != -1) {
1135 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
1136 block->branch_uniform = *next_uniform;
1137
1138 c->uniform_data[*next_uniform] =
1139 orig_uniform_data[qinst->uniform];
1140 c->uniform_contents[*next_uniform] =
1141 orig_uniform_contents[qinst->uniform];
1142 qinst->uniform = *next_uniform;
1143 (*next_uniform)++;
1144 }
1145
1146 if (debug) {
1147 fprintf(stderr, "\n");
1148 }
1149
1150 /* Now that we've scheduled a new instruction, some of its
1151 * children can be promoted to the list of instructions ready to
1152 * be scheduled. Update the children's unblocked time for this
1153 * DAG edge as we do so.
1154 */
1155 mark_instruction_scheduled(schedule_list, time, chosen, false);
1156
1157 if (merge) {
1158 mark_instruction_scheduled(schedule_list, time, merge,
1159 false);
1160
1161 /* The merged VIR instruction doesn't get re-added to the
1162 * block, so free it now.
1163 */
1164 free(merge->inst);
1165 }
1166
1167 if (0 && inst->sig.thrsw) {
1168 /* XXX emit_thrsw(c, scoreboard, qinst); */
1169 } else {
1170 c->qpu_inst_count++;
1171 list_addtail(&qinst->link, &block->instructions);
1172 update_scoreboard_for_chosen(scoreboard, inst);
1173 }
1174
1175 scoreboard->tick++;
1176 time++;
1177
1178 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH ||
1179 inst->sig.thrsw /* XXX */) {
1180 block->branch_qpu_ip = c->qpu_inst_count - 1;
1181 /* Fill the delay slots.
1182 *
1183 * We should fill these with actual instructions,
1184 * instead, but that will probably need to be done
1185 * after this, once we know what the leading
1186 * instructions of the successors are (so we can
1187 * handle A/B register file write latency)
1188 */
1189 /* XXX: scoreboard */
1190 int slots = (inst->type == V3D_QPU_INSTR_TYPE_BRANCH ?
1191 3 : 2);
1192 for (int i = 0; i < slots; i++) {
1193 struct qinst *nop = vir_nop();
1194 list_addtail(&nop->link, &block->instructions);
1195
1196 update_scoreboard_for_chosen(scoreboard,
1197 &nop->qpu);
1198 c->qpu_inst_count++;
1199 scoreboard->tick++;
1200 time++;
1201 }
1202 }
1203 }
1204
1205 return time;
1206 }
1207
1208 static uint32_t
1209 qpu_schedule_instructions_block(struct v3d_compile *c,
1210 struct choose_scoreboard *scoreboard,
1211 struct qblock *block,
1212 enum quniform_contents *orig_uniform_contents,
1213 uint32_t *orig_uniform_data,
1214 uint32_t *next_uniform)
1215 {
1216 void *mem_ctx = ralloc_context(NULL);
1217 struct list_head schedule_list;
1218
1219 list_inithead(&schedule_list);
1220
1221 /* Wrap each instruction in a scheduler structure. */
1222 while (!list_empty(&block->instructions)) {
1223 struct qinst *qinst = (struct qinst *)block->instructions.next;
1224 struct schedule_node *n =
1225 rzalloc(mem_ctx, struct schedule_node);
1226
1227 n->inst = qinst;
1228
1229 list_del(&qinst->link);
1230 list_addtail(&n->link, &schedule_list);
1231 }
1232
1233 calculate_forward_deps(c, &schedule_list);
1234 calculate_reverse_deps(c, &schedule_list);
1235
1236 list_for_each_entry(struct schedule_node, n, &schedule_list, link) {
1237 compute_delay(n);
1238 }
1239
1240 uint32_t cycles = schedule_instructions(c, scoreboard, block,
1241 &schedule_list,
1242 orig_uniform_contents,
1243 orig_uniform_data,
1244 next_uniform);
1245
1246 ralloc_free(mem_ctx);
1247
1248 return cycles;
1249 }
1250
1251 static void
1252 qpu_set_branch_targets(struct v3d_compile *c)
1253 {
1254 vir_for_each_block(block, c) {
1255 /* The end block of the program has no branch. */
1256 if (!block->successors[0])
1257 continue;
1258
1259 /* If there was no branch instruction, then the successor
1260 * block must follow immediately after this one.
1261 */
1262 if (block->branch_qpu_ip == ~0) {
1263 assert(block->end_qpu_ip + 1 ==
1264 block->successors[0]->start_qpu_ip);
1265 continue;
1266 }
1267
1268 /* Walk back through the delay slots to find the branch
1269 * instr.
1270 */
1271 struct list_head *entry = block->instructions.prev;
1272 for (int i = 0; i < 3; i++)
1273 entry = entry->prev;
1274 struct qinst *branch = container_of(entry, branch, link);
1275 assert(branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
1276
1277 /* Make sure that the if-we-don't-jump
1278 * successor was scheduled just after the
1279 * delay slots.
1280 */
1281 assert(!block->successors[1] ||
1282 block->successors[1]->start_qpu_ip ==
1283 block->branch_qpu_ip + 4);
1284
1285 branch->qpu.branch.offset =
1286 ((block->successors[0]->start_qpu_ip -
1287 (block->branch_qpu_ip + 4)) *
1288 sizeof(uint64_t));
1289
1290 /* Set up the relative offset to jump in the
1291 * uniform stream.
1292 *
1293 * Use a temporary here, because
1294 * uniform_data[inst->uniform] may be shared
1295 * between multiple instructions.
1296 */
1297 assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
1298 c->uniform_data[branch->uniform] =
1299 (block->successors[0]->start_uniform -
1300 (block->branch_uniform + 1)) * 4;
1301 }
1302 }
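/*
 * Worked example with hypothetical addresses: if the branch was scheduled at
 * branch_qpu_ip 20 and its taken successor starts at qpu_ip 40, the encoded
 * offset is (40 - (20 + 4)) * sizeof(uint64_t) = 128 bytes, i.e. relative to
 * the instruction after the three delay slots.  The uniform-stream offset is
 * measured the same way, from the slot after the branch's own uniform,
 * scaled by the 4-byte uniform size.
 */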
1303
1304 uint32_t
1305 v3d_qpu_schedule_instructions(struct v3d_compile *c)
1306 {
1307 const struct v3d_device_info *devinfo = c->devinfo;
1308
1309 /* We reorder the uniforms as we schedule instructions, so save the
1310 * old data off and replace it.
1311 */
1312 uint32_t *uniform_data = c->uniform_data;
1313 enum quniform_contents *uniform_contents = c->uniform_contents;
1314 c->uniform_contents = ralloc_array(c, enum quniform_contents,
1315 c->num_uniforms);
1316 c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
1317 c->uniform_array_size = c->num_uniforms;
1318 uint32_t next_uniform = 0;
1319
1320 struct choose_scoreboard scoreboard;
1321 memset(&scoreboard, 0, sizeof(scoreboard));
1322 scoreboard.last_waddr_add = ~0;
1323 scoreboard.last_waddr_mul = ~0;
1324 scoreboard.last_ldvary_tick = -10;
1325 scoreboard.last_sfu_write_tick = -10;
1326 scoreboard.last_uniforms_reset_tick = -10;
1327
1328 if (debug) {
1329 fprintf(stderr, "Pre-schedule instructions\n");
1330 vir_for_each_block(block, c) {
1331 fprintf(stderr, "BLOCK %d\n", block->index);
1332 list_for_each_entry(struct qinst, qinst,
1333 &block->instructions, link) {
1334 v3d_qpu_dump(devinfo, &qinst->qpu);
1335 fprintf(stderr, "\n");
1336 }
1337 }
1338 fprintf(stderr, "\n");
1339 }
1340
1341 uint32_t cycles = 0;
1342 vir_for_each_block(block, c) {
1343 block->start_qpu_ip = c->qpu_inst_count;
1344 block->branch_qpu_ip = ~0;
1345 block->start_uniform = next_uniform;
1346
1347 cycles += qpu_schedule_instructions_block(c,
1348 &scoreboard,
1349 block,
1350 uniform_contents,
1351 uniform_data,
1352 &next_uniform);
1353
1354 block->end_qpu_ip = c->qpu_inst_count - 1;
1355 }
1356
1357 qpu_set_branch_targets(c);
1358
1359 assert(next_uniform == c->num_uniforms);
1360
1361 return cycles;
1362 }