broadcom/vc5: Add support for V3Dv4 signal bits.
[mesa.git] / src / broadcom / compiler / qpu_schedule.c
/*
 * Copyright © 2010 Intel Corporation
 * Copyright © 2014-2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file
 *
 * The basic model of the list scheduler is to take a basic block, compute a
 * DAG of the dependencies, and make a list of the DAG heads.  Heuristically
 * pick a DAG head, then put all the children that are now DAG heads into the
 * list of things to schedule.
 *
 * The goal of scheduling here is to pack pairs of operations together in a
 * single QPU instruction.
 */

#include "qpu/qpu_disasm.h"
#include "v3d_compiler.h"
#include "util/ralloc.h"

static bool debug;

struct schedule_node_child;

struct schedule_node {
        struct list_head link;
        struct qinst *inst;
        struct schedule_node_child *children;
        uint32_t child_count;
        uint32_t child_array_size;
        uint32_t parent_count;

        /* Longest cycles + instruction_latency() of any parent of this node. */
        uint32_t unblocked_time;

        /**
         * Minimum number of cycles from scheduling this instruction until the
         * end of the program, based on the slowest dependency chain through
         * the children.
         */
        uint32_t delay;

        /**
         * Cycles between this instruction being scheduled and when its result
         * can be consumed.
         */
        uint32_t latency;
};

struct schedule_node_child {
        struct schedule_node *node;
        bool write_after_read;
};

/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 */
enum direction { F, R };

struct schedule_state {
        const struct v3d_device_info *devinfo;
        struct schedule_node *last_r[6];
        struct schedule_node *last_rf[64];
        struct schedule_node *last_sf;
        struct schedule_node *last_vpm_read;
        struct schedule_node *last_tmu_write;
        struct schedule_node *last_tlb;
        struct schedule_node *last_vpm;
        struct schedule_node *last_unif;
        struct schedule_node *last_rtop;
        enum direction dir;
        /* Estimated cycle when the current instruction would start. */
        uint32_t time;
};

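/* Adds a dependency edge from "before" to "after" (swapping the two when the
 * block is being walked in reverse), recording whether the edge is a
 * write-after-read dependency.  Duplicate edges are skipped, and the child
 * array grows on demand.
 */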
static void
add_dep(struct schedule_state *state,
        struct schedule_node *before,
        struct schedule_node *after,
        bool write)
{
        bool write_after_read = !write && state->dir == R;

        if (!before || !after)
                return;

        assert(before != after);

        if (state->dir == R) {
                struct schedule_node *t = before;
                before = after;
                after = t;
        }

        for (int i = 0; i < before->child_count; i++) {
                if (before->children[i].node == after &&
                    (before->children[i].write_after_read == write_after_read)) {
                        return;
                }
        }

        if (before->child_array_size <= before->child_count) {
                before->child_array_size = MAX2(before->child_array_size * 2, 16);
                before->children = reralloc(before, before->children,
                                            struct schedule_node_child,
                                            before->child_array_size);
        }

        before->children[before->child_count].node = after;
        before->children[before->child_count].write_after_read =
                write_after_read;
        before->child_count++;
        after->parent_count++;
}

static void
add_read_dep(struct schedule_state *state,
             struct schedule_node *before,
             struct schedule_node *after)
{
        add_dep(state, before, after, false);
}

static void
add_write_dep(struct schedule_state *state,
              struct schedule_node **before,
              struct schedule_node *after)
{
        add_dep(state, *before, after, true);
        *before = after;
}

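/* Returns true if the instruction has a magic write to the TLB or TLBU
 * address, i.e. it accesses the tile buffer.
 */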
static bool
qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
{
        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        if (inst->alu.add.magic_write &&
            (inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
             inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
                return true;

        if (inst->alu.mul.magic_write &&
            (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
             inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
                return true;

        return false;
}

static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
                 enum v3d_qpu_mux mux)
{
        switch (mux) {
        case V3D_QPU_MUX_A:
                add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
                break;
        case V3D_QPU_MUX_B:
                add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n);
                break;
        default:
                add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
                break;
        }
}

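/* Tracks a write dependency for a destination: non-magic writes key off the
 * register file index, while magic waddrs update the TMU, accumulator, VPM,
 * or TLB trackers in the schedule state (SFU writes are covered by the r4
 * checks instead).
 */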
static void
process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t waddr, bool magic)
{
        if (!magic) {
                add_write_dep(state, &state->last_rf[waddr], n);
        } else if (v3d_qpu_magic_waddr_is_tmu(waddr)) {
                add_write_dep(state, &state->last_tmu_write, n);
        } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
                /* Handled by v3d_qpu_writes_r4() check. */
        } else {
                switch (waddr) {
                case V3D_QPU_WADDR_R0:
                case V3D_QPU_WADDR_R1:
                case V3D_QPU_WADDR_R2:
                        add_write_dep(state,
                                      &state->last_r[waddr - V3D_QPU_WADDR_R0],
                                      n);
                        break;
                case V3D_QPU_WADDR_R3:
                case V3D_QPU_WADDR_R4:
                case V3D_QPU_WADDR_R5:
                        /* Handled by v3d_qpu_writes_r*() checks below. */
                        break;

                case V3D_QPU_WADDR_VPM:
                case V3D_QPU_WADDR_VPMU:
                        add_write_dep(state, &state->last_vpm, n);
                        break;

                case V3D_QPU_WADDR_TLB:
                case V3D_QPU_WADDR_TLBU:
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case V3D_QPU_WADDR_NOP:
                        break;

                default:
                        fprintf(stderr, "Unknown waddr %d\n", waddr);
                        abort();
                }
        }
}

static void
process_cond_deps(struct schedule_state *state, struct schedule_node *n,
                  enum v3d_qpu_cond cond)
{
        if (cond != V3D_QPU_COND_NONE)
                add_read_dep(state, state->last_sf, n);
}

static void
process_pf_deps(struct schedule_state *state, struct schedule_node *n,
                enum v3d_qpu_pf pf)
{
        if (pf != V3D_QPU_PF_NONE)
                add_write_dep(state, &state->last_sf, n);
}

static void
process_uf_deps(struct schedule_state *state, struct schedule_node *n,
                enum v3d_qpu_uf uf)
{
        if (uf != V3D_QPU_UF_NONE)
                add_write_dep(state, &state->last_sf, n);
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
calculate_deps(struct schedule_state *state, struct schedule_node *n)
{
        const struct v3d_device_info *devinfo = state->devinfo;
        struct qinst *qinst = n->inst;
        struct v3d_qpu_instr *inst = &qinst->qpu;

        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS)
                        add_read_dep(state, state->last_sf, n);

                /* XXX: BDI */
                /* XXX: BDU */
                /* XXX: ub */
                /* XXX: raddr_a */

                add_write_dep(state, &state->last_unif, n);
                return;
        }

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        /* XXX: LOAD_IMM */

        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
                process_mux_deps(state, n, inst->alu.add.a);
        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
                process_mux_deps(state, n, inst->alu.add.b);

        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
                process_mux_deps(state, n, inst->alu.mul.a);
        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
                process_mux_deps(state, n, inst->alu.mul.b);

        switch (inst->alu.add.op) {
        case V3D_QPU_A_VPMSETUP:
                /* Could distinguish read/write by unpacking the uniform. */
                add_write_dep(state, &state->last_vpm, n);
                add_write_dep(state, &state->last_vpm_read, n);
                break;

        case V3D_QPU_A_STVPMV:
        case V3D_QPU_A_STVPMD:
        case V3D_QPU_A_STVPMP:
                add_write_dep(state, &state->last_vpm, n);
                break;

        case V3D_QPU_A_MSF:
                add_read_dep(state, state->last_tlb, n);
                break;

        case V3D_QPU_A_SETMSF:
        case V3D_QPU_A_SETREVF:
                add_write_dep(state, &state->last_tlb, n);
                break;

        case V3D_QPU_A_FLAPUSH:
        case V3D_QPU_A_FLBPUSH:
        case V3D_QPU_A_VFLA:
        case V3D_QPU_A_VFLNA:
        case V3D_QPU_A_VFLB:
        case V3D_QPU_A_VFLNB:
                add_read_dep(state, state->last_sf, n);
                break;

        case V3D_QPU_A_FLBPOP:
                add_write_dep(state, &state->last_sf, n);
                break;

        default:
                break;
        }

        switch (inst->alu.mul.op) {
        case V3D_QPU_M_MULTOP:
        case V3D_QPU_M_UMUL24:
                /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and
                 * resets it to 0.  We could possibly reorder umul24s relative
                 * to each other, but for now just keep all the MUL parts in
                 * order.
                 */
                add_write_dep(state, &state->last_rtop, n);
                break;
        default:
                break;
        }

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                process_waddr_deps(state, n, inst->alu.add.waddr,
                                   inst->alu.add.magic_write);
        }
        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                process_waddr_deps(state, n, inst->alu.mul.waddr,
                                   inst->alu.mul.magic_write);
        }
        if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) {
                process_waddr_deps(state, n, inst->sig_addr,
                                   inst->sig_magic);
        }

        if (v3d_qpu_writes_r3(devinfo, inst))
                add_write_dep(state, &state->last_r[3], n);
        if (v3d_qpu_writes_r4(devinfo, inst))
                add_write_dep(state, &state->last_r[4], n);
        if (v3d_qpu_writes_r5(devinfo, inst))
                add_write_dep(state, &state->last_r[5], n);

        if (inst->sig.thrsw) {
                /* All accumulator contents and flags are undefined after the
                 * switch.
                 */
                for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
                        add_write_dep(state, &state->last_r[i], n);
                add_write_dep(state, &state->last_sf, n);

                /* Scoreboard-locking operations have to stay after the last
                 * thread switch.
                 */
                add_write_dep(state, &state->last_tlb, n);

                add_write_dep(state, &state->last_tmu_write, n);
        }

        if (inst->sig.ldtmu) {
                /* TMU loads are coming from a FIFO, so ordering is important.
                 */
                add_write_dep(state, &state->last_tmu_write, n);
        }

        if (inst->sig.ldtlb | inst->sig.ldtlbu)
                add_read_dep(state, state->last_tlb, n);

        if (inst->sig.ldvpm)
                add_write_dep(state, &state->last_vpm_read, n);

        /* inst->sig.ldunif or sideband uniform read */
        if (qinst->uniform != ~0)
                add_write_dep(state, &state->last_unif, n);

        process_cond_deps(state, n, inst->flags.ac);
        process_cond_deps(state, n, inst->flags.mc);
        process_pf_deps(state, n, inst->flags.apf);
        process_pf_deps(state, n, inst->flags.mpf);
        process_uf_deps(state, n, inst->flags.auf);
        process_uf_deps(state, n, inst->flags.muf);
}

static void
calculate_forward_deps(struct v3d_compile *c, struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.devinfo = c->devinfo;
        state.dir = F;

        list_for_each_entry(struct schedule_node, node, schedule_list, link)
                calculate_deps(&state, node);
}

static void
calculate_reverse_deps(struct v3d_compile *c, struct list_head *schedule_list)
{
        struct list_head *node;
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.devinfo = c->devinfo;
        state.dir = R;

        for (node = schedule_list->prev; schedule_list != node; node = node->prev) {
                calculate_deps(&state, (struct schedule_node *)node);
        }
}

struct choose_scoreboard {
        int tick;
        int last_sfu_write_tick;
        int last_ldvary_tick;
        int last_uniforms_reset_tick;
        uint32_t last_waddr_add, last_waddr_mul;
        bool tlb_locked;
};

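/* Checks whether reading this mux source would break a latency rule: reads
 * from physical regfile A/B must not follow a write of the same register by
 * the previous instruction, and r4/r5 reads must leave enough ticks after
 * the SFU write or ldvary that produces them.
 */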
static bool
mux_reads_too_soon(struct choose_scoreboard *scoreboard,
                   const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
{
        switch (mux) {
        case V3D_QPU_MUX_A:
                if (scoreboard->last_waddr_add == inst->raddr_a ||
                    scoreboard->last_waddr_mul == inst->raddr_a) {
                        return true;
                }
                break;

        case V3D_QPU_MUX_B:
                if (scoreboard->last_waddr_add == inst->raddr_b ||
                    scoreboard->last_waddr_mul == inst->raddr_b) {
                        return true;
                }
                break;

        case V3D_QPU_MUX_R4:
                if (scoreboard->tick - scoreboard->last_sfu_write_tick <= 2)
                        return true;
                break;

        case V3D_QPU_MUX_R5:
                if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
                        return true;
                break;
        default:
                break;
        }

        return false;
}

static bool
reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
                           struct qinst *qinst)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* XXX: Branching off of raddr. */
        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                return false;

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
                        return true;
                }
                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
                        return true;
                }
        }

        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
                        return true;
                }
                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
                        return true;
                }
        }

        /* XXX: imm */

        return false;
}

static bool
writes_too_soon_after_write(const struct v3d_device_info *devinfo,
                            struct choose_scoreboard *scoreboard,
                            struct qinst *qinst)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* Don't schedule any other r4 write too soon after an SFU write.
         * This would normally be prevented by dependency tracking, but might
         * occur if a dead SFU computation makes it to scheduling.
         */
        if (scoreboard->tick - scoreboard->last_sfu_write_tick < 2 &&
            v3d_qpu_writes_r4(devinfo, inst))
                return true;

        return false;
}

static bool
pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard,
                          const struct v3d_qpu_instr *inst)
{
        return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst));
}

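/* Returns a scheduling priority for the instruction: TLB accesses and TMU
 * result loads score lowest (schedule late), TMU setup writes score highest
 * (schedule early to hide latency), and everything else gets the baseline
 * score in between.
 */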
static int
get_instruction_priority(const struct v3d_qpu_instr *inst)
{
        uint32_t baseline_score;
        uint32_t next_score = 0;

        /* Schedule TLB operations as late as possible, to get more
         * parallelism between shaders.
         */
        if (qpu_inst_is_tlb(inst))
                return next_score;
        next_score++;

        /* Schedule texture read results collection late to hide latency. */
        if (inst->sig.ldtmu)
                return next_score;
        next_score++;

        /* Default score for things that aren't otherwise special. */
        baseline_score = next_score;
        next_score++;

        /* Schedule texture read setup early to hide their latency better. */
        if (inst->type == V3D_QPU_INSTR_TYPE_ALU &&
            ((inst->alu.add.magic_write &&
              v3d_qpu_magic_waddr_is_tmu(inst->alu.add.waddr)) ||
             (inst->alu.mul.magic_write &&
              v3d_qpu_magic_waddr_is_tmu(inst->alu.mul.waddr)))) {
                return next_score;
        }
        next_score++;

        return baseline_score;
}

static bool
qpu_magic_waddr_is_periph(enum v3d_qpu_waddr waddr)
{
        return (v3d_qpu_magic_waddr_is_tmu(waddr) ||
                v3d_qpu_magic_waddr_is_sfu(waddr) ||
                v3d_qpu_magic_waddr_is_tlb(waddr) ||
                v3d_qpu_magic_waddr_is_vpm(waddr) ||
                v3d_qpu_magic_waddr_is_tsy(waddr));
}

static bool
qpu_accesses_peripheral(const struct v3d_qpu_instr *inst)
{
        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
                if (inst->alu.add.op != V3D_QPU_A_NOP &&
                    inst->alu.add.magic_write &&
                    qpu_magic_waddr_is_periph(inst->alu.add.waddr)) {
                        return true;
                }

                if (inst->alu.add.op == V3D_QPU_A_VPMSETUP)
                        return true;

                if (inst->alu.mul.op != V3D_QPU_M_NOP &&
                    inst->alu.mul.magic_write &&
                    qpu_magic_waddr_is_periph(inst->alu.mul.waddr)) {
                        return true;
                }
        }

        return (inst->sig.ldvpm ||
                inst->sig.ldtmu ||
                inst->sig.ldtlb ||
                inst->sig.ldtlbu ||
                inst->sig.wrtmuc);
}

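/* Tries to merge instructions "a" and "b" into a single QPU instruction in
 * *result: one may contribute the ADD op and the other the MUL op, raddrs
 * must agree, signal bits are ORed together, and the merge only succeeds if
 * the combined instruction still packs.
 */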
static bool
qpu_merge_inst(const struct v3d_device_info *devinfo,
               struct v3d_qpu_instr *result,
               const struct v3d_qpu_instr *a,
               const struct v3d_qpu_instr *b)
{
        if (a->type != V3D_QPU_INSTR_TYPE_ALU ||
            b->type != V3D_QPU_INSTR_TYPE_ALU) {
                return false;
        }

        /* Can't do more than one peripheral access in an instruction.
         *
         * XXX: V3D 4.1 allows TMU read along with a VPM read or write, and
         * WRTMUC with a TMU magic register write (other than tmuc).
         */
        if (qpu_accesses_peripheral(a) && qpu_accesses_peripheral(b))
                return false;

        struct v3d_qpu_instr merge = *a;

        if (b->alu.add.op != V3D_QPU_A_NOP) {
                if (a->alu.add.op != V3D_QPU_A_NOP)
                        return false;
                merge.alu.add = b->alu.add;

                merge.flags.ac = b->flags.ac;
                merge.flags.apf = b->flags.apf;
                merge.flags.auf = b->flags.auf;
        }

        if (b->alu.mul.op != V3D_QPU_M_NOP) {
                if (a->alu.mul.op != V3D_QPU_M_NOP)
                        return false;
                merge.alu.mul = b->alu.mul;

                merge.flags.mc = b->flags.mc;
                merge.flags.mpf = b->flags.mpf;
                merge.flags.muf = b->flags.muf;
        }

        if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) {
                if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A) &&
                    a->raddr_a != b->raddr_a) {
                        return false;
                }
                merge.raddr_a = b->raddr_a;
        }

        if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) {
                if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_B) &&
                    a->raddr_b != b->raddr_b) {
                        return false;
                }
                merge.raddr_b = b->raddr_b;
        }

        merge.sig.thrsw |= b->sig.thrsw;
        merge.sig.ldunif |= b->sig.ldunif;
        merge.sig.ldunifrf |= b->sig.ldunifrf;
        merge.sig.ldunifa |= b->sig.ldunifa;
        merge.sig.ldunifarf |= b->sig.ldunifarf;
        merge.sig.ldtmu |= b->sig.ldtmu;
        merge.sig.ldvary |= b->sig.ldvary;
        merge.sig.ldvpm |= b->sig.ldvpm;
        merge.sig.small_imm |= b->sig.small_imm;
        merge.sig.ldtlb |= b->sig.ldtlb;
        merge.sig.ldtlbu |= b->sig.ldtlbu;
        merge.sig.ucb |= b->sig.ucb;
        merge.sig.rotate |= b->sig.rotate;
        merge.sig.wrtmuc |= b->sig.wrtmuc;

        if (v3d_qpu_sig_writes_address(devinfo, &a->sig) &&
            v3d_qpu_sig_writes_address(devinfo, &b->sig))
                return false;
        merge.sig_addr |= b->sig_addr;
        merge.sig_magic |= b->sig_magic;

        uint64_t packed;
        bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed);

        *result = merge;
        /* No modifying the real instructions on failure. */
        assert(ok || (a != result && b != result));

        return ok;
}

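/* Picks the next DAG head to schedule, skipping candidates that would break
 * a hardware rule (register read/write latency, early scoreboard wait) and,
 * when pairing with prev_inst, candidates that can't be merged with it.
 * Among valid candidates, the highest priority and then the longest
 * critical-path delay wins.
 */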
static struct schedule_node *
choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
                               struct choose_scoreboard *scoreboard,
                               struct list_head *schedule_list,
                               struct schedule_node *prev_inst)
{
        struct schedule_node *chosen = NULL;
        int chosen_prio = 0;

        /* Don't pair up anything with a thread switch signal -- emit_thrsw()
         * will handle pairing it along with filling the delay slots.
         */
        if (prev_inst) {
                if (prev_inst->inst->qpu.sig.thrsw)
                        return NULL;
        }

        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
                const struct v3d_qpu_instr *inst = &n->inst->qpu;

                /* Don't choose the branch instruction until it's the last one
                 * left.  We'll move it up to fit its delay slots after we
                 * choose it.
                 */
                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
                    !list_is_singular(schedule_list)) {
                        continue;
                }

                /* "An instruction must not read from a location in physical
                 *  regfile A or B that was written to by the previous
                 *  instruction."
                 */
                if (reads_too_soon_after_write(scoreboard, n->inst))
                        continue;

                if (writes_too_soon_after_write(devinfo, scoreboard, n->inst))
                        continue;

                /* "A scoreboard wait must not occur in the first two
                 *  instructions of a fragment shader.  This is either the
                 *  explicit Wait for Scoreboard signal or an implicit wait
                 *  with the first tile-buffer read or write instruction."
                 */
                if (pixel_scoreboard_too_soon(scoreboard, inst))
                        continue;

                /* ldunif and ldvary both write r5, but ldunif does so a tick
                 * sooner.  If the ldvary's r5 wasn't used, then ldunif might
                 * otherwise get scheduled so ldunif and ldvary try to update
                 * r5 in the same tick.
                 */
                if ((inst->sig.ldunif || inst->sig.ldunifa) &&
                    scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
                        continue;
                }

                /* If we're trying to pair with another instruction, check
                 * that they're compatible.
                 */
                if (prev_inst) {
                        /* Don't pair up a thread switch signal -- we'll
                         * handle pairing it when we pick it on its own.
                         */
                        if (inst->sig.thrsw)
                                continue;

                        if (prev_inst->inst->uniform != -1 &&
                            n->inst->uniform != -1)
                                continue;

                        /* Don't merge in something that will lock the TLB.
                         * Hopefully what we have in inst will release some
                         * other instructions, allowing us to delay the
                         * TLB-locking instruction until later.
                         */
                        if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
                                continue;

                        struct v3d_qpu_instr merged_inst;
                        if (!qpu_merge_inst(devinfo, &merged_inst,
                                            &prev_inst->inst->qpu, inst)) {
                                continue;
                        }
                }

                int prio = get_instruction_priority(inst);

                /* Found a valid instruction.  If nothing better comes along,
                 * this one works.
                 */
                if (!chosen) {
                        chosen = n;
                        chosen_prio = prio;
                        continue;
                }

                if (prio > chosen_prio) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (prio < chosen_prio) {
                        continue;
                }

                if (n->delay > chosen->delay) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        return chosen;
}

static void
update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
                                  enum v3d_qpu_waddr waddr)
{
        if (v3d_qpu_magic_waddr_is_sfu(waddr))
                scoreboard->last_sfu_write_tick = scoreboard->tick;
}

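/* Records the hazards introduced by the instruction just scheduled: the
 * physical waddrs written by the add/mul ops, SFU writes, the ldvary tick,
 * and whether the TLB is now locked.
 */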
static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
                             const struct v3d_qpu_instr *inst)
{
        scoreboard->last_waddr_add = ~0;
        scoreboard->last_waddr_mul = ~0;

        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                return;

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                if (inst->alu.add.magic_write) {
                        update_scoreboard_for_magic_waddr(scoreboard,
                                                          inst->alu.add.waddr);
                } else {
                        scoreboard->last_waddr_add = inst->alu.add.waddr;
                }
        }

        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                if (inst->alu.mul.magic_write) {
                        update_scoreboard_for_magic_waddr(scoreboard,
                                                          inst->alu.mul.waddr);
                } else {
                        scoreboard->last_waddr_mul = inst->alu.mul.waddr;
                }
        }

        if (inst->sig.ldvary)
                scoreboard->last_ldvary_tick = scoreboard->tick;

        if (qpu_inst_is_tlb(inst))
                scoreboard->tlb_locked = true;
}

static void
dump_state(const struct v3d_device_info *devinfo,
           struct list_head *schedule_list)
{
        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
                fprintf(stderr, " t=%4d: ", n->unblocked_time);
                v3d_qpu_dump(devinfo, &n->inst->qpu);
                fprintf(stderr, "\n");

                for (int i = 0; i < n->child_count; i++) {
                        struct schedule_node *child = n->children[i].node;
                        if (!child)
                                continue;

                        fprintf(stderr, " - ");
                        v3d_qpu_dump(devinfo, &child->inst->qpu);
                        fprintf(stderr, " (%d parents, %c)\n",
                                child->parent_count,
                                n->children[i].write_after_read ? 'w' : 'r');
                }
        }
}

static uint32_t magic_waddr_latency(enum v3d_qpu_waddr waddr,
                                    const struct v3d_qpu_instr *after)
{
        /* Apply some huge latency between texture fetch requests and getting
         * their results back.
         *
         * FIXME: This is actually pretty bogus.  If we do:
         *
         * mov tmu0_s, a
         * <a bit of math>
         * mov tmu0_s, b
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * we count that as worse than
         *
         * mov tmu0_s, a
         * mov tmu0_s, b
         * <lots of math>
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * because we associate the first load_tmu0 with the *second* tmu0_s.
         */
        if (v3d_qpu_magic_waddr_is_tmu(waddr) && after->sig.ldtmu)
                return 100;

        /* Assume that anything depending on us is consuming the SFU result. */
        if (v3d_qpu_magic_waddr_is_sfu(waddr))
                return 3;

        return 1;
}

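/* Returns the number of cycles to charge on the DAG edge from "before" to
 * "after", mostly so that TMU and SFU results are not expected right away.
 */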
static uint32_t
instruction_latency(struct schedule_node *before, struct schedule_node *after)
{
        const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
        const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
        uint32_t latency = 1;

        if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU ||
            after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return latency;

        if (before_inst->alu.add.magic_write) {
                latency = MAX2(latency,
                               magic_waddr_latency(before_inst->alu.add.waddr,
                                                   after_inst));
        }

        if (before_inst->alu.mul.magic_write) {
                latency = MAX2(latency,
                               magic_waddr_latency(before_inst->alu.mul.waddr,
                                                   after_inst));
        }

        return latency;
}

/** Recursive computation of the delay member of a node. */
static void
compute_delay(struct schedule_node *n)
{
        if (!n->child_count) {
                n->delay = 1;
        } else {
                for (int i = 0; i < n->child_count; i++) {
                        if (!n->children[i].node->delay)
                                compute_delay(n->children[i].node);
                        n->delay = MAX2(n->delay,
                                        n->children[i].node->delay +
                                        instruction_latency(n, n->children[i].node));
                }
        }
}

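/* Removes the scheduled node's outgoing edges, updating each child's
 * unblocked time and adding newly unblocked children to the ready list.
 * With war_only set, only write-after-read edges are released, which lets a
 * WAR-dependent instruction be paired with (or immediately follow) the node.
 */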
static void
mark_instruction_scheduled(struct list_head *schedule_list,
                           uint32_t time,
                           struct schedule_node *node,
                           bool war_only)
{
        if (!node)
                return;

        for (int i = node->child_count - 1; i >= 0; i--) {
                struct schedule_node *child =
                        node->children[i].node;

                if (!child)
                        continue;

                if (war_only && !node->children[i].write_after_read)
                        continue;

                /* If the requirement is only that the node not appear before
                 * the last read of its destination, then it can be scheduled
                 * immediately after (or paired with!) the thing reading the
                 * destination.
                 */
                uint32_t latency = 0;
                if (!war_only) {
                        latency = instruction_latency(node,
                                                      node->children[i].node);
                }

                child->unblocked_time = MAX2(child->unblocked_time,
                                             time + latency);
                child->parent_count--;
                if (child->parent_count == 0)
                        list_add(&child->link, schedule_list);

                node->children[i].node = NULL;
        }
}

static struct qinst *
vir_nop()
{
        struct qreg undef = { QFILE_NULL, 0 };
        struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);

        return qinst;
}

#if 0
static struct qinst *
nop_after(struct qinst *inst)
{
        struct qinst *q = vir_nop();

        list_add(&q->link, &inst->link);

        return q;
}

/**
 * Emits a THRSW/LTHRSW signal in the stream, trying to move it up to pair
 * with another instruction.
 */
static void
emit_thrsw(struct v3d_compile *c,
           struct choose_scoreboard *scoreboard,
           const struct v3d_qpu_instr *inst)
{
        /* There should be nothing in a thrsw inst being scheduled other than
         * the signal bits.
         */
        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
        assert(inst->alu.add.op == V3D_QPU_A_NOP);
        assert(inst->alu.mul.op == V3D_QPU_M_NOP);

        /* Try to find an earlier scheduled instruction that we can merge the
         * thrsw into.
         */
        int thrsw_ip = c->qpu_inst_count;
        for (int i = 1; i <= MIN2(c->qpu_inst_count, 3); i++) {
                uint64_t prev_instr = c->qpu_insts[c->qpu_inst_count - i];
                uint32_t prev_sig = QPU_GET_FIELD(prev_instr, QPU_SIG);

                if (prev_sig == QPU_SIG_NONE)
                        thrsw_ip = c->qpu_inst_count - i;
        }

        if (thrsw_ip != c->qpu_inst_count) {
                /* Merge the thrsw into the existing instruction. */
                c->qpu_insts[thrsw_ip] =
                        QPU_UPDATE_FIELD(c->qpu_insts[thrsw_ip], sig, QPU_SIG);
        } else {
                qpu_serialize_one_inst(c, inst);
                update_scoreboard_for_chosen(scoreboard, inst);
        }

        /* Fill the delay slots. */
        while (c->qpu_inst_count < thrsw_ip + 3) {
                update_scoreboard_for_chosen(scoreboard, v3d_qpu_nop());
                qpu_serialize_one_inst(c, v3d_qpu_nop());
        }
}
#endif

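/* Main scheduling loop for one block: repeatedly choose a ready instruction
 * (or a NOP if none is valid yet), try to pair a second instruction with it,
 * re-emit its uniform in the new order, and append it to the block, padding
 * branch/thrsw delay slots with NOPs.  Returns the estimated cycle count.
 */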
static uint32_t
schedule_instructions(struct v3d_compile *c,
                      struct choose_scoreboard *scoreboard,
                      struct qblock *block,
                      struct list_head *schedule_list,
                      enum quniform_contents *orig_uniform_contents,
                      uint32_t *orig_uniform_data,
                      uint32_t *next_uniform)
{
        const struct v3d_device_info *devinfo = c->devinfo;
        uint32_t time = 0;

        if (debug) {
                fprintf(stderr, "initial deps:\n");
                dump_state(devinfo, schedule_list);
                fprintf(stderr, "\n");
        }

        /* Remove non-DAG heads from the list. */
        list_for_each_entry_safe(struct schedule_node, n, schedule_list, link) {
                if (n->parent_count != 0)
                        list_del(&n->link);
        }

        while (!list_empty(schedule_list)) {
                struct schedule_node *chosen =
                        choose_instruction_to_schedule(devinfo,
                                                       scoreboard,
                                                       schedule_list,
                                                       NULL);
                struct schedule_node *merge = NULL;

                /* If there are no valid instructions to schedule, drop a NOP
                 * in.
                 */
                struct qinst *qinst = chosen ? chosen->inst : vir_nop();
                struct v3d_qpu_instr *inst = &qinst->qpu;

                if (debug) {
                        fprintf(stderr, "t=%4d: current list:\n",
                                time);
                        dump_state(devinfo, schedule_list);
                        fprintf(stderr, "t=%4d: chose: ", time);
                        v3d_qpu_dump(devinfo, inst);
                        fprintf(stderr, "\n");
                }

                /* Schedule this instruction onto the QPU list. Also try to
                 * find an instruction to pair with it.
                 */
                if (chosen) {
                        time = MAX2(chosen->unblocked_time, time);
                        list_del(&chosen->link);
                        mark_instruction_scheduled(schedule_list, time,
                                                   chosen, true);

                        merge = choose_instruction_to_schedule(devinfo,
                                                               scoreboard,
                                                               schedule_list,
                                                               chosen);
                        if (merge) {
                                time = MAX2(merge->unblocked_time, time);
                                list_del(&merge->link);
                                (void)qpu_merge_inst(devinfo, inst,
                                                     inst, &merge->inst->qpu);
                                if (merge->inst->uniform != -1) {
                                        chosen->inst->uniform =
                                                merge->inst->uniform;
                                }

                                if (debug) {
                                        fprintf(stderr, "t=%4d: merging: ",
                                                time);
                                        v3d_qpu_dump(devinfo, &merge->inst->qpu);
                                        fprintf(stderr, "\n");
                                        fprintf(stderr, " result: ");
                                        v3d_qpu_dump(devinfo, inst);
                                        fprintf(stderr, "\n");
                                }
                        }
                }

                /* Update the uniform index for the rewritten location --
                 * branch target updating will still need to change
                 * c->uniform_data[] using this index.
                 */
                if (qinst->uniform != -1) {
                        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                                block->branch_uniform = *next_uniform;

                        c->uniform_data[*next_uniform] =
                                orig_uniform_data[qinst->uniform];
                        c->uniform_contents[*next_uniform] =
                                orig_uniform_contents[qinst->uniform];
                        qinst->uniform = *next_uniform;
                        (*next_uniform)++;
                }

                if (debug) {
                        fprintf(stderr, "\n");
                }

                /* Now that we've scheduled a new instruction, some of its
                 * children can be promoted to the list of instructions ready to
                 * be scheduled.  Update the children's unblocked time for this
                 * DAG edge as we do so.
                 */
                mark_instruction_scheduled(schedule_list, time, chosen, false);

                if (merge) {
                        mark_instruction_scheduled(schedule_list, time, merge,
                                                   false);

                        /* The merged VIR instruction doesn't get re-added to the
                         * block, so free it now.
                         */
                        free(merge->inst);
                }

                if (0 && inst->sig.thrsw) {
                        /* XXX emit_thrsw(c, scoreboard, qinst); */
                } else {
                        c->qpu_inst_count++;
                        list_addtail(&qinst->link, &block->instructions);
                        update_scoreboard_for_chosen(scoreboard, inst);
                }

                scoreboard->tick++;
                time++;

                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH ||
                    inst->sig.thrsw /* XXX */) {
                        block->branch_qpu_ip = c->qpu_inst_count - 1;
                        /* Fill the delay slots.
                         *
                         * We should fill these with actual instructions,
                         * instead, but that will probably need to be done
                         * after this, once we know what the leading
                         * instructions of the successors are (so we can
                         * handle A/B register file write latency)
                         */
                        /* XXX: scoreboard */
                        int slots = (inst->type == V3D_QPU_INSTR_TYPE_BRANCH ?
                                     3 : 2);
                        for (int i = 0; i < slots; i++) {
                                struct qinst *nop = vir_nop();
                                list_addtail(&nop->link, &block->instructions);

                                update_scoreboard_for_chosen(scoreboard,
                                                             &nop->qpu);
                                c->qpu_inst_count++;
                                scoreboard->tick++;
                                time++;
                        }
                }
        }

        return time;
}

static uint32_t
qpu_schedule_instructions_block(struct v3d_compile *c,
                                struct choose_scoreboard *scoreboard,
                                struct qblock *block,
                                enum quniform_contents *orig_uniform_contents,
                                uint32_t *orig_uniform_data,
                                uint32_t *next_uniform)
{
        void *mem_ctx = ralloc_context(NULL);
        struct list_head schedule_list;

        list_inithead(&schedule_list);

        /* Wrap each instruction in a scheduler structure. */
        while (!list_empty(&block->instructions)) {
                struct qinst *qinst = (struct qinst *)block->instructions.next;
                struct schedule_node *n =
                        rzalloc(mem_ctx, struct schedule_node);

                n->inst = qinst;

                list_del(&qinst->link);
                list_addtail(&n->link, &schedule_list);
        }

        calculate_forward_deps(c, &schedule_list);
        calculate_reverse_deps(c, &schedule_list);

        list_for_each_entry(struct schedule_node, n, &schedule_list, link) {
                compute_delay(n);
        }

        uint32_t cycles = schedule_instructions(c, scoreboard, block,
                                                &schedule_list,
                                                orig_uniform_contents,
                                                orig_uniform_data,
                                                next_uniform);

        ralloc_free(mem_ctx);

        return cycles;
}

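/* Now that all blocks have been laid out, patch each branch instruction's
 * QPU offset and the relative uniform-stream offset for its target block.
 */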
static void
qpu_set_branch_targets(struct v3d_compile *c)
{
        vir_for_each_block(block, c) {
                /* The end block of the program has no branch. */
                if (!block->successors[0])
                        continue;

                /* If there was no branch instruction, then the successor
                 * block must follow immediately after this one.
                 */
                if (block->branch_qpu_ip == ~0) {
                        assert(block->end_qpu_ip + 1 ==
                               block->successors[0]->start_qpu_ip);
                        continue;
                }

                /* Walk back through the delay slots to find the branch
                 * instr.
                 */
                struct list_head *entry = block->instructions.prev;
                for (int i = 0; i < 3; i++)
                        entry = entry->prev;
                struct qinst *branch = container_of(entry, branch, link);
                assert(branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);

                /* Make sure that the if-we-don't-jump
                 * successor was scheduled just after the
                 * delay slots.
                 */
                assert(!block->successors[1] ||
                       block->successors[1]->start_qpu_ip ==
                       block->branch_qpu_ip + 4);

                branch->qpu.branch.offset =
                        ((block->successors[0]->start_qpu_ip -
                          (block->branch_qpu_ip + 4)) *
                         sizeof(uint64_t));

                /* Set up the relative offset to jump in the
                 * uniform stream.
                 *
                 * Use a temporary here, because
                 * uniform_data[inst->uniform] may be shared
                 * between multiple instructions.
                 */
                assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
                c->uniform_data[branch->uniform] =
                        (block->successors[0]->start_uniform -
                         (block->branch_uniform + 1)) * 4;
        }
}

uint32_t
v3d_qpu_schedule_instructions(struct v3d_compile *c)
{
        const struct v3d_device_info *devinfo = c->devinfo;

        /* We reorder the uniforms as we schedule instructions, so save the
         * old data off and replace it.
         */
        uint32_t *uniform_data = c->uniform_data;
        enum quniform_contents *uniform_contents = c->uniform_contents;
        c->uniform_contents = ralloc_array(c, enum quniform_contents,
                                           c->num_uniforms);
        c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
        c->uniform_array_size = c->num_uniforms;
        uint32_t next_uniform = 0;

        struct choose_scoreboard scoreboard;
        memset(&scoreboard, 0, sizeof(scoreboard));
        scoreboard.last_waddr_add = ~0;
        scoreboard.last_waddr_mul = ~0;
        scoreboard.last_ldvary_tick = -10;
        scoreboard.last_sfu_write_tick = -10;
        scoreboard.last_uniforms_reset_tick = -10;

        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
                vir_for_each_block(block, c) {
                        fprintf(stderr, "BLOCK %d\n", block->index);
                        list_for_each_entry(struct qinst, qinst,
                                            &block->instructions, link) {
                                v3d_qpu_dump(devinfo, &qinst->qpu);
                                fprintf(stderr, "\n");
                        }
                }
                fprintf(stderr, "\n");
        }

        uint32_t cycles = 0;
        vir_for_each_block(block, c) {
                block->start_qpu_ip = c->qpu_inst_count;
                block->branch_qpu_ip = ~0;
                block->start_uniform = next_uniform;

                cycles += qpu_schedule_instructions_block(c,
                                                          &scoreboard,
                                                          block,
                                                          uniform_contents,
                                                          uniform_data,
                                                          &next_uniform);

                block->end_qpu_ip = c->qpu_inst_count - 1;
        }

        qpu_set_branch_targets(c);

        assert(next_uniform == c->num_uniforms);

        return cycles;
}