vc4: Add debug output to match shaderdb info to program dumps.
[mesa.git] src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"

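/* Prints each QPU instruction of the program as raw hex alongside its
 * disassembly.  The stage/program/variant header carries the same
 * identifiers used in the shader-db debug output, so a dump can be matched
 * to its shaderdb stats line (this is the debug output the commit adds).
 */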
static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

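/* Generated instructions are buffered on a list rather than emitted
 * directly, so that serialize_insts() can insert the NOPs and signal
 * updates that the hardware's scheduling rules require between neighboring
 * instructions.
 */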
struct queued_qpu_inst {
        struct simple_node link;
        uint64_t inst;
};

static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = calloc(1, sizeof(*q));
        q->inst = inst;
        insert_at_tail(&c->qpu_inst_list, &q->link);
}

static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
        return &q->inst;
}

static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
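/* Uniforms (QPU_R_UNIF) and varyings (QPU_R_VARY) appear at the same read
 * address in both regfile A and regfile B, so a read of one of them can be
 * moved to whichever file's raddr field is free.
 */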
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_A)
                        src->mux = QPU_MUX_B;
                else
                        src->mux = QPU_MUX_A;
                return true;

        default:
                return false;
        }
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead.
 */
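/* For example, if both sources were allocated to regfile A, a single
 * instruction can only name one A-file address (raddr_a), so one operand is
 * first copied into accumulator r3 and read from there.
 */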
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg *src0, struct qpu_reg *src1)
{
        if ((src0->mux != QPU_MUX_A && src0->mux != QPU_MUX_B) ||
            src0->mux != src1->mux ||
            src0->addr == src1->addr) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

        queue(c, qpu_a_MOV(qpu_r3(), *src1));
        *src1 = qpu_r3();
}

static void
serialize_one_inst(struct vc4_compile *c, uint64_t inst)
{
        if (c->qpu_inst_count >= c->qpu_inst_size) {
                c->qpu_inst_size = MAX2(16, c->qpu_inst_size * 2);
                c->qpu_insts = realloc(c->qpu_insts,
                                       c->qpu_inst_size * sizeof(uint64_t));
        }
        c->qpu_insts[c->qpu_inst_count++] = inst;
}

static void
serialize_insts(struct vc4_compile *c)
{
        int last_sfu_write = -10;
        bool scoreboard_wait_emitted = false;

        while (!is_empty_list(&c->qpu_inst_list)) {
                struct queued_qpu_inst *q =
                        (struct queued_qpu_inst *)first_elem(&c->qpu_inst_list);
                uint32_t last_waddr_a = QPU_W_NOP, last_waddr_b = QPU_W_NOP;
                uint32_t raddr_a = QPU_GET_FIELD(q->inst, QPU_RADDR_A);
                uint32_t raddr_b = QPU_GET_FIELD(q->inst, QPU_RADDR_B);

                if (c->qpu_inst_count > 0) {
                        uint64_t last_inst = c->qpu_insts[c->qpu_inst_count -
                                                          1];
                        uint32_t last_waddr_add = QPU_GET_FIELD(last_inst,
                                                                QPU_WADDR_ADD);
                        uint32_t last_waddr_mul = QPU_GET_FIELD(last_inst,
                                                                QPU_WADDR_MUL);

                        if (last_inst & QPU_WS) {
                                last_waddr_a = last_waddr_mul;
                                last_waddr_b = last_waddr_add;
                        } else {
                                last_waddr_a = last_waddr_add;
                                last_waddr_b = last_waddr_mul;
                        }
                }

                uint32_t src_muxes[] = {
                        QPU_GET_FIELD(q->inst, QPU_ADD_A),
                        QPU_GET_FIELD(q->inst, QPU_ADD_B),
                        QPU_GET_FIELD(q->inst, QPU_MUL_A),
                        QPU_GET_FIELD(q->inst, QPU_MUL_B),
                };

                /* "An instruction must not read from a location in physical
                 * regfile A or B that was written to by the previous
                 * instruction."
                 */
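                /* For example, "mov ra1, x" followed by "add y, ra1, z"
                 * needs a NOP between the two.  Only addresses below 32 are
                 * the physical regfiles; higher addresses are IO registers
                 * that the rule doesn't cover.
                 */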
                bool needs_raddr_vs_waddr_nop = false;
                bool reads_r4 = false;
                for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
                        if ((raddr_a < 32 &&
                             src_muxes[i] == QPU_MUX_A &&
                             last_waddr_a == raddr_a) ||
                            (raddr_b < 32 &&
                             src_muxes[i] == QPU_MUX_B &&
                             last_waddr_b == raddr_b)) {
                                needs_raddr_vs_waddr_nop = true;
                        }
                        if (src_muxes[i] == QPU_MUX_R4)
                                reads_r4 = true;
                }

                if (needs_raddr_vs_waddr_nop) {
                        serialize_one_inst(c, qpu_NOP());
                }

                /* "After an SFU lookup instruction, accumulator r4 must not
                 * be read in the following two instructions. Any other
                 * instruction that results in r4 being written (that is, TMU
                 * read, TLB read, SFU lookup) cannot occur in the two
                 * instructions following an SFU lookup."
                 */
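                /* Pad with NOPs until at least two full instructions
                 * separate the SFU write from this r4 read.  last_sfu_write
                 * starts at -10 so that r4 reads before any SFU lookup need
                 * no padding.
                 */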
                if (reads_r4) {
                        while (c->qpu_inst_count - last_sfu_write < 3) {
                                serialize_one_inst(c, qpu_NOP());
                        }
                }

                uint32_t waddr_a = QPU_GET_FIELD(q->inst, QPU_WADDR_ADD);
                uint32_t waddr_m = QPU_GET_FIELD(q->inst, QPU_WADDR_MUL);
                if ((waddr_a >= QPU_W_SFU_RECIP && waddr_a <= QPU_W_SFU_LOG) ||
                    (waddr_m >= QPU_W_SFU_RECIP && waddr_m <= QPU_W_SFU_LOG)) {
                        last_sfu_write = c->qpu_inst_count;
                }

                /* "A scoreboard wait must not occur in the first two
                 * instructions of a fragment shader. This is either the
                 * explicit Wait for Scoreboard signal or an implicit wait
                 * with the first tile-buffer read or write instruction."
                 */
                if (!scoreboard_wait_emitted &&
                    (waddr_a == QPU_W_TLB_Z || waddr_m == QPU_W_TLB_Z ||
                     waddr_a == QPU_W_TLB_COLOR_MS ||
                     waddr_m == QPU_W_TLB_COLOR_MS ||
                     waddr_a == QPU_W_TLB_COLOR_ALL ||
                     waddr_m == QPU_W_TLB_COLOR_ALL ||
                     QPU_GET_FIELD(q->inst, QPU_SIG) == QPU_SIG_COLOR_LOAD)) {
                        while (c->qpu_inst_count < 3 ||
                               QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                                             QPU_SIG) != QPU_SIG_NONE) {
                                serialize_one_inst(c, qpu_NOP());
                        }
                        c->qpu_insts[c->qpu_inst_count - 1] =
                                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                            QPU_SIG_WAIT_FOR_SCOREBOARD);
                        scoreboard_wait_emitted = true;
                }

                serialize_one_inst(c, q->inst);

                remove_from_list(&q->link);
                free(q);
        }
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;

        make_empty_list(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
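                /* Assuming the VPM generic-block read-setup layout from the
                 * VideoCore IV reference guide, 0x00001a00 appears to select
                 * horizontal, 32-bit, stride-1 reads, with the element count
                 * OR'd into NUM (bits 23:20) and the start address into ADDR
                 * (bits 7:0) below; the same constant configures the write
                 * setup.
                 */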
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        struct simple_node *node;
        foreach(node, &c->instructions) {
                struct qinst *qinst = (struct qinst *)node;

#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                static const struct {
                        uint32_t op;
                        bool is_mul;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name, false}
#define M(name) [QOP_##name] = {QPU_M_##name, true}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(MUL24),
                };

                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op. */
                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                        }
                        break;

                case QOP_SF:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
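                        /* Select src[0] or 0: write src[0] under the
                         * requested condition, then write 0 (XOR of r0 with
                         * itself) under the inverted one.  The QPU_COND_*
                         * codes come in set/clear pairs, so XOR with 1 flips
                         * ZS<->ZC and NS<->NC.
                         */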
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_VPM_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_VPM), src[0]));
                        break;

                case QOP_VPM_READ:
                        queue(c, qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)));
                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

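                        /* Writing an SFU register kicks off the lookup; the
                         * result appears in accumulator r4, and
                         * serialize_insts() inserts the required
                         * two-instruction delay before this read.
                         */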
                        queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_PACK_COLORS:
                        for (int i = 0; i < 4; i++) {
                                queue(c, qpu_m_MOV(qpu_r3(), src[i]));
                                *last_inst(c) |= QPU_PM;
                                *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
                                                               QPU_PACK);
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r3()));

                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_DISCARD_SETUP:
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

                case QOP_PACK_SCALED: {
                        uint64_t a = (qpu_a_MOV(dst, src[0]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16A,
                                                    QPU_PACK));
                        uint64_t b = (qpu_a_MOV(dst, src[1]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16B,
                                                    QPU_PACK));

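                        /* If dst aliases src[1], emit the 16B half first so
                         * that packing src[0] into dst doesn't clobber
                         * src[1] before it's read.
                         */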
                        if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
                                queue(c, b);
                                queue(c, a);
                        } else {
                                queue(c, a);
                                queue(c, b);
                        }
                        break;
                }

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);

                        break;

                case QOP_R4_UNPACK_A:
                case QOP_R4_UNPACK_B:
                case QOP_R4_UNPACK_C:
                case QOP_R4_UNPACK_D:
                        assert(src[0].mux == QPU_MUX_R4);
                        queue(c, qpu_a_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_R4_UNPACK_A),
                                                       QPU_UNPACK);

                        break;

                case QOP_UNPACK_8A:
                case QOP_UNPACK_8B:
                case QOP_UNPACK_8C:
                case QOP_UNPACK_8D: {
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're setting the pack bits, if the
                         * destination is in A it would get re-packed.
                         */
                        struct qpu_reg orig_dst = dst;
                        if (orig_dst.mux == QPU_MUX_A)
                                dst = qpu_rn(3);

                        queue(c, qpu_a_FMAX(dst, src[0], src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_UNPACK_8A),
                                                       QPU_UNPACK);

                        if (orig_dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(orig_dst, dst));
                        }
                        break;
                }

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, &src[0], &src[1]);

                        if (translate[qinst->op].is_mul) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        }
                        break;
                }
        }

        serialize_insts(c);

        /* thread end can't have VPM write */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM) {
                serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                serialize_one_inst(c, qpu_NOP());
        }

        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
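        /* The PROG_END signal has two delay slots; fill them with NOPs. */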
        serialize_one_inst(c, qpu_NOP());
        serialize_one_inst(c, qpu_NOP());

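        /* Fragment shaders must release the tile-buffer scoreboard lock;
         * hang the unlock signal on the final delay-slot NOP.
         */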
        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}