vc4: Add support for ARL and indirect register access on TGSI_FILE_CONSTANT.
src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

struct queued_qpu_inst {
        struct simple_node link;
        uint64_t inst;
};

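/* Instructions are queued onto a list as they are generated; once the
 * whole shader has been emitted, serialize_insts() copies them into the
 * final array, inserting NOPs wherever the hardware requires delay
 * slots.
 */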
static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = calloc(1, sizeof(*q));
        q->inst = inst;
        insert_at_tail(&c->qpu_inst_list, &q->link);
}

static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
        return &q->inst;
}

static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_A)
                        src->mux = QPU_MUX_B;
                else
                        src->mux = QPU_MUX_A;
                return true;

        default:
                return false;
        }
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead.
 */
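/* For example, if both operands were allocated to regfile A at
 * different addresses, one of them gets copied through the r3
 * accumulator first.
 */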
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg *src0, struct qpu_reg *src1)
{
        if ((src0->mux != QPU_MUX_A && src0->mux != QPU_MUX_B) ||
            src0->mux != src1->mux ||
            src0->addr == src1->addr) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

        queue(c, qpu_a_MOV(qpu_r3(), *src1));
        *src1 = qpu_r3();
}

static void
serialize_one_inst(struct vc4_compile *c, uint64_t inst)
{
        if (c->qpu_inst_count >= c->qpu_inst_size) {
                c->qpu_inst_size = MAX2(16, c->qpu_inst_size * 2);
                c->qpu_insts = realloc(c->qpu_insts,
                                       c->qpu_inst_size * sizeof(uint64_t));
        }
        c->qpu_insts[c->qpu_inst_count++] = inst;
}

static void
serialize_insts(struct vc4_compile *c)
{
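        /* Start far enough in the past that the r4-vs-SFU distance check
         * below can't fire on the first instructions of the program.
         */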
        int last_sfu_write = -10;
        bool scoreboard_wait_emitted = false;

        while (!is_empty_list(&c->qpu_inst_list)) {
                struct queued_qpu_inst *q =
                        (struct queued_qpu_inst *)first_elem(&c->qpu_inst_list);
                uint32_t last_waddr_a = QPU_W_NOP, last_waddr_b = QPU_W_NOP;
                uint32_t raddr_a = QPU_GET_FIELD(q->inst, QPU_RADDR_A);
                uint32_t raddr_b = QPU_GET_FIELD(q->inst, QPU_RADDR_B);

                if (c->qpu_inst_count > 0) {
                        uint64_t last_inst = c->qpu_insts[c->qpu_inst_count -
                                                          1];
                        uint32_t last_waddr_add = QPU_GET_FIELD(last_inst,
                                                                QPU_WADDR_ADD);
                        uint32_t last_waddr_mul = QPU_GET_FIELD(last_inst,
                                                                QPU_WADDR_MUL);

                        if (last_inst & QPU_WS) {
                                last_waddr_a = last_waddr_mul;
                                last_waddr_b = last_waddr_add;
                        } else {
                                last_waddr_a = last_waddr_add;
                                last_waddr_b = last_waddr_mul;
                        }
                }

                uint32_t src_muxes[] = {
                        QPU_GET_FIELD(q->inst, QPU_ADD_A),
                        QPU_GET_FIELD(q->inst, QPU_ADD_B),
                        QPU_GET_FIELD(q->inst, QPU_MUL_A),
                        QPU_GET_FIELD(q->inst, QPU_MUL_B),
                };

                /* "An instruction must not read from a location in physical
                 * regfile A or B that was written to by the previous
                 * instruction."
                 */
                bool needs_raddr_vs_waddr_nop = false;
                bool reads_r4 = false;
                for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
                        if ((raddr_a < 32 &&
                             src_muxes[i] == QPU_MUX_A &&
                             last_waddr_a == raddr_a) ||
                            (raddr_b < 32 &&
                             src_muxes[i] == QPU_MUX_B &&
                             last_waddr_b == raddr_b)) {
                                needs_raddr_vs_waddr_nop = true;
                        }
                        if (src_muxes[i] == QPU_MUX_R4)
                                reads_r4 = true;
                }

                if (needs_raddr_vs_waddr_nop) {
                        serialize_one_inst(c, qpu_NOP());
                }

                /* "After an SFU lookup instruction, accumulator r4 must not
                 * be read in the following two instructions. Any other
                 * instruction that results in r4 being written (that is, TMU
                 * read, TLB read, SFU lookup) cannot occur in the two
                 * instructions following an SFU lookup."
                 */
                if (reads_r4) {
                        while (c->qpu_inst_count - last_sfu_write < 3) {
                                serialize_one_inst(c, qpu_NOP());
                        }
                }

                uint32_t waddr_a = QPU_GET_FIELD(q->inst, QPU_WADDR_ADD);
                uint32_t waddr_m = QPU_GET_FIELD(q->inst, QPU_WADDR_MUL);
                if ((waddr_a >= QPU_W_SFU_RECIP && waddr_a <= QPU_W_SFU_LOG) ||
                    (waddr_m >= QPU_W_SFU_RECIP && waddr_m <= QPU_W_SFU_LOG)) {
                        last_sfu_write = c->qpu_inst_count;
                }

                /* "A scoreboard wait must not occur in the first two
                 * instructions of a fragment shader. This is either the
                 * explicit Wait for Scoreboard signal or an implicit wait
                 * with the first tile-buffer read or write instruction."
                 */
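                /* If this instruction touches the TLB, pad with NOPs so that
                 * the wait signal doesn't land in the first two instructions
                 * and goes on an instruction whose signal field is still
                 * free.
                 */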
                if (!scoreboard_wait_emitted &&
                    (waddr_a == QPU_W_TLB_Z || waddr_m == QPU_W_TLB_Z ||
                     waddr_a == QPU_W_TLB_COLOR_MS ||
                     waddr_m == QPU_W_TLB_COLOR_MS ||
                     waddr_a == QPU_W_TLB_COLOR_ALL ||
                     waddr_m == QPU_W_TLB_COLOR_ALL ||
                     QPU_GET_FIELD(q->inst, QPU_SIG) == QPU_SIG_COLOR_LOAD)) {
                        while (c->qpu_inst_count < 3 ||
                               QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                                             QPU_SIG) != QPU_SIG_NONE) {
                                serialize_one_inst(c, qpu_NOP());
                        }
                        c->qpu_insts[c->qpu_inst_count - 1] =
                                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                            QPU_SIG_WAIT_FOR_SCOREBOARD);
                        scoreboard_wait_emitted = true;
                }

                serialize_one_inst(c, q->inst);

                remove_from_list(&q->link);
                free(q);
        }
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;

        make_empty_list(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
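                /* 0x00001a00 should decode, per the VideoCore IV spec, as a
                 * 32-bit, horizontal, stride-1 VPM read setup, with the read
                 * count in bits 23:20, where 16 is encoded as 0 (hence the
                 * & 0xf below).  The same base value is reused for the write
                 * setup, which has no count field.
                 */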
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        struct simple_node *node;
        foreach(node, &c->instructions) {
                struct qinst *qinst = (struct qinst *)node;

#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                static const struct {
                        uint32_t op;
                        bool is_mul;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name, false}
#define M(name) [QOP_##name] = {QPU_M_##name, true}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(MUL24),
                };

                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op. */
                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                        }
                        break;

                case QOP_SF:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

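                /* The SEL operations become a pair of conditional MOVs:
                 * src[0] under the requested condition, then zero (x ^ x,
                 * which takes no extra raddr) under the opposite one.  The
                 * QPU condition codes come in set/clear pairs differing only
                 * in the low bit, so XOR with 1 inverts the sense.
                 */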
                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_VPM_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_VPM), src[0]));
                        break;

                case QOP_VPM_READ:
                        queue(c, qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)));
                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

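                /* With the PM bit set, the MUL pack modes 8A-8D convert the
                 * float result to an 8-bit value in the corresponding byte,
                 * building up the packed color in r3 one channel at a time.
                 */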
                case QOP_PACK_COLORS:
                        for (int i = 0; i < 4; i++) {
                                queue(c, qpu_m_MOV(qpu_r3(), src[i]));
                                *last_inst(c) |= QPU_PM;
                                *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
                                                               QPU_PACK);
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r3()));

                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

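                /* Set the flags from the discard condition, so that the TLB
                 * Z and color writes below can be made conditional on it.
                 */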
                case QOP_TLB_DISCARD_SETUP:
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

                case QOP_PACK_SCALED: {
                        uint64_t a = (qpu_a_MOV(dst, src[0]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16A,
                                                    QPU_PACK));
                        uint64_t b = (qpu_a_MOV(dst, src[1]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16B,
                                                    QPU_PACK));

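                        /* If dst aliases src[1], emit the 16B half first so
                         * that the 16A write doesn't clobber src[1] before
                         * it is read.
                         */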
                        if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
                                queue(c, b);
                                queue(c, a);
                        } else {
                                queue(c, a);
                                queue(c, b);
                        }
                        break;
                }

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_DIRECT:
                        fixup_raddr_conflict(c, &src[0], &src[1]);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);

                        break;

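                /* With the PM bit set, the UNPACK field applies to the r4
                 * read path rather than regfile A, selecting byte A-D of the
                 * TMU/TLB result that was loaded into r4.
                 */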
                case QOP_R4_UNPACK_A:
                case QOP_R4_UNPACK_B:
                case QOP_R4_UNPACK_C:
                case QOP_R4_UNPACK_D:
                        assert(src[0].mux == QPU_MUX_R4);
                        queue(c, qpu_a_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_R4_UNPACK_A),
                                                       QPU_UNPACK);

                        break;

                case QOP_UNPACK_8A:
                case QOP_UNPACK_8B:
                case QOP_UNPACK_8C:
                case QOP_UNPACK_8D: {
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're setting the pack/unpack bits, if the
                         * destination is in A it would get re-packed, so
                         * stage the result through an accumulator instead.
                         */
                        struct qpu_reg orig_dst = dst;
                        if (orig_dst.mux == QPU_MUX_A)
                                dst = qpu_rn(3);

                        queue(c, qpu_a_FMAX(dst, src[0], src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_UNPACK_8A),
                                                       QPU_UNPACK);

                        if (orig_dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(orig_dst, dst));
                        }
                        break;
                }

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, &src[0], &src[1]);

                        if (translate[qinst->op].is_mul) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        }
                        break;
                }
        }

        serialize_insts(c);

        /* thread end can't have VPM write */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM) {
                serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                serialize_one_inst(c, qpu_NOP());
        }

        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
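        /* The PROG_END signal has two delay slots; these NOPs fill them,
         * and for fragment shaders the last one also carries the scoreboard
         * unlock below.
         */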
        serialize_one_inst(c, qpu_NOP());
        serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}