vc4: Redefine VPM writes as a (destination) QIR register file.
[mesa.git] src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
        q->inst = inst;
        insert_at_tail(&c->qpu_inst_list, &q->link);
}

static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
        return &q->inst;
}

static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
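 *
 * For example, uniforms (QPU_R_UNIF) and varyings (QPU_R_VARY) live at the
 * same read address in both the A and B register files, so an operand that
 * reads one of them can simply be flipped to the other mux instead of being
 * copied through a temporary.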
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_SMALL_IMM) {
                        return false;
                } else {
                        if (src->mux == QPU_MUX_A)
                                src->mux = QPU_MUX_B;
                        else
                                src->mux = QPU_MUX_A;
                        return true;
                }

        default:
                return false;
        }
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead.
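 *
 * For example, if both operands of an ADD end up in regfile A at different
 * addresses, only one of them can use the single raddr_a field, so the other
 * is first copied to the reserved rb31 (or staged through r3 for a regfile B
 * conflict).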
 */
static bool
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1,
                     bool r3_live)
{
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return false;
        }

        if (swap_file(src0) || swap_file(src1))
                return false;

        if (mux0 == QPU_MUX_A) {
                /* If we're conflicting over the A regfile, then we can just
                 * use the reserved rb31.
                 */
                queue(c, qpu_a_MOV(qpu_rb(31), *src1));
                *src1 = qpu_rb(31);
                return false;
        } else {
                /* Otherwise, we need a non-B regfile. So, we spill r3 out to
                 * rb31, then store our desired value in r3, and tell the
                 * caller to put rb31 back into r3 when we're done.
                 */
                if (r3_live)
                        queue(c, qpu_a_MOV(qpu_rb(31), qpu_r3()));
                queue(c, qpu_a_MOV(qpu_r3(), *src1));

                *src1 = qpu_r3();

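                /* Only ask the caller to restore r3 afterwards if it was
                 * live and this instruction doesn't itself overwrite r3 (in
                 * which case the spilled copy in rb31 is dead anyway).
                 */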
                return r3_live && dst.mux != QPU_MUX_R3;
        }
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        bool written_r3 = false;
        bool needs_restore;
        /* Map from the QIR ops enum order to QPU unpack bits. */
        static const uint32_t unpack_map[] = {
                QPU_UNPACK_8A,
                QPU_UNPACK_8B,
                QPU_UNPACK_8C,
                QPU_UNPACK_8D,
                QPU_UNPACK_16A_TO_F32,
                QPU_UNPACK_16B_TO_F32,
        };

        make_empty_list(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
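                /* The 0x00001a00 below is, going by the public VideoCore IV
                 * docs, the generic block read setup for horizontal, 32-bit,
                 * stride-1 VPM reads; the NUM field is OR'd in from
                 * num_entries and the ADDR field from vpm_read_offset.
                 */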
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

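                /* VPM write setup: presumably the same horizontal, 32-bit,
                 * stride-1 encoding as the read setup above, starting at
                 * VPM address 0.
                 */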
                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        struct simple_node *node;
        foreach(node, &c->instructions) {
                struct qinst *qinst = (struct qinst *)node;

#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                static const struct {
                        uint32_t op;
                        bool is_mul;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name, false}
#define M(name) [QOP_##name] = {QPU_M_##name, true}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(MUL24),
                };

                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                assert(!"not reached");
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op. */
                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                        }
                        break;

                case QOP_SF:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
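                        /* Conditional select of src[0] or zero: emit the MOV
                         * predicated on the condition, then zero dst
                         * (r0 ^ r0) predicated on the opposite condition.
                         * The ZS/ZC and NS/NC conditions come in pairs, so
                         * XOR-ing the op offset with 1 flips the sense.
                         */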
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
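                        /* Same predication trick as the SEL_X_0 cases above,
                         * but the "else" value comes from src[1] instead of
                         * zero.
                         */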
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_VPM_READ:
                        queue(c, qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)));
                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_PACK_COLORS: {
                        /* We have to be careful not to start writing over one
                         * of our source values when incrementally writing the
                         * destination. So, if the dst is one of the srcs, we
                         * pack that one first (and we pack 4 channels at once
                         * for the first pack).
                         */
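                        /* The first MOV uses the MUL pipeline's 8888 pack
                         * mode, which (per the VideoCore IV docs) replicates
                         * the converted 8-bit value into all four bytes, so
                         * the later per-channel packs only have to overwrite
                         * their own byte.
                         */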
                        struct qpu_reg first_pack = src[0];
                        for (int i = 0; i < 4; i++) {
                                if (src[i].mux == dst.mux &&
                                    src[i].addr == dst.addr) {
                                        first_pack = dst;
                                        break;
                                }
                        }
                        queue(c, qpu_m_MOV(dst, first_pack));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8888,
                                                       QPU_PACK);

                        for (int i = 0; i < 4; i++) {
                                if (src[i].mux == first_pack.mux &&
                                    src[i].addr == first_pack.addr) {
                                        continue;
                                }

                                queue(c, qpu_m_MOV(dst, src[i]));
                                *last_inst(c) |= QPU_PM;
                                *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
                                                               QPU_PACK);
                        }

                        break;
                }

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_DISCARD_SETUP:
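                        /* Set the condition flags from the discard value
                         * (a MOV with SF), so that the TLB Z and color
                         * writes below can be predicated on ZS and skipped
                         * for discarded pixels.
                         */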
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

                case QOP_PACK_SCALED: {
                        uint64_t a = (qpu_a_MOV(dst, src[0]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16A,
                                                    QPU_PACK));
                        uint64_t b = (qpu_a_MOV(dst, src[1]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16B,
                                                    QPU_PACK));

                        if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
                                queue(c, b);
                                queue(c, a);
                        } else {
                                queue(c, a);
                                queue(c, b);
                        }
                        break;
                }

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_DIRECT:
                        needs_restore = fixup_raddr_conflict(c, dst,
                                                             &src[0], &src[1],
                                                             written_r3);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
                        if (needs_restore)
                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);

                        break;

                case QOP_R4_UNPACK_A:
                case QOP_R4_UNPACK_B:
                case QOP_R4_UNPACK_C:
                case QOP_R4_UNPACK_D:
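                        /* With the PM bit set, the unpack field applies to
                         * the r4 read, selecting byte A-D and (per the
                         * VideoCore IV docs) converting it from an 8-bit
                         * unorm to float.
                         */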
                        assert(src[0].mux == QPU_MUX_R4);
                        queue(c, qpu_a_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_R4_UNPACK_A),
                                                       QPU_UNPACK);

                        break;

                case QOP_UNPACK_8A_F:
                case QOP_UNPACK_8B_F:
                case QOP_UNPACK_8C_F:
                case QOP_UNPACK_8D_F:
                case QOP_UNPACK_16A_F:
                case QOP_UNPACK_16B_F: {
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're setting the pack bits, if the
                         * destination is in A it would get re-packed.
                         */
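                        /* FMAX(x, x) is used as the move here so the unpack
                         * goes through the float ALU path and (presumably)
                         * converts the unpacked value to float, which the
                         * plain integer MOV in the _I cases below would not
                         * do.
                         */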
                        queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ?
                                             qpu_rb(31) : dst),
                                            src[0], src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op -
                                                                  QOP_UNPACK_8A_F],
                                                       QPU_UNPACK);

                        if (dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(dst, qpu_rb(31)));
                        }
                }
                        break;

                case QOP_UNPACK_8A_I:
                case QOP_UNPACK_8B_I:
                case QOP_UNPACK_8C_I:
                case QOP_UNPACK_8D_I:
                case QOP_UNPACK_16A_I:
                case QOP_UNPACK_16B_I: {
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're setting the pack bits, if the
                         * destination is in A it would get re-packed.
                         */
                        queue(c, qpu_a_MOV((dst.mux == QPU_MUX_A ?
                                            qpu_rb(31) : dst), src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op -
                                                                  QOP_UNPACK_8A_I],
                                                       QPU_UNPACK);

                        if (dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(dst, qpu_rb(31)));
                        }
                }
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        needs_restore = fixup_raddr_conflict(c, dst,
                                                             &src[0], &src[1],
                                                             written_r3);

                        if (translate[qinst->op].is_mul) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        }
                        if (needs_restore)
                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));

                        break;
                }

                if (dst.mux == QPU_MUX_R3)
                        written_r3 = true;
        }

        qpu_schedule_instructions(c);

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}