vc4: Redo VPM reads as a read file.
src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
        q->inst = inst;
        insert_at_tail(&c->qpu_inst_list, &q->link);
}

static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
        return &q->inst;
}

static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_SMALL_IMM) {
                        return false;
                } else {
                        if (src->mux == QPU_MUX_A)
                                src->mux = QPU_MUX_B;
                        else
                                src->mux = QPU_MUX_A;
                        return true;
                }

        default:
                return false;
        }
}
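/* Illustration (hypothetical operands, not tied to a specific shader): if
 * both sources of an ADD end up needing raddr_a -- say a temp in ra5 and a
 * uniform -- the uniform side can simply be flipped to raddr_b, because
 * QPU_R_UNIF and QPU_R_VARY exist at the same read address in both register
 * files.  No extra MOV is needed in that case.
 */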

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead.
 */
static bool
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1,
                     bool r3_live)
{
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return false;
        }

        if (swap_file(src0) || swap_file(src1))
                return false;

        if (mux0 == QPU_MUX_A) {
                /* If we're conflicting over the A regfile, then we can just
                 * use the reserved rb31.
                 */
                queue(c, qpu_a_MOV(qpu_rb(31), *src1));
                *src1 = qpu_rb(31);
                return false;
        } else {
                /* Otherwise, we need a non-B regfile.  So, we spill r3 out to
                 * rb31, then store our desired value in r3, and tell the
                 * caller to put rb31 back into r3 when we're done.
                 */
                if (r3_live)
                        queue(c, qpu_a_MOV(qpu_rb(31), qpu_r3()));
                queue(c, qpu_a_MOV(qpu_r3(), *src1));

                *src1 = qpu_r3();

                return r3_live && dst.mux != QPU_MUX_R3;
        }
}
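/* Worked example (hypothetical register assignments): for an ADD whose two
 * sources landed in ra2 and ra5, both reads need raddr_a and neither is a
 * uniform or varying that swap_file() could flip.  Since the conflict is in
 * regfile A, src1 gets copied to the reserved rb31 and the ADD reads
 * "ra2, rb31".  Had the conflict been in regfile B instead, src1 would be
 * staged through r3, spilling and restoring r3 via rb31 when the caller
 * reports r3 as live.
 */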

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        int last_vpm_read_index = -1;
        bool written_r3 = false;
        bool needs_restore;
        /* Map from the QIR ops enum order to QPU unpack bits. */
        static const uint32_t unpack_map[] = {
                QPU_UNPACK_8A,
                QPU_UNPACK_8B,
                QPU_UNPACK_8C,
                QPU_UNPACK_8D,
                QPU_UNPACK_16A_TO_F32,
                QPU_UNPACK_16B_TO_F32,
        };

        make_empty_list(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }
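        /* Rough decode of the 0x00001a00 setup constant above, going by the
         * VideoCore IV VPM generic block read/write setup layout (as best
         * understood here, not a spec quote): bits 9:8 select 32-bit
         * accesses, bit 11 selects horizontal layout, bits 17:12 give a
         * stride of 1, the number of rows to transfer lands in bits 23:20
         * (with 16 encoded as 0), and the starting VPM address sits in the
         * low byte -- which is why vpm_read_offset and num_entries are OR'd
         * into the word.
         */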

        struct simple_node *node;
        foreach(node, &c->instructions) {
                struct qinst *qinst = (struct qinst *)node;

#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                static const struct {
                        uint32_t op;
                        bool is_mul;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name, false}
#define M(name) [QOP_##name] = {QPU_M_##name, true}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(MUL24),
                };

                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op. */
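                        /* (Presumably the A/B checks below are there because,
                         * for the physical register files, dst.addr is a
                         * write address while src[0].addr is a read address,
                         * so matching numbers need not name the same
                         * register; only accumulator-to-accumulator moves
                         * can be proven to be no-ops here.)
                         */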
                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                        }
                        break;

                case QOP_SF:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

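                /* The SEL ops below rely on QOP_SEL_X_*_{ZS,ZC,NS,NC} being
                 * declared in the same order as QPU_COND_{ZS,ZC,NS,NC}, so
                 * the enum offset maps straight to a condition code, and
                 * XOR'ing the low bit flips to the complementary condition.
                 * Each select is then just two conditionally-executed writes
                 * to dst (XOR r0, r0 being a way to write 0).
                 */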
                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

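                /* The four SFU ops below all work the same way: the operand
                 * is written to the matching SFU register, and the result is
                 * then picked up from r4.  The hardware takes a couple of
                 * instructions before r4 holds the result; the later
                 * scheduling/validation passes are assumed to account for
                 * that latency.
                 */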
                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

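                /* For the pack ops below, setting QPU_PM makes the pack field
                 * apply to the MUL unit's output rather than to a regfile A
                 * write, which is why these go through qpu_m_MOV().  As
                 * understood here, the 8888 mode replicates the converted
                 * byte across all four lanes, while the 8A..8D modes update
                 * only the selected byte of dst -- hence the dance of moving
                 * src[0] into dst first.
                 */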
                case QOP_PACK_8888_F:
                        queue(c, qpu_m_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8888,
                                                       QPU_PACK);
                        break;

                case QOP_PACK_8A_F:
                case QOP_PACK_8B_F:
                case QOP_PACK_8C_F:
                case QOP_PACK_8D_F:
                        /* If dst doesn't happen to already contain src[0],
                         * then we have to move it in.
                         */
                        if (qinst->src[0].file != QFILE_NULL &&
                            (src[0].mux != dst.mux || src[0].addr != dst.addr)) {
                                /* Don't overwrite src1 while setting up
                                 * the dst!
                                 */
                                if (dst.mux == src[1].mux &&
                                    dst.addr == src[1].addr) {
                                        queue(c, qpu_m_MOV(qpu_rb(31), src[1]));
                                        src[1] = qpu_rb(31);
                                }

                                queue(c, qpu_m_MOV(dst, src[0]));
                        }

                        queue(c, qpu_m_MOV(dst, src[1]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A +
                                                       qinst->op - QOP_PACK_8A_F,
                                                       QPU_PACK);
                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_DISCARD_SETUP:
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

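                /* PACK_SCALED writes the two sources into the 16A/16B halves
                 * of dst.  The only subtlety is ordering: if dst aliases
                 * src[1], the 16B write has to go first so that src[1] is
                 * read before the 16A write clobbers it.
                 */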
                case QOP_PACK_SCALED: {
                        uint64_t a = (qpu_a_MOV(dst, src[0]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16A,
                                                    QPU_PACK));
                        uint64_t b = (qpu_a_MOV(dst, src[1]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16B,
                                                    QPU_PACK));

                        if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
                                queue(c, b);
                                queue(c, a);
                        } else {
                                queue(c, a);
                                queue(c, b);
                        }
                        break;
                }

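                /* For the texture ops: writing the S coordinate register is
                 * what actually fires off the TMU lookup (so it is expected
                 * to be the last of the coordinate writes), and the result
                 * is later fetched into r4 by the QPU_SIG_LOAD_TMU0 signal
                 * in QOP_TEX_RESULT.
                 */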
                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_DIRECT:
                        needs_restore = fixup_raddr_conflict(c, dst,
                                                             &src[0], &src[1],
                                                             written_r3);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
                        if (needs_restore)
                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);

                        break;

                case QOP_R4_UNPACK_A:
                case QOP_R4_UNPACK_B:
                case QOP_R4_UNPACK_C:
                case QOP_R4_UNPACK_D:
                        assert(src[0].mux == QPU_MUX_R4);
                        queue(c, qpu_a_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_R4_UNPACK_A),
                                                       QPU_UNPACK);

                        break;

                case QOP_UNPACK_8A_F:
                case QOP_UNPACK_8B_F:
                case QOP_UNPACK_8C_F:
                case QOP_UNPACK_8D_F:
                case QOP_UNPACK_16A_F:
                case QOP_UNPACK_16B_F: {
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're setting the pack bits, if the
                         * destination is in A it would get re-packed.
                         */
                        queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ?
                                             qpu_rb(31) : dst),
                                            src[0], src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op -
                                                                  QOP_UNPACK_8A_F],
                                                       QPU_UNPACK);

                        if (dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(dst, qpu_rb(31)));
                        }
                }
                        break;

                case QOP_UNPACK_8A_I:
                case QOP_UNPACK_8B_I:
                case QOP_UNPACK_8C_I:
                case QOP_UNPACK_8D_I:
                case QOP_UNPACK_16A_I:
                case QOP_UNPACK_16B_I: {
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're setting the pack bits, if the
                         * destination is in A it would get re-packed.
                         */
                        queue(c, qpu_a_MOV((dst.mux == QPU_MUX_A ?
                                            qpu_rb(31) : dst), src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op -
                                                                  QOP_UNPACK_8A_I],
                                                       QPU_UNPACK);

                        if (dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(dst, qpu_rb(31)));
                        }
                }
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        needs_restore = fixup_raddr_conflict(c, dst,
                                                             &src[0], &src[1],
                                                             written_r3);

                        if (translate[qinst->op].is_mul) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        }
                        if (needs_restore)
                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));

                        break;
                }

                if (dst.mux == QPU_MUX_R3)
                        written_r3 = true;
        }

        qpu_schedule_instructions(c);

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

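        /* The PROG_END signal is tacked onto what is currently the last
         * instruction, and the two NOPs queued after it fill the thread-end
         * delay slots the hardware still executes; for fragment shaders the
         * scoreboard unlock below then lands on the final one of those NOPs.
         */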
        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}