vc4: Switch store_output to using nir_lower_io_to_scalar / component.
[mesa.git] / src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
        fprintf(stderr, "\n");
}

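/* Queues a QPU instruction on the block.  Instructions go into a per-block
 * list rather than straight into the program so that the helpers below can
 * patch the most recently queued instruction (condition fields, signals,
 * pack/unpack modes) before everything is scheduled and serialized.
 */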
static void
queue(struct qblock *block, uint64_t inst)
{
        struct queued_qpu_inst *q = rzalloc(block, struct queued_qpu_inst);
        q->inst = inst;
        list_addtail(&q->link, &block->qpu_inst_list);
}

static uint64_t *
last_inst(struct qblock *block)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)block->qpu_inst_list.prev;
        return &q->inst;
}

static void
set_last_cond_add(struct qblock *block, uint32_t cond)
{
        *last_inst(block) = qpu_set_cond_add(*last_inst(block), cond);
}

static void
set_last_cond_mul(struct qblock *block, uint32_t cond)
{
        *last_inst(block) = qpu_set_cond_mul(*last_inst(block), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_SMALL_IMM) {
                        return false;
                } else {
                        if (src->mux == QPU_MUX_A)
                                src->mux = QPU_MUX_B;
                        else
                                src->mux = QPU_MUX_A;
                        return true;
                }

        default:
                return false;
        }
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead. We reserve ra31/rb31 for this purpose.
 */
static void
fixup_raddr_conflict(struct qblock *block,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1,
                     struct qinst *inst, uint64_t *unpack)
{
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

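        /* There's no conflict to fix if src0 comes from an accumulator
         * (mux <= r5), if the two sources come from different register
         * files, or if they are literally the same register.
         */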
        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

        if (mux0 == QPU_MUX_A) {
                /* Make sure we use the same type of MOV as the instruction,
                 * in case of unpacks.
                 */
                if (qir_is_float_input(inst))
                        queue(block, qpu_a_FMAX(qpu_rb(31), *src0, *src0));
                else
                        queue(block, qpu_a_MOV(qpu_rb(31), *src0));

                /* If we had an unpack on this A-file source, we need to put
                 * it into this MOV, not into the later move from regfile B.
                 */
                if (inst->src[0].pack) {
                        *last_inst(block) |= *unpack;
                        *unpack = 0;
                }
                *src0 = qpu_rb(31);
        } else {
                queue(block, qpu_a_MOV(qpu_ra(31), *src0));
                *src0 = qpu_ra(31);
        }
}

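/* Applies the destination's pack mode to the last queued instruction.  MUL
 * packs need the PM bit set, while ADD packs go through the A register file,
 * so the asserts check that the pack doesn't clash with an unpack already
 * using the PM bit, and that the destination actually lives in regfile A.
 */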
static void
set_last_dst_pack(struct qblock *block, struct qinst *inst)
{
        bool had_pm = *last_inst(block) & QPU_PM;
        bool had_ws = *last_inst(block) & QPU_WS;
        uint32_t unpack = QPU_GET_FIELD(*last_inst(block), QPU_UNPACK);

        if (!inst->dst.pack)
                return;

        *last_inst(block) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);

        if (qir_is_mul(inst)) {
                assert(!unpack || had_pm);
                *last_inst(block) |= QPU_PM;
        } else {
                assert(!unpack || !had_pm);
                assert(!had_ws); /* dst must be a-file to pack. */
        }
}

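/* SFU, TMU, and TLB color loads all return their results in the r4
 * accumulator.  After issuing one, we either copy r4 into the allocated
 * destination, or, if the value was allocated to r4 itself but the
 * instruction wants flags set, emit a MOV to the NOP register so there is
 * an instruction reading r4 for the SF bit to land on.
 */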
static void
handle_r4_qpu_write(struct qblock *block, struct qinst *qinst,
                    struct qpu_reg dst)
{
        if (dst.mux != QPU_MUX_R4)
                queue(block, qpu_a_MOV(dst, qpu_r4()));
        else if (qinst->sf)
                queue(block, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
}

static void
vc4_generate_code_block(struct vc4_compile *c,
                        struct qblock *block,
                        struct qpu_reg *temp_registers)
{
        int last_vpm_read_index = -1;

        qir_for_each_inst(qinst, block) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

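                /* Most ALU ops translate directly to a single QPU add- or
                 * mul-pipeline op; anything that doesn't appear in this
                 * table is special-cased in the switch below.
                 */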
                static const struct {
                        uint32_t op;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name}
#define M(name) [QOP_##name] = {QPU_M_##name}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(V8MULD),
                        M(V8MIN),
                        M(V8MAX),
                        M(V8ADDS),
                        M(V8SUBS),
                        M(MUL24),

                        /* If we replicate src[0] out to src[1], this works
                         * out the same as a MOV.
                         */
                        [QOP_MOV] = { QPU_A_OR },
                        [QOP_FMOV] = { QPU_A_FMAX },
                        [QOP_MMOV] = { QPU_M_V8MIN },
                };

                uint64_t unpack = 0;
                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                        case QFILE_LOAD_IMM:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                if (qinst->src[i].pack) {
                                        assert(!unpack ||
                                               unpack == qinst->src[i].pack);
                                        unpack = QPU_SET_FIELD(qinst->src[i].pack,
                                                               QPU_UNPACK);
                                        if (src[i].mux == QPU_MUX_R4)
                                                unpack |= QPU_PM;
                                }
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
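                        /* VPM reads pop sequentially from the read FIFO, so
                         * QIR must have kept them in source order; the
                         * assert checks that the indices stay nondecreasing.
                         */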
                        case QFILE_VPM:
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;

                        case QFILE_FRAG_X:
                                src[i] = qpu_ra(QPU_R_XY_PIXEL_COORD);
                                break;
                        case QFILE_FRAG_Y:
                                src[i] = qpu_rb(QPU_R_XY_PIXEL_COORD);
                                break;
                        case QFILE_FRAG_REV_FLAG:
                                src[i] = qpu_rb(QPU_R_MS_REV_FLAGS);
                                break;

                        case QFILE_TLB_COLOR_WRITE:
                        case QFILE_TLB_COLOR_WRITE_MS:
                        case QFILE_TLB_Z_WRITE:
                        case QFILE_TLB_STENCIL_SETUP:
                                unreachable("bad qir src file");
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;

                case QFILE_TLB_COLOR_WRITE:
                        dst = qpu_tlbc();
                        break;

                case QFILE_TLB_COLOR_WRITE_MS:
                        dst = qpu_tlbc_ms();
                        break;

                case QFILE_TLB_Z_WRITE:
                        dst = qpu_ra(QPU_W_TLB_Z);
                        break;

                case QFILE_TLB_STENCIL_SETUP:
                        dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP);
                        break;

                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                case QFILE_LOAD_IMM:
                case QFILE_FRAG_X:
                case QFILE_FRAG_Y:
                case QFILE_FRAG_REV_FLAG:
                        assert(!"not reached");
                        break;
                }

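                /* Only ALU ops and branches can encode a condition, so
                 * track whether qinst->cond actually made it into the
                 * generated code; the assert at the bottom of the loop
                 * catches any op that silently dropped it.
                 */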
                bool handled_qinst_cond = false;

                switch (qinst->op) {
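                /* SFU ops are issued by writing the operand to the matching
                 * SFU register; the result then shows up in r4 two
                 * instructions later (the scheduler enforces the latency).
                 */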
                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                       src[0]) | unpack);
                                break;
                        case QOP_RSQ:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                       src[0]) | unpack);
                                break;
                        case QOP_EXP2:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                       src[0]) | unpack);
                                break;
                        case QOP_LOG2:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                       src[0]) | unpack);
                                break;
                        default:
                                abort();
                        }

                        handle_r4_qpu_write(block, qinst, dst);

                        break;

                case QOP_LOAD_IMM:
                        assert(qinst->src[0].file == QFILE_LOAD_IMM);
                        queue(block, qpu_load_imm_ui(dst, qinst->src[0].index));
                        break;

                case QOP_MS_MASK:
                        src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(block, qpu_a_AND(qpu_ra(QPU_W_MS_FLAGS),
                                               src[0], src[1]) | unpack);
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(block, qpu_NOP());
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                        QPU_SIG_COLOR_LOAD);
                        handle_r4_qpu_write(block, qinst, dst);
                        break;

                case QOP_VARY_ADD_C:
                        queue(block, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
                        break;

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        queue(block, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                      (qinst->op - QOP_TEX_S)),
                                               src[0]) | unpack);
                        break;

                case QOP_TEX_DIRECT:
                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(block, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S),
                                               src[0], src[1]) | unpack);
                        break;

                case QOP_TEX_RESULT:
                        queue(block, qpu_NOP());
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                        QPU_SIG_LOAD_TMU0);
                        handle_r4_qpu_write(block, qinst, dst);
                        break;

                case QOP_BRANCH:
                        /* The branch target will be updated at QPU scheduling
                         * time.
                         */
                        queue(block, (qpu_branch(qinst->cond, 0) |
                                      QPU_BRANCH_REL));
                        handled_qinst_cond = true;
                        break;

                case QOP_UNIFORMS_RESET:
                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        queue(block, qpu_a_ADD(qpu_ra(QPU_W_UNIFORMS_ADDRESS),
                                               src[0], src[1]));
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* Skip emitting the MOV if it's a no-op. */
                        if (qir_is_raw_mov(qinst) &&
                            dst.mux == src[0].mux && dst.addr == src[0].addr) {
                                break;
                        }

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        if (qir_is_mul(qinst)) {
                                queue(block, qpu_m_alu2(translate[qinst->op].op,
                                                        dst,
                                                        src[0], src[1]) | unpack);
                                set_last_cond_mul(block, qinst->cond);
                        } else {
                                queue(block, qpu_a_alu2(translate[qinst->op].op,
                                                        dst,
                                                        src[0], src[1]) | unpack);
                                set_last_cond_add(block, qinst->cond);
                        }
                        handled_qinst_cond = true;
                        set_last_dst_pack(block, qinst);

                        break;
                }

                assert(qinst->cond == QPU_COND_ALWAYS ||
                       handled_qinst_cond);

                if (qinst->sf)
                        *last_inst(block) |= QPU_SF;
        }
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        struct qblock *start_block = list_first_entry(&c->blocks,
                                                      struct qblock, link);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
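                /* Per the VideoCore IV VPM block-read setup encoding,
                 * 0x00001a00 selects horizontal, 32-bit-wide reads with a
                 * stride of 1; the number of rows per vertex goes in bits
                 * 23:20 (where 16 encodes as 0) and the start address in
                 * the low byte.
                 */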
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        queue(start_block,
                              qpu_load_imm_ui(qpu_vrsetup(),
                                              vpm_read_offset |
                                              0x00001a00 |
                                              ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

                queue(start_block, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        qir_for_each_block(block, c)
                vc4_generate_code_block(c, block, temp_registers);

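        /* Scheduling pairs add/mul operations where possible, fills delay
         * slots, and serializes each block's queued instructions into
         * c->qpu_insts; the return value is an estimated cycle count that
         * feeds the shader-db output below.
         */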
        uint32_t cycles = qpu_schedule_instructions(c);
        uint32_t inst_count_at_schedule_time = c->qpu_inst_count;

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        /* Make sure there's no existing signal set (like for a small
         * immediate)
         */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_SIG) != QPU_SIG_NONE) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

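        /* The PROG_END signal has two delay slots, so pad with two NOPs for
         * it to take effect in; for fragment shaders, the scoreboard unlock
         * then goes on the last of those NOPs.
         */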
        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        cycles += c->qpu_inst_count - inst_count_at_schedule_time;

        if (vc4_debug & VC4_DEBUG_SHADERDB) {
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id,
                        cycles);
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}