vc4: Allow TLB Z/color/stencil writes from any ALU operation in QIR.
src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"

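/**
 * Dumps the generated QPU program to stderr, one instruction per line as
 * raw hex followed by its disassembly.
 */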
static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

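/**
 * Appends a QPU instruction to the program being generated.  Fields of the
 * instruction can be patched afterwards through last_inst().
 */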
static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
        q->inst = inst;
        list_addtail(&q->link, &c->qpu_inst_list);
}

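/**
 * Returns a pointer to the most recently queued instruction, so that
 * condition codes, signals, and pack/unpack fields can be ORed in after
 * the fact.
 */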
static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)c->qpu_inst_list.prev;
        return &q->inst;
}

static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

static void
set_last_cond_mul(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_mul(*last_inst(c), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_SMALL_IMM) {
                        return false;
                } else {
                        if (src->mux == QPU_MUX_A)
                                src->mux = QPU_MUX_B;
                        else
                                src->mux = QPU_MUX_A;
                        return true;
                }

        default:
                return false;
        }
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead. We reserve ra31/rb31 for this purpose.
 */
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1,
                     struct qinst *inst, uint64_t *unpack)
{
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

        if (mux0 == QPU_MUX_A) {
                /* Make sure we use the same type of MOV as the instruction,
                 * in case of unpacks.
                 */
                if (qir_is_float_input(inst))
                        queue(c, qpu_a_FMAX(qpu_rb(31), *src0, *src0));
                else
                        queue(c, qpu_a_MOV(qpu_rb(31), *src0));

                /* If we had an unpack on this A-file source, we need to put
                 * it into this MOV, not into the later move from regfile B.
                 */
                if (inst->src[0].pack) {
                        *last_inst(c) |= *unpack;
                        *unpack = 0;
                }
                *src0 = qpu_rb(31);
        } else {
                queue(c, qpu_a_MOV(qpu_ra(31), *src0));
                *src0 = qpu_ra(31);
        }
}

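/**
 * Applies the destination pack mode of the QIR instruction to the last
 * queued QPU instruction.  MUL-unit packs are flagged with QPU_PM, while
 * ADD-unit packs use A-file regfile packing, so the destination must be in
 * the A file.
 */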
static void
set_last_dst_pack(struct vc4_compile *c, struct qinst *inst)
{
        bool had_pm = *last_inst(c) & QPU_PM;
        bool had_ws = *last_inst(c) & QPU_WS;
        uint32_t unpack = QPU_GET_FIELD(*last_inst(c), QPU_UNPACK);

        if (!inst->dst.pack)
                return;

        *last_inst(c) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);

        if (qir_is_mul(inst)) {
                assert(!unpack || had_pm);
                *last_inst(c) |= QPU_PM;
        } else {
                assert(!unpack || !had_pm);
                assert(!had_ws); /* dst must be a-file to pack. */
        }
}

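/**
 * Moves a result that the hardware delivered in the r4 accumulator (SFU
 * operations, texture and TLB color loads) into the allocated destination.
 * If the destination is r4 itself no MOV is needed, but when the
 * instruction also sets flags we still emit a MOV to the NOP register so
 * that the trailing QPU_SF has an instruction reading r4 to attach to.
 */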
static void
handle_r4_qpu_write(struct vc4_compile *c, struct qinst *qinst,
                    struct qpu_reg dst)
{
        if (dst.mux != QPU_MUX_R4)
                queue(c, qpu_a_MOV(dst, qpu_r4()));
        else if (qinst->sf)
                queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
}

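/**
 * Converts the compiled QIR instruction list into a QPU program, schedules
 * it, and appends the required thread-end sequence.
 */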
void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        int last_vpm_read_index = -1;

        list_inithead(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
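                /* A sketch of the setup word, going by the VideoCore IV
                 * docs: 0x00001a00 selects horizontal, 32-bit VPM reads
                 * with a stride of 1, and we OR in the row count (bits
                 * 23:20, where 16 encodes as 0) and the starting VPM
                 * address.
                 */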
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        list_for_each_entry(struct qinst, qinst, &c->instructions, link) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                static const struct {
                        uint32_t op;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name}
#define M(name) [QOP_##name] = {QPU_M_##name}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(V8MULD),
                        M(V8MIN),
                        M(V8MAX),
                        M(V8ADDS),
                        M(V8SUBS),
                        M(MUL24),

                        /* If we replicate src[0] out to src[1], this works
                         * out the same as a MOV.
                         */
                        [QOP_MOV] = { QPU_A_OR },
                        [QOP_FMOV] = { QPU_A_FMAX },
                        [QOP_MMOV] = { QPU_M_V8MIN },
                };

                uint64_t unpack = 0;
                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                if (qinst->src[i].pack) {
                                        assert(!unpack ||
                                               unpack == qinst->src[i].pack);
                                        unpack = QPU_SET_FIELD(qinst->src[i].pack,
                                                               QPU_UNPACK);
                                        if (src[i].mux == QPU_MUX_R4)
                                                unpack |= QPU_PM;
                                }
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;
                        case QFILE_TLB_COLOR_WRITE:
                        case QFILE_TLB_COLOR_WRITE_MS:
                        case QFILE_TLB_Z_WRITE:
                        case QFILE_TLB_STENCIL_SETUP:
                                unreachable("bad qir src file");
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;

                case QFILE_TLB_COLOR_WRITE:
                        dst = qpu_tlbc();
                        break;

                case QFILE_TLB_COLOR_WRITE_MS:
                        dst = qpu_tlbc_ms();
                        break;

                case QFILE_TLB_Z_WRITE:
                        dst = qpu_ra(QPU_W_TLB_Z);
                        break;

                case QFILE_TLB_STENCIL_SETUP:
                        dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP);
                        break;

                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                        assert(!"not reached");
                        break;
                }

                bool handled_qinst_cond = false;

                switch (qinst->op) {
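                /* The SFU ops are started by moving the argument into the
                 * corresponding SFU register; the unit delivers its result
                 * in the r4 accumulator, which handle_r4_qpu_write() then
                 * copies out.
                 */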
                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]) | unpack);
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]) | unpack);
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]) | unpack);
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]) | unpack);
                                break;
                        default:
                                abort();
                        }

                        handle_r4_qpu_write(c, qinst, dst);

                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_MS_MASK:
                        src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(c, qpu_a_AND(qpu_ra(QPU_W_MS_FLAGS),
                                           src[0], src[1]) | unpack);
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);
                        handle_r4_qpu_write(c, qinst, dst);
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
                        break;

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]) | unpack);
                        break;

                case QOP_TEX_DIRECT:
                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S),
                                           src[0], src[1]) | unpack);
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);
                        handle_r4_qpu_write(c, qinst, dst);
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* Skip emitting the MOV if it's a no-op. */
                        if (qir_is_raw_mov(qinst) &&
                            dst.mux == src[0].mux && dst.addr == src[0].addr) {
                                break;
                        }

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        if (qir_is_mul(qinst)) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]) | unpack);
                                set_last_cond_mul(c, qinst->cond);
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]) | unpack);
                                set_last_cond_add(c, qinst->cond);
                        }
                        handled_qinst_cond = true;
                        set_last_dst_pack(c, qinst);

                        break;
                }

                assert(qinst->cond == QPU_COND_ALWAYS ||
                       handled_qinst_cond);

                if (qinst->sf)
                        *last_inst(c) |= QPU_SF;
        }

        uint32_t cycles = qpu_schedule_instructions(c);
        uint32_t inst_count_at_schedule_time = c->qpu_inst_count;

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
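        /* The PROG_END signal doesn't terminate the thread until two more
         * instructions have executed, so pad out the delay slots with NOPs.
         */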
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

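        /* Fragment shaders hold the TLB scoreboard lock from their first
         * TLB access until thread end, so tag the final instruction with
         * the unlock signal.
         */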
        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        cycles += c->qpu_inst_count - inst_count_at_schedule_time;

        if (vc4_debug & VC4_DEBUG_SHADERDB) {
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id,
                        cycles);
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}