vc4: Move FRAG_X/Y/REV_FLAG to a QFILE like VPM or TLB color writes.
[mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_emit.c
1 /*
2 * Copyright © 2014 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <inttypes.h>
25
26 #include "vc4_context.h"
27 #include "vc4_qir.h"
28 #include "vc4_qpu.h"
29 #include "util/ralloc.h"
30
31 static void
32 vc4_dump_program(struct vc4_compile *c)
33 {
34 fprintf(stderr, "%s prog %d/%d QPU:\n",
35 qir_get_stage_name(c->stage),
36 c->program_id, c->variant_id);
37
38 for (int i = 0; i < c->qpu_inst_count; i++) {
39 fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
40 vc4_qpu_disasm(&c->qpu_insts[i], 1);
41 fprintf(stderr, "\n");
42 }
43 }
44
45 static void
46 queue(struct vc4_compile *c, uint64_t inst)
47 {
48 struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
49 q->inst = inst;
50 list_addtail(&q->link, &c->qpu_inst_list);
51 }
52
53 static uint64_t *
54 last_inst(struct vc4_compile *c)
55 {
56 struct queued_qpu_inst *q =
57 (struct queued_qpu_inst *)c->qpu_inst_list.prev;
58 return &q->inst;
59 }
60
/* Sets the condition field of the ADD half of the last queued instruction. */
static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        uint64_t *inst = last_inst(c);

        *inst = qpu_set_cond_add(*inst, cond);
}
66
/* Sets the condition field of the MUL half of the last queued instruction. */
static void
set_last_cond_mul(struct vc4_compile *c, uint32_t cond)
{
        uint64_t *inst = last_inst(c);

        *inst = qpu_set_cond_mul(*inst, cond);
}
72
73 /**
74 * Some special registers can be read from either file, which lets us resolve
75 * raddr conflicts without extra MOVs.
76 */
77 static bool
78 swap_file(struct qpu_reg *src)
79 {
80 switch (src->addr) {
81 case QPU_R_UNIF:
82 case QPU_R_VARY:
83 if (src->mux == QPU_MUX_SMALL_IMM) {
84 return false;
85 } else {
86 if (src->mux == QPU_MUX_A)
87 src->mux = QPU_MUX_B;
88 else
89 src->mux = QPU_MUX_A;
90 return true;
91 }
92
93 default:
94 return false;
95 }
96 }
97
/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead. We reserve ra31/rb31 for this purpose.
 */
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1,
                     struct qinst *inst, uint64_t *unpack)
{
        /* Small immediates occupy the B-file raddr slot, so treat them as
         * B-file reads for conflict detection.
         */
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

        /* No conflict if src0 is an accumulator, if the two sources come
         * from different files, or if they name the very same raddr (one
         * raddr field then serves both reads).
         */
        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return;
        }

        /* Cheapest fix: re-read one operand through the other file
         * (possible for uniforms/varyings).  Short-circuit order matters:
         * src1 is only swapped when src0 could not be.
         */
        if (swap_file(src0) || swap_file(src1))
                return;

        if (mux0 == QPU_MUX_A) {
                /* Make sure we use the same type of MOV as the instruction,
                 * in case of unpacks.
                 */
                if (qir_is_float_input(inst))
                        queue(c, qpu_a_FMAX(qpu_rb(31), *src0, *src0));
                else
                        queue(c, qpu_a_MOV(qpu_rb(31), *src0));

                /* If we had an unpack on this A-file source, we need to put
                 * it into this MOV, not into the later move from regfile B.
                 */
                if (inst->src[0].pack) {
                        *last_inst(c) |= *unpack;
                        *unpack = 0;
                }
                *src0 = qpu_rb(31);
        } else {
                queue(c, qpu_a_MOV(qpu_ra(31), *src0));
                *src0 = qpu_ra(31);
        }
}
148
149 static void
150 set_last_dst_pack(struct vc4_compile *c, struct qinst *inst)
151 {
152 bool had_pm = *last_inst(c) & QPU_PM;
153 bool had_ws = *last_inst(c) & QPU_WS;
154 uint32_t unpack = QPU_GET_FIELD(*last_inst(c), QPU_UNPACK);
155
156 if (!inst->dst.pack)
157 return;
158
159 *last_inst(c) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);
160
161 if (qir_is_mul(inst)) {
162 assert(!unpack || had_pm);
163 *last_inst(c) |= QPU_PM;
164 } else {
165 assert(!unpack || !had_pm);
166 assert(!had_ws); /* dst must be a-file to pack. */
167 }
168 }
169
170 static void
171 handle_r4_qpu_write(struct vc4_compile *c, struct qinst *qinst,
172 struct qpu_reg dst)
173 {
174 if (dst.mux != QPU_MUX_R4)
175 queue(c, qpu_a_MOV(dst, qpu_r4()));
176 else if (qinst->sf)
177 queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
178 }
179
/**
 * Translates the compile's QIR instruction list into a scheduled stream of
 * QPU instructions in c->qpu_insts, handling stage-specific VPM setup,
 * per-operand register-file placement, and the hardware's thread-end
 * restrictions.
 */
void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        int last_vpm_read_index = -1;

        list_inithead(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

                /* Set up the VPM write FIFO for the vertex outputs. */
                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        list_for_each_entry(struct qinst, qinst, &c->instructions, link) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                /* Table mapping simple one-to-one QIR ALU ops to their QPU
                 * add-unit (A) or mul-unit (M) opcode.
                 */
                static const struct {
                        uint32_t op;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name}
#define M(name) [QOP_##name] = {QPU_M_##name}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(V8MULD),
                        M(V8MIN),
                        M(V8MAX),
                        M(V8ADDS),
                        M(V8SUBS),
                        M(MUL24),

                        /* If we replicate src[0] out to src[1], this works
                         * out the same as a MOV.
                         */
                        [QOP_MOV] = { QPU_A_OR },
                        [QOP_FMOV] = { QPU_A_FMAX },
                        [QOP_MMOV] = { QPU_M_V8MIN },
                };

                /* Map each QIR source onto a physical QPU register or
                 * immediate, accumulating any unpack bits along the way.
                 */
                uint64_t unpack = 0;
                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                if (qinst->src[i].pack) {
                                        /* Only one unpack mode fits per QPU
                                         * instruction, so all packed sources
                                         * must agree.
                                         */
                                        assert(!unpack ||
                                               unpack == qinst->src[i].pack);
                                        unpack = QPU_SET_FIELD(qinst->src[i].pack,
                                                               QPU_UNPACK);
                                        if (src[i].mux == QPU_MUX_R4)
                                                unpack |= QPU_PM;
                                }
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                /* VPM reads are FIFO-ordered, so indices
                                 * must be non-decreasing.
                                 */
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;

                        case QFILE_FRAG_X:
                                src[i] = qpu_ra(QPU_R_XY_PIXEL_COORD);
                                break;
                        case QFILE_FRAG_Y:
                                src[i] = qpu_rb(QPU_R_XY_PIXEL_COORD);
                                break;
                        case QFILE_FRAG_REV_FLAG:
                                src[i] = qpu_rb(QPU_R_MS_REV_FLAGS);
                                break;

                        case QFILE_TLB_COLOR_WRITE:
                        case QFILE_TLB_COLOR_WRITE_MS:
                        case QFILE_TLB_Z_WRITE:
                        case QFILE_TLB_STENCIL_SETUP:
                                /* These are write-only destinations. */
                                unreachable("bad qir src file");
                        }
                }

                /* Map the QIR destination onto a physical write address. */
                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;

                case QFILE_TLB_COLOR_WRITE:
                        dst = qpu_tlbc();
                        break;

                case QFILE_TLB_COLOR_WRITE_MS:
                        dst = qpu_tlbc_ms();
                        break;

                case QFILE_TLB_Z_WRITE:
                        dst = qpu_ra(QPU_W_TLB_Z);
                        break;

                case QFILE_TLB_STENCIL_SETUP:
                        dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP);
                        break;

                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                case QFILE_FRAG_X:
                case QFILE_FRAG_Y:
                case QFILE_FRAG_REV_FLAG:
                        /* These are read-only sources. */
                        assert(!"not reached");
                        break;
                }

                /* Tracks whether the emitted code already applied
                 * qinst->cond; special-cased ops below must be
                 * unconditional.
                 */
                bool handled_qinst_cond = false;

                switch (qinst->op) {
                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        /* SFU ops: write the argument to the matching SFU
                         * register; the result appears in r4 two
                         * instructions later.
                         */
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]) | unpack);
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]) | unpack);
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]) | unpack);
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]) | unpack);
                                break;
                        default:
                                abort();
                        }

                        handle_r4_qpu_write(c, qinst, dst);

                        break;

                case QOP_MS_MASK:
                        src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(c, qpu_a_AND(qpu_ra(QPU_W_MS_FLAGS),
                                           src[0], src[1]) | unpack);
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);
                        handle_r4_qpu_write(c, qinst, dst);
                        break;

                case QOP_VARY_ADD_C:
                        /* r5 holds the per-quad varying C coefficient. */
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
                        break;

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        /* TMU coordinate writes; relies on QPU_W_TMU0_S..B
                         * being contiguous in the same order as the QOPs.
                         */
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]) | unpack);
                        break;

                case QOP_TEX_DIRECT:
                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S),
                                           src[0], src[1]) | unpack);
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);
                        handle_r4_qpu_write(c, qinst, dst);
                        break;

                default:
                        /* Everything else goes through the translate[]
                         * table above.
                         */
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* Skip emitting the MOV if it's a no-op. */
                        if (qir_is_raw_mov(qinst) &&
                            dst.mux == src[0].mux && dst.addr == src[0].addr) {
                                break;
                        }

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        if (qir_is_mul(qinst)) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]) | unpack);
                                set_last_cond_mul(c, qinst->cond);
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]) | unpack);
                                set_last_cond_add(c, qinst->cond);
                        }
                        handled_qinst_cond = true;
                        set_last_dst_pack(c, qinst);

                        break;
                }

                assert(qinst->cond == QPU_COND_ALWAYS ||
                       handled_qinst_cond);

                if (qinst->sf)
                        *last_inst(c) |= QPU_SF;
        }

        uint32_t cycles = qpu_schedule_instructions(c);
        uint32_t inst_count_at_schedule_time = c->qpu_inst_count;

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        /* Attach the program-end signal, followed by the two delay-slot
         * NOPs the hardware requires after it.
         */
        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                /* Fragment shaders release the tile scoreboard on the last
                 * instruction.
                 */
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        cycles += c->qpu_inst_count - inst_count_at_schedule_time;

        if (vc4_debug & VC4_DEBUG_SHADERDB) {
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id,
                        cycles);
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}