/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"

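/* Dumps the raw 64-bit encoding of each emitted instruction alongside its
 * disassembly (used when VC4_DEBUG_QPU is set).
 */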
static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
        fprintf(stderr, "\n");
}

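/* Appends an instruction to the tail of the compile's QPU instruction
 * list.
 */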
static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
        q->inst = inst;
        list_addtail(&q->link, &c->qpu_inst_list);
}

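/* Returns a pointer to the most recently queued instruction, so the helpers
 * below can patch condition codes, pack modes, and signals into it after
 * the fact.
 */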
static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)c->qpu_inst_list.prev;
        return &q->inst;
}

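/* Condition codes are encoded separately for the add and mul pipelines, so
 * which field to patch depends on which pipeline the operation was emitted
 * on.
 */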
static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

static void
set_last_cond_mul(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_mul(*last_inst(c), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_SMALL_IMM) {
                        return false;
                } else {
                        if (src->mux == QPU_MUX_A)
                                src->mux = QPU_MUX_B;
                        else
                                src->mux = QPU_MUX_A;
                        return true;
                }

        default:
                return false;
        }
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead.  We reserve ra31/rb31 for this purpose.  For
 * example, if src0 is ra1 and src1 is ra2, only one raddr_a field can be
 * encoded, so src0 is first copied through rb31.
 */
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1,
                     struct qinst *inst, uint64_t *unpack)
{
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

        if (mux0 == QPU_MUX_A) {
                /* Make sure we use the same type of MOV as the instruction,
                 * in case of unpacks.
                 */
                if (qir_is_float_input(inst))
                        queue(c, qpu_a_FMAX(qpu_rb(31), *src0, *src0));
                else
                        queue(c, qpu_a_MOV(qpu_rb(31), *src0));

                /* If we had an unpack on this A-file source, we need to put
                 * it into this MOV, not into the later move from regfile B.
                 */
                if (inst->src[0].pack) {
                        *last_inst(c) |= *unpack;
                        *unpack = 0;
                }
                *src0 = qpu_rb(31);
        } else {
                queue(c, qpu_a_MOV(qpu_ra(31), *src0));
                *src0 = qpu_ra(31);
        }
}

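/* Folds the destination's pack mode into the last queued instruction.  MUL
 * pipeline packs are selected by setting the PM bit; ADD pipeline packs go
 * through regfile A, so the destination must not be write-swapped to file
 * B.  The asserts check that an existing unpack's PM type isn't
 * contradicted.
 */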
static void
set_last_dst_pack(struct vc4_compile *c, struct qinst *inst)
{
        bool had_pm = *last_inst(c) & QPU_PM;
        bool had_ws = *last_inst(c) & QPU_WS;
        uint32_t unpack = QPU_GET_FIELD(*last_inst(c), QPU_UNPACK);

        if (!inst->dst.pack)
                return;

        *last_inst(c) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);

        if (qir_is_mul(inst)) {
                assert(!unpack || had_pm);
                *last_inst(c) |= QPU_PM;
        } else {
                assert(!unpack || !had_pm);
                assert(!had_ws); /* dst must be a-file to pack. */
        }
}

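/* SFU results and TLB/TMU loads arrive in the accumulator r4.  Copy the
 * value out to the allocated destination, or emit a MOV to the NOP address
 * if the instruction was only for its flags (qinst->sf), so the SF bit has
 * something to land on.
 */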
static void
handle_r4_qpu_write(struct vc4_compile *c, struct qinst *qinst,
                    struct qpu_reg dst)
{
        if (dst.mux != QPU_MUX_R4)
                queue(c, qpu_a_MOV(dst, qpu_r4()));
        else if (qinst->sf)
                queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
}

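/* Lowers the register-allocated QIR instruction list to QPU instructions,
 * schedules them, and fixes up the thread-end sequence to respect the
 * hardware's restrictions on a program's final instructions.
 */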
void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        int last_vpm_read_index = -1;

        list_inithead(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

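        /* Translate each QIR instruction in order, patching condition
         * codes, pack modes, and signals onto the queued encodings as we
         * go.
         */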
        list_for_each_entry(struct qinst, qinst, &c->instructions, link) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

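                /* ALU ops that translate 1:1 from QIR to QPU; everything
                 * else is handled in the switch below.
                 */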
                static const struct {
                        uint32_t op;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name}
#define M(name) [QOP_##name] = {QPU_M_##name}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(V8MULD),
                        M(V8MIN),
                        M(V8MAX),
                        M(V8ADDS),
                        M(V8SUBS),
                        M(MUL24),

                        /* If we replicate src[0] out to src[1], this works
                         * out the same as a MOV.
                         */
                        [QOP_MOV] = { QPU_A_OR },
                        [QOP_FMOV] = { QPU_A_FMAX },
                        [QOP_MMOV] = { QPU_M_V8MIN },
                };

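                /* Gather the QPU encoding of each source, accumulating any
                 * unpack mode along the way (at most one distinct unpack
                 * per instruction; unpacks of r4 use the PM path).
                 */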
                uint64_t unpack = 0;
                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                        case QFILE_LOAD_IMM:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                if (qinst->src[i].pack) {
                                        assert(!unpack ||
                                               unpack == qinst->src[i].pack);
                                        unpack = QPU_SET_FIELD(qinst->src[i].pack,
                                                               QPU_UNPACK);
                                        if (src[i].mux == QPU_MUX_R4)
                                                unpack |= QPU_PM;
                                }
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;

                        case QFILE_FRAG_X:
                                src[i] = qpu_ra(QPU_R_XY_PIXEL_COORD);
                                break;
                        case QFILE_FRAG_Y:
                                src[i] = qpu_rb(QPU_R_XY_PIXEL_COORD);
                                break;
                        case QFILE_FRAG_REV_FLAG:
                                src[i] = qpu_rb(QPU_R_MS_REV_FLAGS);
                                break;

                        case QFILE_TLB_COLOR_WRITE:
                        case QFILE_TLB_COLOR_WRITE_MS:
                        case QFILE_TLB_Z_WRITE:
                        case QFILE_TLB_STENCIL_SETUP:
                                unreachable("bad qir src file");
                        }
                }

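                /* Map the destination file to a QPU write address. */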
                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;

                case QFILE_TLB_COLOR_WRITE:
                        dst = qpu_tlbc();
                        break;

                case QFILE_TLB_COLOR_WRITE_MS:
                        dst = qpu_tlbc_ms();
                        break;

                case QFILE_TLB_Z_WRITE:
                        dst = qpu_ra(QPU_W_TLB_Z);
                        break;

                case QFILE_TLB_STENCIL_SETUP:
                        dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP);
                        break;

                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                case QFILE_LOAD_IMM:
                case QFILE_FRAG_X:
                case QFILE_FRAG_Y:
                case QFILE_FRAG_REV_FLAG:
                        assert(!"not reached");
                        break;
                }

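                /* Emit the QPU instruction(s) for this op.  The specially
                 * handled ops manage their own condition codes; the default
                 * arm covers the table-translated ALU ops.
                 */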
                bool handled_qinst_cond = false;

                switch (qinst->op) {
                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
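                        /* Writing the operand to an SFU register starts the
                         * operation, and the result lands in r4 after a few
                         * instructions of latency that the scheduler covers.
                         */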
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]) | unpack);
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]) | unpack);
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]) | unpack);
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]) | unpack);
                                break;
                        default:
                                abort();
                        }

                        handle_r4_qpu_write(c, qinst, dst);

                        break;

                case QOP_LOAD_IMM:
                        assert(qinst->src[0].file == QFILE_LOAD_IMM);
                        queue(c, qpu_load_imm_ui(dst, qinst->src[0].index));
                        break;

                case QOP_MS_MASK:
                        src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(c, qpu_a_AND(qpu_ra(QPU_W_MS_FLAGS),
                                           src[0], src[1]) | unpack);
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

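                /* The TLB color read is requested by a signal on an
                 * otherwise-NOP instruction, and the result arrives in r4.
                 */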
                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);
                        handle_r4_qpu_write(c, qinst, dst);
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
                        break;

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]) | unpack);
                        break;

                case QOP_TEX_DIRECT:
                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S),
                                           src[0], src[1]) | unpack);
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);
                        handle_r4_qpu_write(c, qinst, dst);
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* Skip emitting the MOV if it's a no-op. */
                        if (qir_is_raw_mov(qinst) &&
                            dst.mux == src[0].mux && dst.addr == src[0].addr) {
                                break;
                        }

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        if (qir_is_mul(qinst)) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]) | unpack);
                                set_last_cond_mul(c, qinst->cond);
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]) | unpack);
                                set_last_cond_add(c, qinst->cond);
                        }
                        handled_qinst_cond = true;
                        set_last_dst_pack(c, qinst);

                        break;
                }

                assert(qinst->cond == QPU_COND_ALWAYS ||
                       handled_qinst_cond);

                if (qinst->sf)
                        *last_inst(c) |= QPU_SF;
        }

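        /* Scheduling may pair and reorder instructions; the thread-end
         * fixups appended below count directly toward the cycle estimate.
         */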
        uint32_t cycles = qpu_schedule_instructions(c);
        uint32_t inst_count_at_schedule_time = c->qpu_inst_count;

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        /* Make sure there's no existing signal set (like for a small
         * immediate)
         */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_SIG) != QPU_SIG_NONE) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

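        /* Set the thread-end signal on the last instruction.  The two NOPs
         * appended after it are the delay slots that still execute once
         * PROG_END is signaled.
         */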
        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

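        /* Fragment shaders release the tile buffer scoreboard lock from the
         * final delay-slot instruction.
         */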
        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        cycles += c->qpu_inst_count - inst_count_at_schedule_time;

        if (vc4_debug & VC4_DEBUG_SHADERDB) {
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id,
                        cycles);
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}