src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

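/* Appends an instruction to the list that will later be scheduled into
 * c->qpu_insts by qpu_schedule_instructions().
 */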
static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
        q->inst = inst;
        list_addtail(&q->link, &c->qpu_inst_list);
}

static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)c->qpu_inst_list.prev;
        return &q->inst;
}

static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_SMALL_IMM) {
                        return false;
                } else {
                        if (src->mux == QPU_MUX_A)
                                src->mux = QPU_MUX_B;
                        else
                                src->mux = QPU_MUX_A;
                        return true;
                }

        default:
                return false;
        }
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead. We reserve ra31/rb31 for this purpose.
 */
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1,
                     struct qinst *inst, uint64_t *unpack)
{
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

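        /* Conflicts only arise between two regfile reads of the same file
         * at different addresses: accumulator muxes (r0-r5) don't consume a
         * raddr, and two identical reads can share one.
         */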
        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

        if (mux0 == QPU_MUX_A) {
                /* Make sure we use the same type of MOV as the instruction,
                 * in case of unpacks.
                 */
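                /* (FMAX(x, x) is the float MOV idiom: the regfile-A unpack
                 * modes convert to float only when consumed by a float ALU
                 * op, so the MOV has to match the consuming instruction's
                 * type.)
                 */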
                if (qir_is_float_input(inst))
                        queue(c, qpu_a_FMAX(qpu_rb(31), *src0, *src0));
                else
                        queue(c, qpu_a_MOV(qpu_rb(31), *src0));

                /* If we had an unpack on this A-file source, we need to put
                 * it into this MOV, not into the later move from regfile B.
                 */
                if (inst->src[0].pack) {
                        *last_inst(c) |= *unpack;
                        *unpack = 0;
                }
                *src0 = qpu_rb(31);
        } else {
                queue(c, qpu_a_MOV(qpu_ra(31), *src0));
                *src0 = qpu_ra(31);
        }
}

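/* Applies the destination pack mode of the QIR instruction to the QPU
 * instruction just queued.  The PM bit selects between MUL-unit packing and
 * regfile-A packing, so a MUL-op pack must set PM (and can't be combined
 * with a regfile-A unpack), while an add-op pack needs an A-file
 * destination (WS clear).
 */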
static void
set_last_dst_pack(struct vc4_compile *c, struct qinst *inst)
{
        bool had_pm = *last_inst(c) & QPU_PM;
        bool had_ws = *last_inst(c) & QPU_WS;
        uint32_t unpack = QPU_GET_FIELD(*last_inst(c), QPU_UNPACK);

        if (!inst->dst.pack)
                return;

        *last_inst(c) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);

        if (qir_is_mul(inst)) {
                assert(!unpack || had_pm);
                *last_inst(c) |= QPU_PM;
        } else {
                assert(!unpack || !had_pm);
                assert(!had_ws); /* dst must be a-file to pack. */
        }
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        int last_vpm_read_index = -1;

        list_inithead(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
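                /* The 0x00001a00 setup value requests horizontal, 32-bit,
                 * stride-1 VPM reads starting at vpm_read_offset, and a NUM
                 * field (bits 23:20) of 0 encodes 16 elements, which is why
                 * num_entries is masked with 0xf.  (Field layout per the
                 * VC4 architecture reference's VPM generic block read
                 * setup.)
                 */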
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        list_for_each_entry(struct qinst, qinst, &c->instructions, link) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

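                /* 1:1 translations from QIR ALU ops to QPU add/mul ALU
                 * ops.
                 */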
                static const struct {
                        uint32_t op;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name}
#define M(name) [QOP_##name] = {QPU_M_##name}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(V8MULD),
                        M(V8MIN),
                        M(V8MAX),
                        M(V8ADDS),
                        M(V8SUBS),
                        M(MUL24),

                        /* If we replicate src[0] out to src[1], this works
                         * out the same as a MOV.
                         */
                        [QOP_MOV] = { QPU_A_OR },
                        [QOP_FMOV] = { QPU_A_FMAX },
                        [QOP_MMOV] = { QPU_M_V8MIN },
                };

                uint64_t unpack = 0;
                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                if (qinst->src[i].pack) {
                                        assert(!unpack ||
                                               unpack == qinst->src[i].pack);
                                        unpack = QPU_SET_FIELD(qinst->src[i].pack,
                                                               QPU_UNPACK);
                                        if (src[i].mux == QPU_MUX_R4)
                                                unpack |= QPU_PM;
                                }
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
                case QOP_SEL_X_0_CS:
                case QOP_SEL_X_0_CC:
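                        /* Select-or-zero: write src[0] under the requested
                         * condition, then zero dst under its inverse.
                         */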
                        queue(c, qpu_a_MOV(dst, src[0]) | unpack);
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

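                        /* The QPU condition codes come in set/clear pairs,
                         * so flipping the low bit selects the inverse
                         * condition.
                         */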
                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                case QOP_SEL_X_Y_CS:
                case QOP_SEL_X_Y_CC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        if (qinst->src[0].pack)
                                *(last_inst(c)) |= unpack;
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        if (qinst->src[1].pack)
                                *(last_inst(c)) |= unpack;
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
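                        /* Writing the operand to one of the SFU registers
                         * starts the operation, and the result lands in r4
                         * two instructions later; the scheduler accounts
                         * for that latency.
                         */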
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]) | unpack);
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]) | unpack);
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]) | unpack);
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]) | unpack);
                                break;
                        default:
                                abort();
                        }

                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_MS_MASK:
                        src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(c, qpu_a_AND(qpu_ra(QPU_W_MS_FLAGS),
                                           src[0], src[1]) | unpack);
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_DISCARD_SETUP:
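                        /* Set the condition flags from the discard test so
                         * that the TLB Z/color writes below can be
                         * predicated on them.
                         */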
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]) | unpack);
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        assert(!unpack);
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP),
                                           src[0]) | unpack);
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z),
                                           src[0]) | unpack);
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
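                        /* The COLOR_LOAD signal returns the current TLB
                         * color in r4.
                         */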
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]) | unpack);
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_WRITE_MS:
                        queue(c, qpu_a_MOV(qpu_tlbc_ms(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
                        break;

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
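                        /* S/T/R/B are consecutive TMU0 write addresses; the
                         * write to the S register is what fires off the
                         * texture lookup.
                         */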
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]) | unpack);
                        break;

                case QOP_TEX_DIRECT:
                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S),
                                           src[0], src[1]) | unpack);
                        break;

                case QOP_TEX_RESULT:
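                        /* The LOAD_TMU0 signal returns the texture result
                         * in r4.
                         */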
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);
                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* Skip emitting the MOV if it's a no-op. */
                        if (qir_is_raw_mov(qinst) &&
                            dst.mux == src[0].mux && dst.addr == src[0].addr) {
                                break;
                        }

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        if (qir_is_mul(qinst)) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]) | unpack);
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]) | unpack);
                        }
                        set_last_dst_pack(c, qinst);

                        break;
                }

                if (qinst->sf) {
                        assert(!qir_is_multi_instruction(qinst));
                        *last_inst(c) |= QPU_SF;
                }
        }

        uint32_t cycles = qpu_schedule_instructions(c);
        uint32_t inst_count_at_schedule_time = c->qpu_inst_count;

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
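        /* The PROG_END signal has two delay slots that still execute, so
         * pad them with NOPs.
         */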
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
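                /* Fragment shaders must release the tile buffer scoreboard
                 * lock at thread end so other QPUs can access this tile.
                 */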
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

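        /* Count the post-scheduling fixup instructions at one cycle each. */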
        cycles += c->qpu_inst_count - inst_count_at_schedule_time;

        if (vc4_debug & VC4_DEBUG_SHADERDB) {
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id,
                        cycles);
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}