/* mesa.git: src/gallium/drivers/vc4/vc4_qpu_emit.c */

/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

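/*
 * Instructions are built up in a list and patched after the fact: queue()
 * appends a raw 64-bit QPU instruction, and last_inst() returns a pointer to
 * the most recently queued one so that condition codes, signals, and
 * pack/unpack fields can be ORed in by the helpers below.
 */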
static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
        q->inst = inst;
        list_addtail(&q->link, &c->qpu_inst_list);
}

static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)c->qpu_inst_list.prev;
        return &q->inst;
}

static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

static void
set_last_cond_mul(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_mul(*last_inst(c), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
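/*
 * Illustrative example (not from the original source): if an ALU op reads a
 * uniform through mux A while another A-file temporary already occupies
 * raddr_a, the uniform read can simply be reissued on mux B, since
 * QPU_R_UNIF and QPU_R_VARY decode the same way in either raddr field.
 */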
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_SMALL_IMM) {
                        return false;
                } else {
                        if (src->mux == QPU_MUX_A)
                                src->mux = QPU_MUX_B;
                        else
                                src->mux = QPU_MUX_A;
                        return true;
                }

        default:
                return false;
        }
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead. We reserve ra31/rb31 for this purpose.
 */
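/*
 * A sketch of the conflict (illustrative): given FADD dst, ra1, ra2, both
 * operands need the single raddr_a field.  We emit "MOV rb31, ra1" (an FMAX
 * of the source with itself when the instruction takes float inputs, so any
 * unpack mode stays float) and then encode FADD dst, rb31, ra2.
 */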
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1,
                     struct qinst *inst, uint64_t *unpack)
{
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

        if (mux0 == QPU_MUX_A) {
                /* Make sure we use the same type of MOV as the instruction,
                 * in case of unpacks.
                 */
                if (qir_is_float_input(inst))
                        queue(c, qpu_a_FMAX(qpu_rb(31), *src0, *src0));
                else
                        queue(c, qpu_a_MOV(qpu_rb(31), *src0));

                /* If we had an unpack on this A-file source, we need to put
                 * it into this MOV, not into the later move from regfile B.
                 */
                if (inst->src[0].pack) {
                        *last_inst(c) |= *unpack;
                        *unpack = 0;
                }
                *src0 = qpu_rb(31);
        } else {
                queue(c, qpu_a_MOV(qpu_ra(31), *src0));
                *src0 = qpu_ra(31);
        }
}

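/*
 * Applies the destination pack mode of a QIR instruction to the QPU
 * instruction just queued.  MUL-unit packs are selected with the PM bit;
 * A-file regfile packs require PM clear and the write to land in the A file
 * (WS unset), which the asserts below check against any unpack already
 * encoded in the instruction.
 */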
static void
set_last_dst_pack(struct vc4_compile *c, struct qinst *inst)
{
        bool had_pm = *last_inst(c) & QPU_PM;
        bool had_ws = *last_inst(c) & QPU_WS;
        uint32_t unpack = QPU_GET_FIELD(*last_inst(c), QPU_UNPACK);

        if (!inst->dst.pack)
                return;

        *last_inst(c) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);

        if (qir_is_mul(inst)) {
                assert(!unpack || had_pm);
                *last_inst(c) |= QPU_PM;
        } else {
                assert(!unpack || !had_pm);
                assert(!had_ws); /* dst must be a-file to pack. */
        }
}

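/*
 * Results of SFU math, TMU fetches, and TLB color loads all land in the r4
 * accumulator.  If the destination isn't r4 itself, copy the value out; if
 * it is r4 but the instruction wants condition flags (qinst->sf), emit a
 * MOV to the NOP address so there is an instruction reading r4 for QPU_SF
 * to be applied to.
 */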
static void
handle_r4_qpu_write(struct vc4_compile *c, struct qinst *qinst,
                    struct qpu_reg dst)
{
        if (dst.mux != QPU_MUX_R4)
                queue(c, qpu_a_MOV(dst, qpu_r4()));
        else if (qinst->sf)
                queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
}

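/*
 * Lowers the QIR instruction list to QPU instructions: register allocation,
 * per-opcode translation, instruction scheduling, and the mandatory
 * end-of-program sequence.
 */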
void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        int last_vpm_read_index = -1;

        list_inithead(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
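                /* Sketch of the VPM read setup word queued below, per our
                 * reading of the VideoCore IV reference (treat the field
                 * breakdown as an assumption): 0x00001a00 selects
                 * horizontal, 32-bit-wide accesses with a stride of 1, the
                 * vertex count goes in bits 23:20 (hence the << 20), and
                 * vpm_read_offset fills the low address byte.  The write
                 * setup word queued after the loop uses the same
                 * horizontal/32-bit/stride-1 layout.
                 */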
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        list_for_each_entry(struct qinst, qinst, &c->instructions, link) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                static const struct {
                        uint32_t op;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name}
#define M(name) [QOP_##name] = {QPU_M_##name}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(V8MULD),
                        M(V8MIN),
                        M(V8MAX),
                        M(V8ADDS),
                        M(V8SUBS),
                        M(MUL24),

                        /* If we replicate src[0] out to src[1], these work
                         * out the same as a MOV: OR(x, x) == x bitwise,
                         * FMAX(x, x) == x for floats, and V8MIN(x, x) == x
                         * per byte.
                         */
                        [QOP_MOV] = { QPU_A_OR },
                        [QOP_FMOV] = { QPU_A_FMAX },
                        [QOP_MMOV] = { QPU_M_V8MIN },
                };

                uint64_t unpack = 0;
                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                if (qinst->src[i].pack) {
                                        assert(!unpack ||
                                               unpack == qinst->src[i].pack);
                                        unpack = QPU_SET_FIELD(qinst->src[i].pack,
                                                               QPU_UNPACK);
                                        if (src[i].mux == QPU_MUX_R4)
                                                unpack |= QPU_PM;
                                }
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                        assert(!"not reached");
                        break;
                }

                bool handled_qinst_cond = false;

                switch (qinst->op) {
                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]) | unpack);
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]) | unpack);
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]) | unpack);
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]) | unpack);
                                break;
                        default:
                                abort();
                        }

                        handle_r4_qpu_write(c, qinst, dst);

                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_MS_MASK:
                        src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(c, qpu_a_AND(qpu_ra(QPU_W_MS_FLAGS),
                                           src[0], src[1]) | unpack);
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        assert(!unpack);
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP),
                                           src[0]) | unpack);
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z),
                                           src[0]) | unpack);
                        set_last_cond_add(c, qinst->cond);
                        handled_qinst_cond = true;
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);
                        handle_r4_qpu_write(c, qinst, dst);
                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]) | unpack);
                        set_last_cond_add(c, qinst->cond);
                        handled_qinst_cond = true;
                        break;

                case QOP_TLB_COLOR_WRITE_MS:
                        queue(c, qpu_a_MOV(qpu_tlbc_ms(), src[0]));
                        set_last_cond_add(c, qinst->cond);
                        handled_qinst_cond = true;
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
                        break;

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]) | unpack);
                        break;

                case QOP_TEX_DIRECT:
                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S),
                                           src[0], src[1]) | unpack);
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);
                        handle_r4_qpu_write(c, qinst, dst);
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* Skip emitting the MOV if it's a no-op. */
                        if (qir_is_raw_mov(qinst) &&
                            dst.mux == src[0].mux && dst.addr == src[0].addr) {
                                break;
                        }

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        if (qir_is_mul(qinst)) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]) | unpack);
                                set_last_cond_mul(c, qinst->cond);
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]) | unpack);
                                set_last_cond_add(c, qinst->cond);
                        }
                        handled_qinst_cond = true;
                        set_last_dst_pack(c, qinst);

                        break;
                }

                assert(qinst->cond == QPU_COND_ALWAYS ||
                       handled_qinst_cond);

                if (qinst->sf)
                        *last_inst(c) |= QPU_SF;
        }

        uint32_t cycles = qpu_schedule_instructions(c);
        uint32_t inst_count_at_schedule_time = c->qpu_inst_count;

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
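        /* The two instructions after the PROG_END signal still execute as
         * delay slots before the thread ends, so pad with NOPs; for fragment
         * shaders, the scoreboard unlock below is folded into the final one.
         */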
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        cycles += c->qpu_inst_count - inst_count_at_schedule_time;

        if (vc4_debug & VC4_DEBUG_SHADERDB) {
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id,
                        cycles);
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}