vc4: Add whitespace after each program stage dump.
[mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_emit.c
1 /*
2 * Copyright © 2014 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <inttypes.h>
25
26 #include "vc4_context.h"
27 #include "vc4_qir.h"
28 #include "vc4_qpu.h"
29 #include "util/ralloc.h"
30
31 static void
32 vc4_dump_program(struct vc4_compile *c)
33 {
34 fprintf(stderr, "%s prog %d/%d QPU:\n",
35 qir_get_stage_name(c->stage),
36 c->program_id, c->variant_id);
37
38 for (int i = 0; i < c->qpu_inst_count; i++) {
39 fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
40 vc4_qpu_disasm(&c->qpu_insts[i], 1);
41 fprintf(stderr, "\n");
42 }
43 fprintf(stderr, "\n");
44 }
45
46 static void
47 queue(struct vc4_compile *c, uint64_t inst)
48 {
49 struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
50 q->inst = inst;
51 list_addtail(&q->link, &c->qpu_inst_list);
52 }
53
54 static uint64_t *
55 last_inst(struct vc4_compile *c)
56 {
57 struct queued_qpu_inst *q =
58 (struct queued_qpu_inst *)c->qpu_inst_list.prev;
59 return &q->inst;
60 }
61
/* Patches the condition code of the last queued instruction's add op. */
static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        uint64_t *inst = last_inst(c);

        *inst = qpu_set_cond_add(*inst, cond);
}
67
/* Patches the condition code of the last queued instruction's mul op. */
static void
set_last_cond_mul(struct vc4_compile *c, uint32_t cond)
{
        uint64_t *inst = last_inst(c);

        *inst = qpu_set_cond_mul(*inst, cond);
}
73
74 /**
75 * Some special registers can be read from either file, which lets us resolve
76 * raddr conflicts without extra MOVs.
77 */
78 static bool
79 swap_file(struct qpu_reg *src)
80 {
81 switch (src->addr) {
82 case QPU_R_UNIF:
83 case QPU_R_VARY:
84 if (src->mux == QPU_MUX_SMALL_IMM) {
85 return false;
86 } else {
87 if (src->mux == QPU_MUX_A)
88 src->mux = QPU_MUX_B;
89 else
90 src->mux = QPU_MUX_A;
91 return true;
92 }
93
94 default:
95 return false;
96 }
97 }
98
99 /**
100 * This is used to resolve the fact that we might register-allocate two
101 * different operands of an instruction to the same physical register file
102 * even though instructions have only one field for the register file source
103 * address.
104 *
105 * In that case, we need to move one to a temporary that can be used in the
106 * instruction, instead. We reserve ra31/rb31 for this purpose.
107 */
108 static void
109 fixup_raddr_conflict(struct vc4_compile *c,
110 struct qpu_reg dst,
111 struct qpu_reg *src0, struct qpu_reg *src1,
112 struct qinst *inst, uint64_t *unpack)
113 {
114 uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
115 uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;
116
117 if (mux0 <= QPU_MUX_R5 ||
118 mux0 != mux1 ||
119 (src0->addr == src1->addr &&
120 src0->mux == src1->mux)) {
121 return;
122 }
123
124 if (swap_file(src0) || swap_file(src1))
125 return;
126
127 if (mux0 == QPU_MUX_A) {
128 /* Make sure we use the same type of MOV as the instruction,
129 * in case of unpacks.
130 */
131 if (qir_is_float_input(inst))
132 queue(c, qpu_a_FMAX(qpu_rb(31), *src0, *src0));
133 else
134 queue(c, qpu_a_MOV(qpu_rb(31), *src0));
135
136 /* If we had an unpack on this A-file source, we need to put
137 * it into this MOV, not into the later move from regfile B.
138 */
139 if (inst->src[0].pack) {
140 *last_inst(c) |= *unpack;
141 *unpack = 0;
142 }
143 *src0 = qpu_rb(31);
144 } else {
145 queue(c, qpu_a_MOV(qpu_ra(31), *src0));
146 *src0 = qpu_ra(31);
147 }
148 }
149
150 static void
151 set_last_dst_pack(struct vc4_compile *c, struct qinst *inst)
152 {
153 bool had_pm = *last_inst(c) & QPU_PM;
154 bool had_ws = *last_inst(c) & QPU_WS;
155 uint32_t unpack = QPU_GET_FIELD(*last_inst(c), QPU_UNPACK);
156
157 if (!inst->dst.pack)
158 return;
159
160 *last_inst(c) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);
161
162 if (qir_is_mul(inst)) {
163 assert(!unpack || had_pm);
164 *last_inst(c) |= QPU_PM;
165 } else {
166 assert(!unpack || !had_pm);
167 assert(!had_ws); /* dst must be a-file to pack. */
168 }
169 }
170
171 static void
172 handle_r4_qpu_write(struct vc4_compile *c, struct qinst *qinst,
173 struct qpu_reg dst)
174 {
175 if (dst.mux != QPU_MUX_R4)
176 queue(c, qpu_a_MOV(dst, qpu_r4()));
177 else if (qinst->sf)
178 queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
179 }
180
/**
 * Translates the compile's QIR instruction list into a final, scheduled QPU
 * program: allocates registers, emits per-stage setup, lowers each qinst to
 * QPU encodings, then schedules and patches in the program-end sequence.
 */
void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        /* Tracks the highest VPM index read so far, to assert reads stay
         * in order.
         */
        int last_vpm_read_index = -1;

        list_inithead(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        /* Lower each QIR instruction to one or more queued QPU
         * instructions.
         */
        list_for_each_entry(struct qinst, qinst, &c->instructions, link) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                /* Table mapping simple 1:1 QIR ALU ops to their QPU add- or
                 * mul-unit opcodes; ops needing special handling are dealt
                 * with in the switch below instead.
                 */
                static const struct {
                        uint32_t op;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name}
#define M(name) [QOP_##name] = {QPU_M_##name}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(V8MULD),
                        M(V8MIN),
                        M(V8MAX),
                        M(V8ADDS),
                        M(V8SUBS),
                        M(MUL24),

                        /* If we replicate src[0] out to src[1], this works
                         * out the same as a MOV.
                         */
                        [QOP_MOV] = { QPU_A_OR },
                        [QOP_FMOV] = { QPU_A_FMAX },
                        [QOP_MMOV] = { QPU_M_V8MIN },
                };

                /* Accumulated QPU_UNPACK (and possibly QPU_PM) bits to OR
                 * into the emitted instruction.
                 */
                uint64_t unpack = 0;
                struct qpu_reg src[4];
                /* Resolve each QIR source to a physical QPU read source. */
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                if (qinst->src[i].pack) {
                                        /* Only one unpack mode can apply per
                                         * instruction; all packed sources
                                         * must agree.
                                         */
                                        assert(!unpack ||
                                               unpack == qinst->src[i].pack);
                                        unpack = QPU_SET_FIELD(qinst->src[i].pack,
                                                               QPU_UNPACK);
                                        if (src[i].mux == QPU_MUX_R4)
                                                unpack |= QPU_PM;
                                }
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                /* VPM reads must happen in increasing
                                 * address order.
                                 */
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;

                        case QFILE_FRAG_X:
                                src[i] = qpu_ra(QPU_R_XY_PIXEL_COORD);
                                break;
                        case QFILE_FRAG_Y:
                                src[i] = qpu_rb(QPU_R_XY_PIXEL_COORD);
                                break;
                        case QFILE_FRAG_REV_FLAG:
                                src[i] = qpu_rb(QPU_R_MS_REV_FLAGS);
                                break;

                        case QFILE_TLB_COLOR_WRITE:
                        case QFILE_TLB_COLOR_WRITE_MS:
                        case QFILE_TLB_Z_WRITE:
                        case QFILE_TLB_STENCIL_SETUP:
                                /* TLB files are write-only destinations. */
                                unreachable("bad qir src file");
                        }
                }

                /* Resolve the QIR destination to a physical QPU write
                 * address.
                 */
                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;

                case QFILE_TLB_COLOR_WRITE:
                        dst = qpu_tlbc();
                        break;

                case QFILE_TLB_COLOR_WRITE_MS:
                        dst = qpu_tlbc_ms();
                        break;

                case QFILE_TLB_Z_WRITE:
                        dst = qpu_ra(QPU_W_TLB_Z);
                        break;

                case QFILE_TLB_STENCIL_SETUP:
                        dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP);
                        break;

                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                case QFILE_FRAG_X:
                case QFILE_FRAG_Y:
                case QFILE_FRAG_REV_FLAG:
                        /* These files are read-only sources. */
                        assert(!"not reached");
                        break;
                }

                /* Set when the emitted code already applied qinst->cond;
                 * checked below so conditional ops aren't silently dropped.
                 */
                bool handled_qinst_cond = false;

                switch (qinst->op) {
                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        /* SFU ops: write the operand to the matching SFU
                         * register, and the result shows up in r4.
                         */
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]) | unpack);
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]) | unpack);
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]) | unpack);
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]) | unpack);
                                break;
                        default:
                                abort();
                        }

                        handle_r4_qpu_write(c, qinst, dst);

                        break;

                case QOP_MS_MASK:
                        src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(c, qpu_a_AND(qpu_ra(QPU_W_MS_FLAGS),
                                           src[0], src[1]) | unpack);
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);
                        /* The loaded color lands in r4. */
                        handle_r4_qpu_write(c, qinst, dst);
                        break;

                case QOP_VARY_ADD_C:
                        /* r5 holds the varying's C coefficient. */
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
                        break;

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        /* TMU coordinate registers are consecutive, in the
                         * same order as these ops.
                         */
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]) | unpack);
                        break;

                case QOP_TEX_DIRECT:
                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S),
                                           src[0], src[1]) | unpack);
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);
                        /* The texture result lands in r4. */
                        handle_r4_qpu_write(c, qinst, dst);
                        break;

                default:
                        /* Simple ALU op: look up the QPU opcode in the
                         * translate table.
                         */
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* Skip emitting the MOV if it's a no-op. */
                        if (qir_is_raw_mov(qinst) &&
                            dst.mux == src[0].mux && dst.addr == src[0].addr) {
                                break;
                        }

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        if (qir_is_mul(qinst)) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]) | unpack);
                                set_last_cond_mul(c, qinst->cond);
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]) | unpack);
                                set_last_cond_add(c, qinst->cond);
                        }
                        handled_qinst_cond = true;
                        set_last_dst_pack(c, qinst);

                        break;
                }

                /* Conditional qinsts must have been lowered through a path
                 * that applied the condition.
                 */
                assert(qinst->cond == QPU_COND_ALWAYS ||
                       handled_qinst_cond);

                if (qinst->sf)
                        *last_inst(c) |= QPU_SF;
        }

        /* Schedule the queued instructions into c->qpu_insts. */
        uint32_t cycles = qpu_schedule_instructions(c);
        uint32_t inst_count_at_schedule_time = c->qpu_inst_count;

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        /* Mark the program end, followed by the two delay-slot NOPs the
         * PROG_END signal requires.
         */
        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                /* Fragment shaders must release the tile scoreboard on
                 * their final instruction.
                 */
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        /* Account for the NOPs added after scheduling. */
        cycles += c->qpu_inst_count - inst_count_at_schedule_time;

        if (vc4_debug & VC4_DEBUG_SHADERDB) {
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id,
                        cycles);
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}