vc4: Reduce MOV special-casing in QIR-to-QPU.
[mesa.git] src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
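        /* Debug dump: print each emitted instruction as its raw 64-bit
         * encoding followed by its disassembly.
         */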
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

static void
queue(struct vc4_compile *c, uint64_t inst)
{
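        /* Append an instruction to the tail of the instruction list being
         * built for this program.
         */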
        struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
        q->inst = inst;
        list_addtail(&q->link, &c->qpu_inst_list);
}

static uint64_t *
last_inst(struct vc4_compile *c)
{
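        /* Return a pointer to the most recently queued instruction, so
         * callers can patch in condition, signal, or pack bits.
         */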
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)c->qpu_inst_list.prev;
        return &q->inst;
}

static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
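        /* Make the previously queued add-unit instruction conditional. */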
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_SMALL_IMM) {
                        return false;
                } else {
                        if (src->mux == QPU_MUX_A)
                                src->mux = QPU_MUX_B;
                        else
                                src->mux = QPU_MUX_A;
                        return true;
                }

        default:
                return false;
        }
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead. We reserve ra31/rb31 for this purpose.
 */
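/*
 * For example (illustrative pseudo-assembly, not actual disassembler
 * output): both operands of
 *
 *     fadd dst, ra1, ra2
 *
 * live in register file A, which can't be encoded, so one of them gets
 * copied through the reserved rb31 first:
 *
 *     mov  rb31, ra1
 *     fadd dst, rb31, ra2
 */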
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1)
{
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

        if (mux0 == QPU_MUX_A) {
                queue(c, qpu_a_MOV(qpu_rb(31), *src0));
                *src0 = qpu_rb(31);
        } else {
                queue(c, qpu_a_MOV(qpu_ra(31), *src0));
                *src0 = qpu_ra(31);
        }
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        int last_vpm_read_index = -1;
        /* Map from the QIR ops enum order to QPU unpack bits. */
        static const uint32_t unpack_map[] = {
                QPU_UNPACK_8A,
                QPU_UNPACK_8B,
                QPU_UNPACK_8C,
                QPU_UNPACK_8D,
                QPU_UNPACK_16A,
                QPU_UNPACK_16B,
        };

        list_inithead(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
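                        /* 0x1a00 appears to select horizontal, 32-bit,
                         * stride-1 reads in the VPM generic block read
                         * setup encoding; the base address and the entry
                         * count (bits 20-23) fill in the rest.
                         */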
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        list_for_each_entry(struct qinst, qinst, &c->instructions, link) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                static const struct {
                        uint32_t op;
                } translate[] = {
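                        /* QIR ALU ops that map directly onto QPU add-unit
                         * (A) or mul-unit (M) opcodes.
                         */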
#define A(name) [QOP_##name] = {QPU_A_##name}
#define M(name) [QOP_##name] = {QPU_M_##name}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(V8MULD),
                        M(V8MIN),
                        M(V8MAX),
                        M(V8ADDS),
                        M(V8SUBS),
                        M(MUL24),

                        /* If we replicate src[0] out to src[1], this works
                         * out the same as a MOV.
                         */
                        [QOP_MOV] = { QPU_A_OR },
                };

                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
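                /* The conditional select ops lower to a pair of
                 * conditionally-executed writes: one under the requested
                 * condition and one under its complement (the ^1 below
                 * flips ZS<->ZC and NS<->NC).  Roughly, QOP_SEL_X_0_ZS
                 * becomes:
                 *
                 *     mov.zs dst, src0
                 *     xor.zc dst, r0, r0
                 */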
                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
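                        /* SFU ops: writing the operand to the corresponding
                         * SFU register starts the computation, and the
                         * result lands in r4.
                         */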
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_PACK_8888_F:
                        queue(c, qpu_m_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8888,
                                                       QPU_PACK);
                        break;

                case QOP_PACK_8A_F:
                case QOP_PACK_8B_F:
                case QOP_PACK_8C_F:
                case QOP_PACK_8D_F:
                        queue(c,
                              qpu_m_MOV(dst, src[0]) |
                              QPU_PM |
                              QPU_SET_FIELD(QPU_PACK_MUL_8A +
                                            qinst->op - QOP_PACK_8A_F,
                                            QPU_PACK));
                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_DISCARD_SETUP:
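                        /* MOV src onto itself with SF: update the flags
                         * from the discard condition, so the TLB Z and
                         * color writes below can be predicated on it.
                         */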
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
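                        /* The TMU0 S/T/R/B write addresses are consecutive,
                         * so offset by the op.  The S write is what fires
                         * off the texture lookup.
                         */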
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_DIRECT:
                        fixup_raddr_conflict(c, dst, &src[0], &src[1]);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
                        break;

                case QOP_TEX_RESULT:
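                        /* A NOP carrying the LOAD_TMU0 signal moves the
                         * texture result into r4.
                         */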
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);
                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                        break;

                case QOP_UNPACK_8A_F:
                case QOP_UNPACK_8B_F:
                case QOP_UNPACK_8C_F:
                case QOP_UNPACK_8D_F:
                case QOP_UNPACK_16A_F:
                case QOP_UNPACK_16B_F: {
                        if (src[0].mux == QPU_MUX_R4) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                                *last_inst(c) |= QPU_PM;
                                *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                               (qinst->op -
                                                                QOP_UNPACK_8A_F),
                                                               QPU_UNPACK);
                        } else {
                                assert(src[0].mux == QPU_MUX_A);

                                /* Since we're setting the pack bits, if the
                                 * destination is in A it would get re-packed.
                                 */
                                queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ?
                                                     qpu_rb(31) : dst),
                                                    src[0], src[0]));
                                *last_inst(c) |=
                                        QPU_SET_FIELD(unpack_map[qinst->op -
                                                                 QOP_UNPACK_8A_F],
                                                      QPU_UNPACK);

                                if (dst.mux == QPU_MUX_A) {
                                        queue(c, qpu_a_MOV(dst, qpu_rb(31)));
                                }
                        }
                }
                        break;

                case QOP_UNPACK_8A_I:
                case QOP_UNPACK_8B_I:
                case QOP_UNPACK_8C_I:
                case QOP_UNPACK_8D_I:
                case QOP_UNPACK_16A_I:
                case QOP_UNPACK_16B_I: {
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're setting the pack bits, if the
                         * destination is in A it would get re-packed.
                         */
                        queue(c, qpu_a_MOV((dst.mux == QPU_MUX_A ?
                                            qpu_rb(31) : dst), src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op -
                                                                  QOP_UNPACK_8A_I],
                                                       QPU_UNPACK);

                        if (dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(dst, qpu_rb(31)));
                        }
                }
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* Skip emitting the MOV if it's a no-op. */
                        if (qir_is_raw_mov(qinst) &&
                            dst.mux == src[0].mux && dst.addr == src[0].addr) {
                                break;
                        }

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, dst, &src[0], &src[1]);

                        if (qir_is_mul(qinst)) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                                if (qinst->dst.pack) {
                                        *last_inst(c) |= QPU_PM;
                                        *last_inst(c) |= QPU_SET_FIELD(qinst->dst.pack,
                                                                       QPU_PACK);
                                }
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                                if (qinst->dst.pack) {
                                        assert(dst.mux == QPU_MUX_A);
                                        *last_inst(c) |= QPU_SET_FIELD(qinst->dst.pack,
                                                                       QPU_PACK);
                                }
                        }

                        break;
                }

                if (qinst->sf) {
                        assert(!qir_is_multi_instruction(qinst));
                        *last_inst(c) |= QPU_SF;
                }
        }

        qpu_schedule_instructions(c);

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
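        /* PROG_END takes effect after two delay slots, so pad with NOPs. */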
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
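                /* Fragment shaders must release the tile buffer scoreboard
                 * by the end of the thread.
                 */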
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}