vc4: Add support for copy propagation with unpack flags present.
[mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_emit.c
1 /*
2 * Copyright © 2014 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <inttypes.h>
25
26 #include "vc4_context.h"
27 #include "vc4_qir.h"
28 #include "vc4_qpu.h"
29 #include "util/ralloc.h"
30
31 static void
32 vc4_dump_program(struct vc4_compile *c)
33 {
34 fprintf(stderr, "%s prog %d/%d QPU:\n",
35 qir_get_stage_name(c->stage),
36 c->program_id, c->variant_id);
37
38 for (int i = 0; i < c->qpu_inst_count; i++) {
39 fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
40 vc4_qpu_disasm(&c->qpu_insts[i], 1);
41 fprintf(stderr, "\n");
42 }
43 }
44
45 static void
46 queue(struct vc4_compile *c, uint64_t inst)
47 {
48 struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
49 q->inst = inst;
50 list_addtail(&q->link, &c->qpu_inst_list);
51 }
52
53 static uint64_t *
54 last_inst(struct vc4_compile *c)
55 {
56 struct queued_qpu_inst *q =
57 (struct queued_qpu_inst *)c->qpu_inst_list.prev;
58 return &q->inst;
59 }
60
static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        /* Patch the ADD-op condition field of the last queued instruction. */
        uint64_t *inst = last_inst(c);

        *inst = qpu_set_cond_add(*inst, cond);
}
66
67 /**
68 * Some special registers can be read from either file, which lets us resolve
69 * raddr conflicts without extra MOVs.
70 */
71 static bool
72 swap_file(struct qpu_reg *src)
73 {
74 switch (src->addr) {
75 case QPU_R_UNIF:
76 case QPU_R_VARY:
77 if (src->mux == QPU_MUX_SMALL_IMM) {
78 return false;
79 } else {
80 if (src->mux == QPU_MUX_A)
81 src->mux = QPU_MUX_B;
82 else
83 src->mux = QPU_MUX_A;
84 return true;
85 }
86
87 default:
88 return false;
89 }
90 }
91
92 /**
93 * This is used to resolve the fact that we might register-allocate two
94 * different operands of an instruction to the same physical register file
95 * even though instructions have only one field for the register file source
96 * address.
97 *
98 * In that case, we need to move one to a temporary that can be used in the
99 * instruction, instead. We reserve ra31/rb31 for this purpose.
100 */
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1,
                     struct qinst *inst, uint64_t *unpack)
{
        /* Resolve the case where both sources were allocated to the same
         * physical register file (A or B): the instruction encoding has only
         * one raddr per file, so one operand must be moved to the reserved
         * scratch register (ra31/rb31) in the opposite file first.
         *
         * Small immediates share the B-file raddr encoding, so treat them as
         * B-file for conflict detection.
         */
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

        /* No conflict if either source is an accumulator (r0-r5), the two
         * sources are in different files, or they are literally the same
         * register (one raddr serves both).
         */
        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return;
        }

        /* Cheapest fix: if one source is a uniform/varying, just read it
         * through the other file instead of emitting a MOV.
         */
        if (swap_file(src0) || swap_file(src1))
                return;

        if (mux0 == QPU_MUX_A) {
                /* Make sure we use the same type of MOV as the instruction,
                 * in case of unpacks.
                 */
                if (qir_is_float_input(inst))
                        queue(c, qpu_a_FMAX(qpu_rb(31), *src0, *src0));
                else
                        queue(c, qpu_a_MOV(qpu_rb(31), *src0));

                /* If we had an unpack on this A-file source, we need to put
                 * it into this MOV, not into the later move from regfile B.
                 */
                if (inst->src[0].pack) {
                        *last_inst(c) |= *unpack;
                        *unpack = 0;
                }
                *src0 = qpu_rb(31);
        } else {
                /* B-file conflict: move src0 to the A-file scratch reg.  No
                 * unpack handling needed here — the unpack field applies to
                 * A-file reads (or r4), not B-file sources.
                 */
                queue(c, qpu_a_MOV(qpu_ra(31), *src0));
                *src0 = qpu_ra(31);
        }
}
142
143 static void
144 set_last_dst_pack(struct vc4_compile *c, struct qinst *inst)
145 {
146 bool had_pm = *last_inst(c) & QPU_PM;
147 bool had_ws = *last_inst(c) & QPU_WS;
148 uint32_t unpack = QPU_GET_FIELD(*last_inst(c), QPU_UNPACK);
149
150 if (!inst->dst.pack)
151 return;
152
153 *last_inst(c) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);
154
155 if (qir_is_mul(inst)) {
156 assert(!unpack || had_pm);
157 *last_inst(c) |= QPU_PM;
158 } else {
159 assert(!unpack || !had_pm);
160 assert(!had_ws); /* dst must be a-file to pack. */
161 }
162 }
163
/**
 * Lowers the compile's QIR instruction list to actual QPU instructions,
 * schedules them, and appends the mandatory program-end sequence.
 *
 * Reads c->instructions; fills c->qpu_insts/c->qpu_inst_count via
 * qpu_schedule_instructions()/qpu_serialize_one_inst().  Frees the register
 * allocation map before returning.
 */
void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        /* Map from QIR temp index to the physical register chosen by the
         * allocator.  Freed at the end of this function.
         */
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        int last_vpm_read_index = -1;

        list_inithead(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        /* NOTE(review): 0x00001a00 appears to be the base
                         * VPM read-setup encoding (entry count goes in bits
                         * 20-23) — confirm against the VPM setup register
                         * description in the hardware docs.
                         */
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        list_for_each_entry(struct qinst, qinst, &c->instructions, link) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                /* Direct QOP -> QPU opcode mapping for simple ALU ops;
                 * entries left zero are ops needing special handling below.
                 */
                static const struct {
                        uint32_t op;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name}
#define M(name) [QOP_##name] = {QPU_M_##name}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(V8MULD),
                        M(V8MIN),
                        M(V8MAX),
                        M(V8ADDS),
                        M(V8SUBS),
                        M(MUL24),

                        /* If we replicate src[0] out to src[1], this works
                         * out the same as a MOV.
                         */
                        [QOP_MOV] = { QPU_A_OR },
                        [QOP_FMOV] = { QPU_A_FMAX },
                        [QOP_MMOV] = { QPU_M_V8MIN },
                };

                /* Unpack bits to OR into the emitted instruction; set when a
                 * source temp carries a pack (unpack-on-read) mode.
                 */
                uint64_t unpack = 0;
                struct qpu_reg src[4];
                /* Translate each QIR source operand into a QPU register. */
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                if (qinst->src[i].pack) {
                                        /* Only one unpack mode fits in the
                                         * instruction; all packed sources
                                         * must agree.
                                         */
                                        assert(!unpack ||
                                               unpack == qinst->src[i].pack);
                                        unpack = QPU_SET_FIELD(qinst->src[i].pack,
                                                               QPU_UNPACK);
                                        /* r4 unpacks use the PM variant. */
                                        if (src[i].mux == QPU_MUX_R4)
                                                unpack |= QPU_PM;
                                }
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                /* VPM reads must happen in order. */
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;
                        }
                }

                /* Translate the QIR destination into a QPU write address. */
                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
                        /* Conditional select of src[0] or 0: emit a MOV
                         * predicated on the condition, then a predicated
                         * zeroing (XOR r0,r0) on the opposite condition.
                         */
                        queue(c, qpu_a_MOV(dst, src[0]) | unpack);
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        /* ^1 flips ZS<->ZC / NS<->NC to get the inverse
                         * condition.
                         */
                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        /* Conditional select between two sources: two MOVs
                         * predicated on opposite conditions.  The unpack is
                         * applied only to the MOV whose source had the pack.
                         */
                        queue(c, qpu_a_MOV(dst, src[0]));
                        if (qinst->src[0].pack)
                                *(last_inst(c)) |= unpack;
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        if (qinst->src[1].pack)
                                *(last_inst(c)) |= unpack;
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        /* SFU ops: write the argument to the matching SFU
                         * register; the result appears in r4, copied out
                         * if the destination isn't r4 itself.
                         */
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]) | unpack);
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]) | unpack);
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]) | unpack);
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]) | unpack);
                                break;
                        default:
                                abort();
                        }

                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_DISCARD_SETUP:
                        /* Self-MOV with SF set, to establish the flags that
                         * later TLB writes are predicated on.
                         */
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]) | unpack);
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        assert(!unpack);
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP),
                                           src[0]) | unpack);
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z),
                                           src[0]) | unpack);
                        if (discard) {
                                /* Only write depth for non-discarded pixels. */
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        /* A NOP with the color-load signal; the loaded color
                         * lands in r4.
                         */
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]) | unpack);
                        if (discard) {
                                /* Only write color for non-discarded pixels. */
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        /* Varying interpolation: add the C coefficient (r5). */
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
                        break;

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        /* TMU coordinate writes: S/T/R/B map to consecutive
                         * TMU0 registers.
                         */
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]) | unpack);
                        break;

                case QOP_TEX_DIRECT:
                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S),
                                           src[0], src[1]) | unpack);
                        break;

                case QOP_TEX_RESULT:
                        /* NOP with the TMU-load signal; result lands in r4. */
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);
                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                        break;

                default:
                        /* Plain ALU op handled via the translate[] table. */
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* Skip emitting the MOV if it's a no-op. */
                        if (qir_is_raw_mov(qinst) &&
                            dst.mux == src[0].mux && dst.addr == src[0].addr) {
                                break;
                        }

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        if (qir_is_mul(qinst)) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]) | unpack);
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]) | unpack);
                        }
                        set_last_dst_pack(c, qinst);

                        break;
                }

                /* QIR-level set-flags request; only valid on ops that
                 * lowered to a single QPU instruction.
                 */
                if (qinst->sf) {
                        assert(!qir_is_multi_instruction(qinst));
                        *last_inst(c) |= QPU_SF;
                }
        }

        qpu_schedule_instructions(c);

        /* The following pads the end of the program with NOPs as required
         * by the hardware's restrictions on what the final instructions
         * (the ones with the PROG_END signal and its delay slots) may do.
         */

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        /* Tag the last real instruction with PROG_END, followed by the two
         * delay-slot NOPs.
         */
        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                /* Fragment shaders must release the tile scoreboard lock
                 * on their final instruction.
                 */
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}
546 }