vc4: When possible, resolve raddr conflicts by swapping files on specials.
src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"

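/* Lowers the shader's QIR instructions into native QPU instruction words.
 * Generation happens in two passes: each QIR instruction is first translated
 * to one or more QPU instructions on a queue, and serialize_insts() then
 * drains the queue into c->qpu_insts, inserting NOPs as needed to satisfy
 * the hardware's scheduling restrictions.
 */
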
static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s:\n", qir_get_stage_name(c->stage));

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

struct queued_qpu_inst {
        struct simple_node link;
        uint64_t inst;
};

static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = calloc(1, sizeof(*q));
        q->inst = inst;
        insert_at_tail(&c->qpu_inst_list, &q->link);
}

static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
        return &q->inst;
}

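/* Patches the condition field of the ADD-pipe operation in the most recently
 * queued instruction, so its result is only written when the given condition
 * (e.g. QPU_COND_ZS) holds.  Used for the conditional selects and discards
 * below.
 */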
static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_A)
                        src->mux = QPU_MUX_B;
                else
                        src->mux = QPU_MUX_A;
                return true;

        default:
                return false;
        }
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead.
 */
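/* For example, an ADD of two operands that both landed in regfile A can't
 * encode two different raddr_a values.  If one of them is a uniform or
 * varying read, swap_file() retargets it to the other file's raddr;
 * otherwise we copy one operand through the r3 accumulator first.
 */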
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg *src0, struct qpu_reg *src1)
{
        if ((src0->mux != QPU_MUX_A && src0->mux != QPU_MUX_B) ||
            src0->mux != src1->mux ||
            src0->addr == src1->addr) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

        queue(c, qpu_a_MOV(qpu_r3(), *src1));
        *src1 = qpu_r3();
}

static void
serialize_one_inst(struct vc4_compile *c, uint64_t inst)
{
        if (c->qpu_inst_count >= c->qpu_inst_size) {
                c->qpu_inst_size = MAX2(16, c->qpu_inst_size * 2);
                c->qpu_insts = realloc(c->qpu_insts,
                                       c->qpu_inst_size * sizeof(uint64_t));
        }
        c->qpu_insts[c->qpu_inst_count++] = inst;
}

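/* Second pass: drains the queued instructions into c->qpu_insts, inserting
 * NOPs to satisfy the hardware's scheduling rules quoted inline below: no
 * read of a physical regfile location written by the previous instruction, a
 * two-instruction delay before r4 can be read after an SFU write, and no
 * scoreboard wait in the first two instructions of a fragment shader.
 */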
static void
serialize_insts(struct vc4_compile *c)
{
        int last_sfu_write = -10;
        bool scoreboard_wait_emitted = false;

        while (!is_empty_list(&c->qpu_inst_list)) {
                struct queued_qpu_inst *q =
                        (struct queued_qpu_inst *)first_elem(&c->qpu_inst_list);
                uint32_t last_waddr_a = QPU_W_NOP, last_waddr_b = QPU_W_NOP;
                uint32_t raddr_a = QPU_GET_FIELD(q->inst, QPU_RADDR_A);
                uint32_t raddr_b = QPU_GET_FIELD(q->inst, QPU_RADDR_B);

                if (c->qpu_inst_count > 0) {
                        uint64_t last_inst = c->qpu_insts[c->qpu_inst_count -
                                                          1];
                        uint32_t last_waddr_add = QPU_GET_FIELD(last_inst,
                                                                QPU_WADDR_ADD);
                        uint32_t last_waddr_mul = QPU_GET_FIELD(last_inst,
                                                                QPU_WADDR_MUL);

                        if (last_inst & QPU_WS) {
                                last_waddr_a = last_waddr_mul;
                                last_waddr_b = last_waddr_add;
                        } else {
                                last_waddr_a = last_waddr_add;
                                last_waddr_b = last_waddr_mul;
                        }
                }

                uint32_t src_muxes[] = {
                        QPU_GET_FIELD(q->inst, QPU_ADD_A),
                        QPU_GET_FIELD(q->inst, QPU_ADD_B),
                        QPU_GET_FIELD(q->inst, QPU_MUL_A),
                        QPU_GET_FIELD(q->inst, QPU_MUL_B),
                };

                /* "An instruction must not read from a location in physical
                 * regfile A or B that was written to by the previous
                 * instruction."
                 */
                bool needs_raddr_vs_waddr_nop = false;
                bool reads_r4 = false;
                for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
                        if ((raddr_a < 32 &&
                             src_muxes[i] == QPU_MUX_A &&
                             last_waddr_a == raddr_a) ||
                            (raddr_b < 32 &&
                             src_muxes[i] == QPU_MUX_B &&
                             last_waddr_b == raddr_b)) {
                                needs_raddr_vs_waddr_nop = true;
                        }
                        if (src_muxes[i] == QPU_MUX_R4)
                                reads_r4 = true;
                }

                if (needs_raddr_vs_waddr_nop) {
                        serialize_one_inst(c, qpu_NOP());
                }

                /* "After an SFU lookup instruction, accumulator r4 must not
                 * be read in the following two instructions. Any other
                 * instruction that results in r4 being written (that is, TMU
                 * read, TLB read, SFU lookup) cannot occur in the two
                 * instructions following an SFU lookup."
                 */
                if (reads_r4) {
                        while (c->qpu_inst_count - last_sfu_write < 3) {
                                serialize_one_inst(c, qpu_NOP());
                        }
                }

                uint32_t waddr_a = QPU_GET_FIELD(q->inst, QPU_WADDR_ADD);
                uint32_t waddr_m = QPU_GET_FIELD(q->inst, QPU_WADDR_MUL);
                if ((waddr_a >= QPU_W_SFU_RECIP && waddr_a <= QPU_W_SFU_LOG) ||
                    (waddr_m >= QPU_W_SFU_RECIP && waddr_m <= QPU_W_SFU_LOG)) {
                        last_sfu_write = c->qpu_inst_count;
                }

                /* "A scoreboard wait must not occur in the first two
                 * instructions of a fragment shader. This is either the
                 * explicit Wait for Scoreboard signal or an implicit wait
                 * with the first tile-buffer read or write instruction."
                 */
                if (!scoreboard_wait_emitted &&
                    (waddr_a == QPU_W_TLB_Z || waddr_m == QPU_W_TLB_Z ||
                     waddr_a == QPU_W_TLB_COLOR_MS ||
                     waddr_m == QPU_W_TLB_COLOR_MS ||
                     waddr_a == QPU_W_TLB_COLOR_ALL ||
                     waddr_m == QPU_W_TLB_COLOR_ALL ||
                     QPU_GET_FIELD(q->inst, QPU_SIG) == QPU_SIG_COLOR_LOAD)) {
                        while (c->qpu_inst_count < 3 ||
                               QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                                             QPU_SIG) != QPU_SIG_NONE) {
                                serialize_one_inst(c, qpu_NOP());
                        }
                        c->qpu_insts[c->qpu_inst_count - 1] =
                                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                            QPU_SIG_WAIT_FOR_SCOREBOARD);
                        scoreboard_wait_emitted = true;
                }

                serialize_one_inst(c, q->inst);

                remove_from_list(&q->link);
                free(q);
        }
}

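/* Main entry point: lowers each QIR instruction to QPU instructions, then
 * serializes the result and appends the mandatory program-end sequence.
 */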
void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;

        make_empty_list(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
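                /* Program the VPM read setup (for the shader inputs) and
                 * write setup (for the shaded vertex output) before touching
                 * vertex data.  Reading the constants -- an interpretation,
                 * since the values aren't documented here: each 0x00100000
                 * step bumps the number-of-vectors field, and 0x00001a00
                 * selects 32-bit horizontal vectors starting at VPM
                 * address 0.
                 */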
                queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                         (0x00001a00 +
                                          0x00100000 * c->num_inputs)));
                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        struct simple_node *node;
        foreach(node, &c->instructions) {
                struct qinst *qinst = (struct qinst *)node;

#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

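                /* Table of QIR ALU ops that translate directly to a single
                 * QPU ALU op: A() entries execute on the add pipe, M()
                 * entries on the mul pipe.
                 */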
                static const struct {
                        uint32_t op;
                        bool is_mul;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name, false}
#define M(name) [QOP_##name] = {QPU_M_##name, true}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(MUL24),
                };

                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op. */
                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                        }
                        break;

                case QOP_SF:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

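                        /* Zero the destination under the opposite condition
                         * (x ^ x == 0); the condition codes come in
                         * set/clear pairs, so XORing the low bit flips the
                         * sense.
                         */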
                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_VPM_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_VPM), src[0]));
                        break;

                case QOP_VPM_READ:
                        queue(c, qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)));
                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_PACK_COLORS:
                        for (int i = 0; i < 4; i++) {
                                queue(c, qpu_m_MOV(qpu_r3(), src[i]));
                                *last_inst(c) |= QPU_PM;
                                *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
                                                               QPU_PACK);
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r3()));

                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_DISCARD_SETUP:
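                        /* Queue a flag-setting MOV of the discard condition
                         * onto itself: the value is unchanged, but QPU_SF
                         * updates the flags so the TLB Z/color writes below
                         * can be made conditional on QPU_COND_ZS.
                         */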
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

                case QOP_PACK_SCALED: {
                        uint64_t a = (qpu_a_MOV(dst, src[0]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16A,
                                                    QPU_PACK));
                        uint64_t b = (qpu_a_MOV(dst, src[1]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16B,
                                                    QPU_PACK));

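                        /* If dst aliases src[1], pack the 16B half first so
                         * that writing the 16A half doesn't clobber src[1]
                         * before it gets read.
                         */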
                        if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
                                queue(c, b);
                                queue(c, a);
                        } else {
                                queue(c, a);
                                queue(c, b);
                        }
                        break;
                }

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
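                        /* The QOP_TEX_S/T/R/B ops are in the same order as
                         * the TMU0 S/T/R/B write addresses, so the enum
                         * offset selects the right coordinate register.
                         */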
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);

                        break;

                case QOP_R4_UNPACK_A:
                case QOP_R4_UNPACK_B:
                case QOP_R4_UNPACK_C:
                case QOP_R4_UNPACK_D:
                        assert(src[0].mux == QPU_MUX_R4);
                        queue(c, qpu_a_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_R4_UNPACK_A),
                                                       QPU_UNPACK);

                        break;

                case QOP_UNPACK_8A:
                case QOP_UNPACK_8B:
                case QOP_UNPACK_8C:
                case QOP_UNPACK_8D: {
                        assert(src[0].mux == QPU_MUX_A);

                        /* And, since we're setting the pack bits, if the
                         * destination is in A it would get re-packed.
                         */
                        struct qpu_reg orig_dst = dst;
                        if (orig_dst.mux == QPU_MUX_A)
                                dst = qpu_rn(3);

                        queue(c, qpu_a_FMAX(dst, src[0], src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_UNPACK_8A),
                                                       QPU_UNPACK);

                        if (orig_dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(orig_dst, dst));
                        }
                        break;
                }

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, &src[0], &src[1]);

                        if (translate[qinst->op].is_mul) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        }
                        break;
                }
        }

        serialize_insts(c);

        /* thread end can't have VPM write */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM) {
                serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                serialize_one_inst(c, qpu_NOP());
        }

        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
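        /* The program-end signal has two delay slots that still execute, so
         * pad them with NOPs.
         */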
        serialize_one_inst(c, qpu_NOP());
        serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}