vc4: Fix overzealous raddr conflict resolution.
[mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s:\n", qir_get_stage_name(c->stage));

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

struct queued_qpu_inst {
        struct simple_node link;
        uint64_t inst;
};

static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = calloc(1, sizeof(*q));
        q->inst = inst;
        insert_at_tail(&c->qpu_inst_list, &q->link);
}

static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
        return &q->inst;
}

static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}
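
/* Emission below follows a queue-then-patch pattern: an instruction is
 * queued, then extra bits are ORed into the queued word.  A sketch of the
 * idiom, mirroring the QOP_SEL_* cases further down:
 *
 *     queue(c, qpu_a_MOV(dst, src));
 *     set_last_cond_add(c, QPU_COND_ZS);
 *
 * which makes the MOV write dst only when the Z flag is set.
 */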

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file,
 * even though an instruction has only one field for the register file source
 * address.
 *
 * In that case, we need to move one of the operands to a temporary that can
 * be used in the instruction, instead.
 */
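/* For example, if both sources land in regfile A at different addresses
 * (say ra1 and ra2), one instruction can't encode both reads, so one
 * operand is first copied through accumulator r3.  A sketch of the
 * resulting sequence (not literal encodings):
 *
 *     mov r3, ra2
 *     op  dst, ra1, r3
 */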
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg src0, struct qpu_reg *src1)
{
        if ((src0.mux != QPU_MUX_A && src0.mux != QPU_MUX_B) ||
            src0.mux != src1->mux ||
            src0.addr == src1->addr) {
                return;
        }

        queue(c, qpu_a_MOV(qpu_r3(), *src1));
        *src1 = qpu_r3();
}

static void
serialize_one_inst(struct vc4_compile *c, uint64_t inst)
{
        if (c->qpu_inst_count >= c->qpu_inst_size) {
                c->qpu_inst_size = MAX2(16, c->qpu_inst_size * 2);
                c->qpu_insts = realloc(c->qpu_insts,
                                       c->qpu_inst_size * sizeof(uint64_t));
        }
        c->qpu_insts[c->qpu_inst_count++] = inst;
}

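/* Drains the queued instructions into the final program, inserting NOPs
 * wherever the hardware scheduling rules quoted below require a delay
 * (regfile write-then-read, SFU/r4 latency, scoreboard wait placement).
 */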
static void
serialize_insts(struct vc4_compile *c)
{
        int last_sfu_write = -10;
        bool scoreboard_wait_emitted = false;

        while (!is_empty_list(&c->qpu_inst_list)) {
                struct queued_qpu_inst *q =
                        (struct queued_qpu_inst *)first_elem(&c->qpu_inst_list);
                uint32_t last_waddr_a = QPU_W_NOP, last_waddr_b = QPU_W_NOP;
                uint32_t raddr_a = QPU_GET_FIELD(q->inst, QPU_RADDR_A);
                uint32_t raddr_b = QPU_GET_FIELD(q->inst, QPU_RADDR_B);

                if (c->qpu_inst_count > 0) {
                        uint64_t last_inst = c->qpu_insts[c->qpu_inst_count -
                                                          1];
                        uint32_t last_waddr_add = QPU_GET_FIELD(last_inst,
                                                                QPU_WADDR_ADD);
                        uint32_t last_waddr_mul = QPU_GET_FIELD(last_inst,
                                                                QPU_WADDR_MUL);

                        if (last_inst & QPU_WS) {
                                last_waddr_a = last_waddr_mul;
                                last_waddr_b = last_waddr_add;
                        } else {
                                last_waddr_a = last_waddr_add;
                                last_waddr_b = last_waddr_mul;
                        }
                }
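
                /* (QPU_WS above is the write-swap bit: when set, the add
                 * result goes to regfile B and the mul result to regfile A,
                 * so the A/B write tracking has to follow it.)
                 */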

                uint32_t src_muxes[] = {
                        QPU_GET_FIELD(q->inst, QPU_ADD_A),
                        QPU_GET_FIELD(q->inst, QPU_ADD_B),
                        QPU_GET_FIELD(q->inst, QPU_MUL_A),
                        QPU_GET_FIELD(q->inst, QPU_MUL_B),
                };

                /* "An instruction must not read from a location in physical
                 * regfile A or B that was written to by the previous
                 * instruction."
                 */
                bool needs_raddr_vs_waddr_nop = false;
                bool reads_r4 = false;
                for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
                        if ((raddr_a < 32 &&
                             src_muxes[i] == QPU_MUX_A &&
                             last_waddr_a == raddr_a) ||
                            (raddr_b < 32 &&
                             src_muxes[i] == QPU_MUX_B &&
                             last_waddr_b == raddr_b)) {
                                needs_raddr_vs_waddr_nop = true;
                        }
                        if (src_muxes[i] == QPU_MUX_R4)
                                reads_r4 = true;
                }
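
                /* (Only raddrs below 32 name physical registers; higher
                 * values are special sources such as uniforms and varyings,
                 * which can't conflict with last cycle's write.)
                 */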

                if (needs_raddr_vs_waddr_nop) {
                        serialize_one_inst(c, qpu_NOP());
                }

                /* "After an SFU lookup instruction, accumulator r4 must not
                 * be read in the following two instructions. Any other
                 * instruction that results in r4 being written (that is, TMU
                 * read, TLB read, SFU lookup) cannot occur in the two
                 * instructions following an SFU lookup."
                 */
                if (reads_r4) {
                        while (c->qpu_inst_count - last_sfu_write < 3) {
                                serialize_one_inst(c, qpu_NOP());
                        }
                }
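
                /* (last_sfu_write starts at -10, so this loop does nothing
                 * until the first SFU write; after one, it pads to at least
                 * two instructions between the SFU write and the r4 read.)
                 */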

                uint32_t waddr_a = QPU_GET_FIELD(q->inst, QPU_WADDR_ADD);
                uint32_t waddr_m = QPU_GET_FIELD(q->inst, QPU_WADDR_MUL);
                if ((waddr_a >= QPU_W_SFU_RECIP && waddr_a <= QPU_W_SFU_LOG) ||
                    (waddr_m >= QPU_W_SFU_RECIP && waddr_m <= QPU_W_SFU_LOG)) {
                        last_sfu_write = c->qpu_inst_count;
                }

                /* "A scoreboard wait must not occur in the first two
                 * instructions of a fragment shader. This is either the
                 * explicit Wait for Scoreboard signal or an implicit wait
                 * with the first tile-buffer read or write instruction."
                 */
                if (!scoreboard_wait_emitted &&
                    (waddr_a == QPU_W_TLB_Z || waddr_m == QPU_W_TLB_Z ||
                     waddr_a == QPU_W_TLB_COLOR_MS ||
                     waddr_m == QPU_W_TLB_COLOR_MS ||
                     waddr_a == QPU_W_TLB_COLOR_ALL ||
                     waddr_m == QPU_W_TLB_COLOR_ALL ||
                     QPU_GET_FIELD(q->inst, QPU_SIG) == QPU_SIG_COLOR_LOAD)) {
                        while (c->qpu_inst_count < 3 ||
                               QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                                             QPU_SIG) != QPU_SIG_NONE) {
                                serialize_one_inst(c, qpu_NOP());
                        }
                        c->qpu_insts[c->qpu_inst_count - 1] =
                                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                            QPU_SIG_WAIT_FOR_SCOREBOARD);
                        scoreboard_wait_emitted = true;
                }

                serialize_one_inst(c, q->inst);

                remove_from_list(&q->link);
                free(q);
        }
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;

        make_empty_list(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                         (0x00001a00 +
                                          0x00100000 * c->num_inputs)));
                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }
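
        /* (Reading the VPM setup encoding, 0x00001a00 selects horizontal,
         * 32-bit, stride-1 access, and the 0x00100000 * c->num_inputs term
         * fills the vector-count field of the read setup; see the VC4
         * architecture reference for the exact field layout.)
         */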

        struct simple_node *node;
        foreach(node, &c->instructions) {
                struct qinst *qinst = (struct qinst *)node;

#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                static const struct {
                        uint32_t op;
                        bool is_mul;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name, false}
#define M(name) [QOP_##name] = {QPU_M_##name, true}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(MUL24),
                };
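
                /* (Ops without a table entry are zero-initialized here; they
                 * are either handled by the explicit cases below or caught
                 * by the asserts in the default case.)
                 */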

                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op. */
                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                        }
                        break;
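
                        /* (Read against the condition above: only a MOV from
                         * an accumulator to the same accumulator is dropped;
                         * a regfile destination is always written.)
                         */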

                case QOP_SF:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;
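
                        /* (Both SEL forms queue a pair of instructions under
                         * complementary conditions; the ^ 1 works because the
                         * QPU_COND_ZS/ZC and NS/NC codes differ only in their
                         * low bit.)
                         */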

                case QOP_VPM_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_VPM), src[0]));
                        break;

                case QOP_VPM_READ:
                        queue(c, qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)));
                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;
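
                        /* (Each SFU op is a write of the operand to the unit,
                         * with the result appearing in r4; the MOV from r4
                         * queued above relies on serialize_insts() to insert
                         * the required two-instruction delay.)
                         */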

                case QOP_PACK_COLORS:
                        for (int i = 0; i < 4; i++) {
                                queue(c, qpu_m_MOV(qpu_r3(), src[i]));
                                *last_inst(c) |= QPU_PM;
                                *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
                                                               QPU_PACK);
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r3()));

                        break;
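
                        /* (Each iteration routes one channel through the MUL
                         * pack unit into byte i of r3, QPU_PACK_MUL_8A..8D
                         * being consecutive, accumulating the packed 8888
                         * color before the final MOV to dst.)
                         */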

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_DISCARD_SETUP:
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;
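
                        /* (When a discard is in play, the discard-setup MOV
                         * above set the flags from the discard word, so the
                         * Z and color TLB writes are made ZS-conditional:
                         * only pixels whose discard value is zero update the
                         * tile buffer.)
                         */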

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

                case QOP_PACK_SCALED: {
                        uint64_t a = (qpu_a_MOV(dst, src[0]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16A,
                                                    QPU_PACK));
                        uint64_t b = (qpu_a_MOV(dst, src[1]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16B,
                                                    QPU_PACK));

                        if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
                                queue(c, b);
                                queue(c, a);
                        } else {
                                queue(c, a);
                                queue(c, b);
                        }
                        break;
                }
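
                        /* (The ordering check above avoids a clobber: if dst
                         * aliases src[1], packing the 16A half first would
                         * overwrite src[1] before the 16B pack reads it, so
                         * the 16B pack is queued first in that case.)
                         */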

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);

                        break;
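
                        /* (QPU_W_TMU0_S/T/R/B are consecutive write
                         * addresses, so the op offset indexes them directly;
                         * the S write is what triggers the lookup, and the
                         * LOAD_TMU0 signal later lands the result in r4.)
                         */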

                case QOP_R4_UNPACK_A:
                case QOP_R4_UNPACK_B:
                case QOP_R4_UNPACK_C:
                case QOP_R4_UNPACK_D:
                        assert(src[0].mux == QPU_MUX_R4);
                        queue(c, qpu_a_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_R4_UNPACK_A),
                                                       QPU_UNPACK);

                        break;
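
                        /* (Setting QPU_PM points the unpack field at r4
                         * rather than at regfile A, which is what makes this
                         * differ from the QOP_UNPACK_8* path below.)
                         */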

                case QOP_UNPACK_8A:
                case QOP_UNPACK_8B:
                case QOP_UNPACK_8C:
                case QOP_UNPACK_8D: {
                        assert(src[0].mux == QPU_MUX_A);

                        /* And, since we're setting the pack bits, if the
                         * destination is in A it would get re-packed.
                         */
                        struct qpu_reg orig_dst = dst;
                        if (orig_dst.mux == QPU_MUX_A)
                                dst = qpu_rn(3);

                        queue(c, qpu_a_FMAX(dst, src[0], src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_UNPACK_8A),
                                                       QPU_UNPACK);

                        if (orig_dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(orig_dst, dst));
                        }
                        break;
                }
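
                /* (FMAX of a value with itself acts as a MOV through the
                 * float add pipeline, so the regfile-A unpack performs the
                 * 8-bit-to-float conversion; an interpretation of the
                 * pack/unpack unit behavior, worth checking against the VC4
                 * reference.)
                 */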

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, src[0], &src[1]);

                        if (translate[qinst->op].is_mul) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        }
                        break;
                }
        }

        serialize_insts(c);

        /* thread end can't have VPM write */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM) {
                serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                serialize_one_inst(c, qpu_NOP());
        }

        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        serialize_one_inst(c, qpu_NOP());
        serialize_one_inst(c, qpu_NOP());
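
        /* (The PROG_END signal has two delay slots, hence the trailing pair
         * of NOPs; for fragment shaders the scoreboard unlock below rides on
         * the final one.)
         */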

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}