vc4: Drop pointless raddr conflict handling on SF.
[mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s:\n", qir_get_stage_name(c->stage));

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

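/* Generated instructions are queued on a list first, so that
 * serialize_insts() can insert the NOPs and signal bits required by the
 * hardware before the final instruction array is produced.
 */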
struct queued_qpu_inst {
        struct simple_node link;
        uint64_t inst;
};

static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = calloc(1, sizeof(*q));
        q->inst = inst;
        insert_at_tail(&c->qpu_inst_list, &q->link);
}

static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
        return &q->inst;
}

static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead.
 */
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg src0, struct qpu_reg *src1)
{
        if ((src0.mux == QPU_MUX_A || src0.mux == QPU_MUX_B) &&
            (src1->mux == QPU_MUX_A || src1->mux == QPU_MUX_B) &&
            src0.addr != src1->addr) {
                queue(c, qpu_a_MOV(qpu_r3(), *src1));
                *src1 = qpu_r3();
        }
}

static void
serialize_one_inst(struct vc4_compile *c, uint64_t inst)
{
        if (c->qpu_inst_count >= c->qpu_inst_size) {
                c->qpu_inst_size = MAX2(16, c->qpu_inst_size * 2);
                c->qpu_insts = realloc(c->qpu_insts,
                                       c->qpu_inst_size * sizeof(uint64_t));
        }
        c->qpu_insts[c->qpu_inst_count++] = inst;
}

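/* Drains the queued instruction list into the final c->qpu_insts array,
 * inserting the NOPs and signals needed to satisfy the hardware's
 * instruction-scheduling rules along the way.
 */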
static void
serialize_insts(struct vc4_compile *c)
{
        int last_sfu_write = -10;
        bool scoreboard_wait_emitted = false;

        while (!is_empty_list(&c->qpu_inst_list)) {
                struct queued_qpu_inst *q =
                        (struct queued_qpu_inst *)first_elem(&c->qpu_inst_list);
                uint32_t last_waddr_a = QPU_W_NOP, last_waddr_b = QPU_W_NOP;
                uint32_t raddr_a = QPU_GET_FIELD(q->inst, QPU_RADDR_A);
                uint32_t raddr_b = QPU_GET_FIELD(q->inst, QPU_RADDR_B);

                if (c->qpu_inst_count > 0) {
                        uint64_t last_inst = c->qpu_insts[c->qpu_inst_count -
                                                          1];
                        uint32_t last_waddr_add = QPU_GET_FIELD(last_inst,
                                                                QPU_WADDR_ADD);
                        uint32_t last_waddr_mul = QPU_GET_FIELD(last_inst,
                                                                QPU_WADDR_MUL);

                        if (last_inst & QPU_WS) {
                                last_waddr_a = last_waddr_mul;
                                last_waddr_b = last_waddr_add;
                        } else {
                                last_waddr_a = last_waddr_add;
                                last_waddr_b = last_waddr_mul;
                        }
                }

                uint32_t src_muxes[] = {
                        QPU_GET_FIELD(q->inst, QPU_ADD_A),
                        QPU_GET_FIELD(q->inst, QPU_ADD_B),
                        QPU_GET_FIELD(q->inst, QPU_MUL_A),
                        QPU_GET_FIELD(q->inst, QPU_MUL_B),
                };

                /* "An instruction must not read from a location in physical
                 * regfile A or B that was written to by the previous
                 * instruction."
                 */
                bool needs_raddr_vs_waddr_nop = false;
                bool reads_r4 = false;
                for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
                        if ((raddr_a < 32 &&
                             src_muxes[i] == QPU_MUX_A &&
                             last_waddr_a == raddr_a) ||
                            (raddr_b < 32 &&
                             src_muxes[i] == QPU_MUX_B &&
                             last_waddr_b == raddr_b)) {
                                needs_raddr_vs_waddr_nop = true;
                        }
                        if (src_muxes[i] == QPU_MUX_R4)
                                reads_r4 = true;
                }

                if (needs_raddr_vs_waddr_nop) {
                        serialize_one_inst(c, qpu_NOP());
                }

                /* "After an SFU lookup instruction, accumulator r4 must not
                 * be read in the following two instructions. Any other
                 * instruction that results in r4 being written (that is, TMU
                 * read, TLB read, SFU lookup) cannot occur in the two
                 * instructions following an SFU lookup."
                 */
                if (reads_r4) {
                        while (c->qpu_inst_count - last_sfu_write < 3) {
                                serialize_one_inst(c, qpu_NOP());
                        }
                }

                uint32_t waddr_a = QPU_GET_FIELD(q->inst, QPU_WADDR_ADD);
                uint32_t waddr_m = QPU_GET_FIELD(q->inst, QPU_WADDR_MUL);
                if ((waddr_a >= QPU_W_SFU_RECIP && waddr_a <= QPU_W_SFU_LOG) ||
                    (waddr_m >= QPU_W_SFU_RECIP && waddr_m <= QPU_W_SFU_LOG)) {
                        last_sfu_write = c->qpu_inst_count;
                }

                /* "A scoreboard wait must not occur in the first two
                 * instructions of a fragment shader. This is either the
                 * explicit Wait for Scoreboard signal or an implicit wait
                 * with the first tile-buffer read or write instruction."
                 */
                if (!scoreboard_wait_emitted &&
                    (waddr_a == QPU_W_TLB_Z || waddr_m == QPU_W_TLB_Z ||
                     waddr_a == QPU_W_TLB_COLOR_MS ||
                     waddr_m == QPU_W_TLB_COLOR_MS ||
                     waddr_a == QPU_W_TLB_COLOR_ALL ||
                     waddr_m == QPU_W_TLB_COLOR_ALL ||
                     QPU_GET_FIELD(q->inst, QPU_SIG) == QPU_SIG_COLOR_LOAD)) {
                        while (c->qpu_inst_count < 3 ||
                               QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                                             QPU_SIG) != QPU_SIG_NONE) {
                                serialize_one_inst(c, qpu_NOP());
                        }
                        c->qpu_insts[c->qpu_inst_count - 1] =
                                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                            QPU_SIG_WAIT_FOR_SCOREBOARD);
                        scoreboard_wait_emitted = true;
                }

                serialize_one_inst(c, q->inst);

                remove_from_list(&q->link);
                free(q);
        }
}

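/* Converts the QIR instruction list into QPU instructions, using a trivial
 * register allocator that hands out the first free accumulator or physical
 * register and releases it again after a temporary's last use.
 */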
void
vc4_generate_code(struct vc4_compile *c)
{
        struct qpu_reg allocate_to_qpu_reg[3 + 32 + 32];
        bool reg_in_use[ARRAY_SIZE(allocate_to_qpu_reg)];
        int *reg_allocated = calloc(c->num_temps, sizeof(*reg_allocated));
        int *reg_uses_remaining =
                calloc(c->num_temps, sizeof(*reg_uses_remaining));
        bool discard = false;

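        /* Allocatable registers: accumulators r0-r2 first, then the 32
         * regfile A registers, then the 32 regfile B registers.
         */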
        for (int i = 0; i < ARRAY_SIZE(reg_in_use); i++)
                reg_in_use[i] = false;
        for (int i = 0; i < c->num_temps; i++)
                reg_allocated[i] = -1;
        for (int i = 0; i < 3; i++)
                allocate_to_qpu_reg[i] = qpu_rn(i);
        for (int i = 0; i < 32; i++)
                allocate_to_qpu_reg[i + 3] = qpu_ra(i);
        for (int i = 0; i < 32; i++)
                allocate_to_qpu_reg[i + 3 + 32] = qpu_rb(i);

        make_empty_list(&c->qpu_inst_list);

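        /* Pre-pass: count the uses of each temporary so its register can be
         * released after the last use, and reserve the fragment Z/W payload
         * register if it will be read.
         */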
        struct simple_node *node;
        foreach(node, &c->instructions) {
                struct qinst *qinst = (struct qinst *)node;

                if (qinst->dst.file == QFILE_TEMP)
                        reg_uses_remaining[qinst->dst.index]++;
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        if (qinst->src[i].file == QFILE_TEMP)
                                reg_uses_remaining[qinst->src[i].index]++;
                }
                if (qinst->op == QOP_TLB_PASSTHROUGH_Z_WRITE ||
                    qinst->op == QOP_FRAG_Z)
                        reg_in_use[3 + 32 + QPU_R_FRAG_PAYLOAD_ZW] = true;
        }

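        /* Vertex and coordinate shaders start by programming the VPM read
         * and write setup registers; fragment shaders need no prologue.
         */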
        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                         (0x00001a00 +
                                          0x00100000 * c->num_inputs)));
                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        foreach(node, &c->instructions) {
                struct qinst *qinst = (struct qinst *)node;

#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

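                /* Table of QIR opcodes that map directly to a single QPU ALU
                 * op: A() entries use the add pipe, M() entries the mul pipe.
                 */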
                static const struct {
                        uint32_t op;
                        bool is_mul;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name, false}
#define M(name) [QOP_##name] = {QPU_M_##name, true}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(MUL24),
                };

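                /* Resolve the QIR sources to QPU registers, dropping each
                 * temporary's use count so its register frees up at the last
                 * read.
                 */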
                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                if (reg_allocated[index] == -1) {
                                        fprintf(stderr, "undefined reg use: ");
                                        qir_dump_inst(qinst);
                                        fprintf(stderr, "\n");

                                        src[i] = qpu_rn(0);
                                } else {
                                        src[i] = allocate_to_qpu_reg[reg_allocated[index]];
                                        reg_uses_remaining[index]--;
                                        if (reg_uses_remaining[index] == 0)
                                                reg_in_use[reg_allocated[index]] = false;
                                }
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        }
                }

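                /* Pick the destination register, allocating the first free
                 * one on a temporary's first write.
                 */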
                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;

                case QFILE_TEMP:
                        if (reg_allocated[qinst->dst.index] == -1) {
                                int alloc;
                                for (alloc = 0;
                                     alloc < ARRAY_SIZE(reg_in_use);
                                     alloc++) {
                                        /* The pack flags require an A-file register. */
                                        if (qinst->op == QOP_PACK_SCALED &&
                                            allocate_to_qpu_reg[alloc].mux != QPU_MUX_A) {
                                                continue;
                                        }

                                        if (!reg_in_use[alloc])
                                                break;
                                }
                                assert(alloc != ARRAY_SIZE(reg_in_use) && "need better reg alloc");
                                reg_in_use[alloc] = true;
                                reg_allocated[qinst->dst.index] = alloc;
                        }

                        dst = allocate_to_qpu_reg[reg_allocated[qinst->dst.index]];

                        reg_uses_remaining[qinst->dst.index]--;
                        if (reg_uses_remaining[qinst->dst.index] == 0) {
                                reg_in_use[reg_allocated[qinst->dst.index]] =
                                        false;
                        }
                        break;

                case QFILE_VARY:
                case QFILE_UNIF:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op. */
                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                        }
                        break;

                case QOP_SF:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_VPM_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_VPM), src[0]));
                        break;

                case QOP_VPM_READ:
                        queue(c, qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)));
                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_PACK_COLORS:
                        for (int i = 0; i < 4; i++) {
                                queue(c, qpu_m_MOV(qpu_r3(), src[i]));
                                *last_inst(c) |= QPU_PM;
                                *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
                                                               QPU_PACK);
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r3()));

                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Z:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_FRAG_PAYLOAD_ZW)));
                        break;

                case QOP_FRAG_RCP_W:
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                           qpu_ra(QPU_R_FRAG_PAYLOAD_ZW)));

                        queue(c, qpu_a_MOV(dst, qpu_r4()));
                        break;

                case QOP_TLB_DISCARD_SETUP:
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_PASSTHROUGH_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z),
                                           qpu_rb(QPU_R_FRAG_PAYLOAD_ZW)));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

                case QOP_PACK_SCALED: {
                        uint64_t a = (qpu_a_MOV(dst, src[0]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16A,
                                                    QPU_PACK));
                        uint64_t b = (qpu_a_MOV(dst, src[1]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16B,
                                                    QPU_PACK));

                        if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
                                queue(c, b);
                                queue(c, a);
                        } else {
                                queue(c, a);
                                queue(c, b);
                        }
                        break;
                }

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);

                        break;

                case QOP_R4_UNPACK_A:
                case QOP_R4_UNPACK_B:
                case QOP_R4_UNPACK_C:
                case QOP_R4_UNPACK_D:
                        queue(c, qpu_a_MOV(dst, qpu_r4()));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_R4_8A +
                                                       (qinst->op -
                                                        QOP_R4_UNPACK_A),
                                                       QPU_UNPACK);

                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, src[0], &src[1]);

                        if (translate[qinst->op].is_mul) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        }
                        break;
                }
        }

        serialize_insts(c);

        /* thread end can't have VPM write */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM) {
                serialize_one_inst(c, qpu_NOP());
        }

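        /* Set the program end signal on the last instruction; the two NOPs
         * that follow fill the delay slots the QPU still executes after
         * thread end.
         */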
        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        serialize_one_inst(c, qpu_NOP());
        serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);
}