vc4: Write the VPM read setup multiple times to queue all the inputs.
[mesa.git] src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s:\n", qir_get_stage_name(c->stage));

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

struct queued_qpu_inst {
        struct simple_node link;
        uint64_t inst;
};

static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = calloc(1, sizeof(*q));
        q->inst = inst;
        insert_at_tail(&c->qpu_inst_list, &q->link);
}

static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
        return &q->inst;
}

static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_A)
                        src->mux = QPU_MUX_B;
                else
                        src->mux = QPU_MUX_A;
                return true;

        default:
                return false;
        }
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead.
 */
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg *src0, struct qpu_reg *src1)
{
        if ((src0->mux != QPU_MUX_A && src0->mux != QPU_MUX_B) ||
            src0->mux != src1->mux ||
            src0->addr == src1->addr) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

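        /* Otherwise, spill src1 through accumulator r3, which this file
         * uses as its scratch register.
         */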
        queue(c, qpu_a_MOV(qpu_r3(), *src1));
        *src1 = qpu_r3();
}

static void
serialize_one_inst(struct vc4_compile *c, uint64_t inst)
{
        if (c->qpu_inst_count >= c->qpu_inst_size) {
                c->qpu_inst_size = MAX2(16, c->qpu_inst_size * 2);
                c->qpu_insts = realloc(c->qpu_insts,
                                       c->qpu_inst_size * sizeof(uint64_t));
        }
        c->qpu_insts[c->qpu_inst_count++] = inst;
}

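/**
 * Drains the queued instructions into the final program, inserting NOPs as
 * needed to satisfy the QPU's instruction scheduling constraints.
 */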
static void
serialize_insts(struct vc4_compile *c)
{
        int last_sfu_write = -10;
        bool scoreboard_wait_emitted = false;

        while (!is_empty_list(&c->qpu_inst_list)) {
                struct queued_qpu_inst *q =
                        (struct queued_qpu_inst *)first_elem(&c->qpu_inst_list);
                uint32_t last_waddr_a = QPU_W_NOP, last_waddr_b = QPU_W_NOP;
                uint32_t raddr_a = QPU_GET_FIELD(q->inst, QPU_RADDR_A);
                uint32_t raddr_b = QPU_GET_FIELD(q->inst, QPU_RADDR_B);

                if (c->qpu_inst_count > 0) {
                        uint64_t last_inst = c->qpu_insts[c->qpu_inst_count -
                                                          1];
                        uint32_t last_waddr_add = QPU_GET_FIELD(last_inst,
                                                                QPU_WADDR_ADD);
                        uint32_t last_waddr_mul = QPU_GET_FIELD(last_inst,
                                                                QPU_WADDR_MUL);

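                        /* The WS bit swaps which physical regfile each unit
                         * writes: when set, the add result lands in B and
                         * the mul result in A.
                         */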
                        if (last_inst & QPU_WS) {
                                last_waddr_a = last_waddr_mul;
                                last_waddr_b = last_waddr_add;
                        } else {
                                last_waddr_a = last_waddr_add;
                                last_waddr_b = last_waddr_mul;
                        }
                }

                uint32_t src_muxes[] = {
                        QPU_GET_FIELD(q->inst, QPU_ADD_A),
                        QPU_GET_FIELD(q->inst, QPU_ADD_B),
                        QPU_GET_FIELD(q->inst, QPU_MUL_A),
                        QPU_GET_FIELD(q->inst, QPU_MUL_B),
                };

                /* "An instruction must not read from a location in physical
                 * regfile A or B that was written to by the previous
                 * instruction."
                 */
                bool needs_raddr_vs_waddr_nop = false;
                bool reads_r4 = false;
                for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
                        if ((raddr_a < 32 &&
                             src_muxes[i] == QPU_MUX_A &&
                             last_waddr_a == raddr_a) ||
                            (raddr_b < 32 &&
                             src_muxes[i] == QPU_MUX_B &&
                             last_waddr_b == raddr_b)) {
                                needs_raddr_vs_waddr_nop = true;
                        }
                        if (src_muxes[i] == QPU_MUX_R4)
                                reads_r4 = true;
                }

                if (needs_raddr_vs_waddr_nop) {
                        serialize_one_inst(c, qpu_NOP());
                }

                /* "After an SFU lookup instruction, accumulator r4 must not
                 * be read in the following two instructions. Any other
                 * instruction that results in r4 being written (that is, TMU
                 * read, TLB read, SFU lookup) cannot occur in the two
                 * instructions following an SFU lookup."
                 */
                if (reads_r4) {
                        while (c->qpu_inst_count - last_sfu_write < 3) {
                                serialize_one_inst(c, qpu_NOP());
                        }
                }

                uint32_t waddr_a = QPU_GET_FIELD(q->inst, QPU_WADDR_ADD);
                uint32_t waddr_m = QPU_GET_FIELD(q->inst, QPU_WADDR_MUL);
                if ((waddr_a >= QPU_W_SFU_RECIP && waddr_a <= QPU_W_SFU_LOG) ||
                    (waddr_m >= QPU_W_SFU_RECIP && waddr_m <= QPU_W_SFU_LOG)) {
                        last_sfu_write = c->qpu_inst_count;
                }

                /* "A scoreboard wait must not occur in the first two
                 * instructions of a fragment shader. This is either the
                 * explicit Wait for Scoreboard signal or an implicit wait
                 * with the first tile-buffer read or write instruction."
                 */
                if (!scoreboard_wait_emitted &&
                    (waddr_a == QPU_W_TLB_Z || waddr_m == QPU_W_TLB_Z ||
                     waddr_a == QPU_W_TLB_COLOR_MS ||
                     waddr_m == QPU_W_TLB_COLOR_MS ||
                     waddr_a == QPU_W_TLB_COLOR_ALL ||
                     waddr_m == QPU_W_TLB_COLOR_ALL ||
                     QPU_GET_FIELD(q->inst, QPU_SIG) == QPU_SIG_COLOR_LOAD)) {
                        while (c->qpu_inst_count < 3 ||
                               QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                                             QPU_SIG) != QPU_SIG_NONE) {
                                serialize_one_inst(c, qpu_NOP());
                        }
                        c->qpu_insts[c->qpu_inst_count - 1] =
                                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                            QPU_SIG_WAIT_FOR_SCOREBOARD);
                        scoreboard_wait_emitted = true;
                }

                serialize_one_inst(c, q->inst);

                remove_from_list(&q->link);
                free(q);
        }
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;

        make_empty_list(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
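                        /* Generic block read setup: 32-bit horizontal
                         * vectors at stride 1, with the vector count in
                         * bits 23:20 (0 encodes 16) and the VPM base
                         * address in the low byte.
                         */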
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

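                /* Write setup: the same 32-bit horizontal, stride-1 layout,
                 * starting at VPM address 0.
                 */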
                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        struct simple_node *node;
        foreach(node, &c->instructions) {
                struct qinst *qinst = (struct qinst *)node;

#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                static const struct {
                        uint32_t op;
                        bool is_mul;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name, false}
#define M(name) [QOP_##name] = {QPU_M_##name, true}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(MUL24),
                };

                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op. */
                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                        }
                        break;

                case QOP_SF:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

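                        /* The low bit of the condition code toggles between
                         * the "set" and "clear" variants, so ^ 1 inverts
                         * the condition for the zero write.
                         */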
                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

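                        /* Then write the other source under the inverted
                         * condition.
                         */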
                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_VPM_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_VPM), src[0]));
                        break;

                case QOP_VPM_READ:
                        queue(c, qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)));
                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

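                        /* The SFU result lands in r4 after two instructions
                         * of latency; serialize_insts() inserts the NOPs
                         * needed before this read.
                         */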
                        queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_PACK_COLORS:
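                        /* Pack each source into one byte of r3 using the
                         * MUL-unit 8A..8D pack modes (PM set), then move the
                         * accumulated result to the destination.
                         */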
                        for (int i = 0; i < 4; i++) {
                                queue(c, qpu_m_MOV(qpu_r3(), src[i]));
                                *last_inst(c) |= QPU_PM;
                                *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
                                                               QPU_PACK);
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r3()));

                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_DISCARD_SETUP:
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP),
                                           src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
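                        /* The color load signal reads the current tile
                         * buffer color into r4.
                         */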
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

                case QOP_PACK_SCALED: {
                        uint64_t a = (qpu_a_MOV(dst, src[0]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16A,
                                                    QPU_PACK));
                        uint64_t b = (qpu_a_MOV(dst, src[1]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16B,
                                                    QPU_PACK));

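                        /* If dst aliases src[1], emit the 16B half first so
                         * src[1] is read before the 16A write clobbers it.
                         */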
                        if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
                                queue(c, b);
                                queue(c, a);
                        } else {
                                queue(c, a);
                                queue(c, b);
                        }
                        break;
                }

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
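                        /* The TMU0 S/T/R/B write addresses are sequential,
                         * in the same order as the QOP_TEX_* opcodes.
                         */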
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_RESULT:
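                        /* The TMU load signal returns the texture result in
                         * r4.
                         */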
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);

                        break;

                case QOP_R4_UNPACK_A:
                case QOP_R4_UNPACK_B:
                case QOP_R4_UNPACK_C:
                case QOP_R4_UNPACK_D:
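                        /* With PM set, the unpack field selects a byte lane
                         * of r4.
                         */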
                        assert(src[0].mux == QPU_MUX_R4);
                        queue(c, qpu_a_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_R4_UNPACK_A),
                                                       QPU_UNPACK);

                        break;

                case QOP_UNPACK_8A:
                case QOP_UNPACK_8B:
                case QOP_UNPACK_8C:
                case QOP_UNPACK_8D: {
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're setting the pack bits, if the
                         * destination is in A it would get re-packed.
                         */
                        struct qpu_reg orig_dst = dst;
                        if (orig_dst.mux == QPU_MUX_A)
                                dst = qpu_rn(3);

                        queue(c, qpu_a_FMAX(dst, src[0], src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_UNPACK_8A),
                                                       QPU_UNPACK);

                        if (orig_dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(orig_dst, dst));
                        }
                }
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        /* An all-zero entry means the op is missing from
                         * the translate[] table (0 is NOP).
                         */
                        assert(translate[qinst->op].op != 0);

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, &src[0], &src[1]);

                        if (translate[qinst->op].is_mul) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        }
                        break;
                }
        }

        serialize_insts(c);

        /* thread end can't have VPM write */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM) {
                serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                serialize_one_inst(c, qpu_NOP());
        }

        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
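        /* The thread-end signal has two delay slots that still execute, so
         * pad them with NOPs.
         */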
        serialize_one_inst(c, qpu_NOP());
        serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}