vc4: Drop the explicit scoreboard wait.
mesa.git: src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

struct queued_qpu_inst {
        struct simple_node link;
        uint64_t inst;
};

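/*
 * Instructions are first queued on c->qpu_inst_list, then serialize_insts()
 * walks the list and inserts the NOPs the hardware's scheduling rules
 * require before the program is written out.
 */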
static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = calloc(1, sizeof(*q));
        q->inst = inst;
        insert_at_tail(&c->qpu_inst_list, &q->link);
}

static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
        return &q->inst;
}

static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_A)
                        src->mux = QPU_MUX_B;
                else
                        src->mux = QPU_MUX_A;
                return true;

        default:
                return false;
        }
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * at different addresses, even though an instruction has only one raddr
 * field per register file.
 *
 * In that case, we need to move one of them to a temporary that can be
 * used in the instruction, instead.
 */
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg *src0, struct qpu_reg *src1)
{
        if ((src0->mux != QPU_MUX_A && src0->mux != QPU_MUX_B) ||
            src0->mux != src1->mux ||
            src0->addr == src1->addr) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

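        /* Neither source could switch files, so fall back to copying src1
         * through r3.  This assumes r3 is free to clobber here; the backend
         * only uses r3 as short-lived scratch (see also QOP_PACK_COLORS).
         */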
        queue(c, qpu_a_MOV(qpu_r3(), *src1));
        *src1 = qpu_r3();
}

static void
serialize_one_inst(struct vc4_compile *c, uint64_t inst)
{
        if (c->qpu_inst_count >= c->qpu_inst_size) {
                c->qpu_inst_size = MAX2(16, c->qpu_inst_size * 2);
                c->qpu_insts = realloc(c->qpu_insts,
                                       c->qpu_inst_size * sizeof(uint64_t));
        }
        c->qpu_insts[c->qpu_inst_count++] = inst;
}

static void
serialize_insts(struct vc4_compile *c)
{
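        /* Sentinel far enough in the past that the r4-vs-SFU hazard check
         * below can't trigger before the first real SFU write.
         */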
        int last_sfu_write = -10;

        while (!is_empty_list(&c->qpu_inst_list)) {
                struct queued_qpu_inst *q =
                        (struct queued_qpu_inst *)first_elem(&c->qpu_inst_list);
                uint32_t last_waddr_a = QPU_W_NOP, last_waddr_b = QPU_W_NOP;
                uint32_t raddr_a = QPU_GET_FIELD(q->inst, QPU_RADDR_A);
                uint32_t raddr_b = QPU_GET_FIELD(q->inst, QPU_RADDR_B);

                if (c->qpu_inst_count > 0) {
                        uint64_t last_inst =
                                c->qpu_insts[c->qpu_inst_count - 1];
                        uint32_t last_waddr_add = QPU_GET_FIELD(last_inst,
                                                                QPU_WADDR_ADD);
                        uint32_t last_waddr_mul = QPU_GET_FIELD(last_inst,
                                                                QPU_WADDR_MUL);

                        if (last_inst & QPU_WS) {
                                last_waddr_a = last_waddr_mul;
                                last_waddr_b = last_waddr_add;
                        } else {
                                last_waddr_a = last_waddr_add;
                                last_waddr_b = last_waddr_mul;
                        }
                }

                uint32_t src_muxes[] = {
                        QPU_GET_FIELD(q->inst, QPU_ADD_A),
                        QPU_GET_FIELD(q->inst, QPU_ADD_B),
                        QPU_GET_FIELD(q->inst, QPU_MUL_A),
                        QPU_GET_FIELD(q->inst, QPU_MUL_B),
                };

                /* "An instruction must not read from a location in physical
                 * regfile A or B that was written to by the previous
                 * instruction."
                 */
                bool needs_raddr_vs_waddr_nop = false;
                bool reads_r4 = false;
                for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
                        if ((raddr_a < 32 &&
                             src_muxes[i] == QPU_MUX_A &&
                             last_waddr_a == raddr_a) ||
                            (raddr_b < 32 &&
                             src_muxes[i] == QPU_MUX_B &&
                             last_waddr_b == raddr_b)) {
                                needs_raddr_vs_waddr_nop = true;
                        }
                        if (src_muxes[i] == QPU_MUX_R4)
                                reads_r4 = true;
                }

                if (needs_raddr_vs_waddr_nop) {
                        serialize_one_inst(c, qpu_NOP());
                }

                /* "After an SFU lookup instruction, accumulator r4 must not
                 * be read in the following two instructions. Any other
                 * instruction that results in r4 being written (that is, TMU
                 * read, TLB read, SFU lookup) cannot occur in the two
                 * instructions following an SFU lookup."
                 */
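                /* Pad so that at least two instructions separate the SFU
                 * write recorded in last_sfu_write from this read of r4.
                 */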
                if (reads_r4) {
                        while (c->qpu_inst_count - last_sfu_write < 3) {
                                serialize_one_inst(c, qpu_NOP());
                        }
                }

                uint32_t waddr_a = QPU_GET_FIELD(q->inst, QPU_WADDR_ADD);
                uint32_t waddr_m = QPU_GET_FIELD(q->inst, QPU_WADDR_MUL);
                if ((waddr_a >= QPU_W_SFU_RECIP && waddr_a <= QPU_W_SFU_LOG) ||
                    (waddr_m >= QPU_W_SFU_RECIP && waddr_m <= QPU_W_SFU_LOG)) {
                        last_sfu_write = c->qpu_inst_count;
                }

                /* "A scoreboard wait must not occur in the first two
                 * instructions of a fragment shader. This is either the
                 * explicit Wait for Scoreboard signal or an implicit wait
                 * with the first tile-buffer read or write instruction."
                 */
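                /* Pad so the first tile-buffer access can't land in the
                 * first two instructions, and keep it from immediately
                 * following an instruction carrying a signal (our reading:
                 * the implicit wait must not fall in a signal's delay
                 * slots).
                 */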
                if (waddr_a == QPU_W_TLB_Z ||
                    waddr_m == QPU_W_TLB_Z ||
                    waddr_a == QPU_W_TLB_COLOR_MS ||
                    waddr_m == QPU_W_TLB_COLOR_MS ||
                    waddr_a == QPU_W_TLB_COLOR_ALL ||
                    waddr_m == QPU_W_TLB_COLOR_ALL ||
                    QPU_GET_FIELD(q->inst, QPU_SIG) == QPU_SIG_COLOR_LOAD) {
                        while (c->qpu_inst_count < 3 ||
                               QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                                             QPU_SIG) != QPU_SIG_NONE) {
                                serialize_one_inst(c, qpu_NOP());
                        }
                }

                serialize_one_inst(c, q->inst);

                remove_from_list(&q->link);
                free(q);
        }
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;

        make_empty_list(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
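                        /* VPMVCD read setup.  Bits 23:20 hold the number of
                         * vectors to read; 0x00001a00 appears to select
                         * horizontal, 32-bit-wide accesses with a stride of
                         * 1 (our decoding of the VideoCore IV VPM setup
                         * format).
                         */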
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

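                /* The VPM write setup uses the same low-bit encoding
                 * (horizontal, 32-bit, stride 1), with outputs starting at
                 * VPM address 0.
                 */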
                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        struct simple_node *node;
        foreach(node, &c->instructions) {
                struct qinst *qinst = (struct qinst *)node;

#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                static const struct {
                        uint32_t op;
                        bool is_mul;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name, false}
#define M(name) [QOP_##name] = {QPU_M_##name, true}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(MUL24),
                };

                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op. */
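                        /* Only an accumulator MOV onto the same accumulator
                         * is provably a no-op; for the A/B files, dst holds
                         * a write address and src a read address, and the
                         * two namespaces only partially coincide.
                         */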
                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                        }
                        break;

                case QOP_SF:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

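                        /* The QPU_COND codes come in set/clear pairs (ZS/ZC,
                         * NS/NC), so flipping bit 0 yields the inverse
                         * condition for the write of 0.
                         */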
                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_VPM_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_VPM), src[0]));
                        break;

                case QOP_VPM_READ:
                        queue(c, qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)));
                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_PACK_COLORS:
                        for (int i = 0; i < 4; i++) {
                                queue(c, qpu_m_MOV(qpu_r3(), src[i]));
                                *last_inst(c) |= QPU_PM;
                                *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
                                                               QPU_PACK);
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r3()));

                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions; the register
                         * allocator just maps them onto the Z/W payload
                         * registers.
                         */
                        break;

                case QOP_TLB_DISCARD_SETUP:
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

                case QOP_PACK_SCALED: {
                        uint64_t a = (qpu_a_MOV(dst, src[0]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16A,
                                                    QPU_PACK));
                        uint64_t b = (qpu_a_MOV(dst, src[1]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16B,
                                                    QPU_PACK));

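                        /* If dst aliases src[1], emit the 16B pack first so
                         * src[1] is read before the 16A pack overwrites its
                         * low half.
                         */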
                        if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
                                queue(c, b);
                                queue(c, a);
                        } else {
                                queue(c, a);
                                queue(c, b);
                        }
                        break;
                }

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_DIRECT:
                        fixup_raddr_conflict(c, &src[0], &src[1]);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);

                        break;

                case QOP_R4_UNPACK_A:
                case QOP_R4_UNPACK_B:
                case QOP_R4_UNPACK_C:
                case QOP_R4_UNPACK_D:
                        assert(src[0].mux == QPU_MUX_R4);
                        queue(c, qpu_a_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_R4_UNPACK_A),
                                                       QPU_UNPACK);

                        break;

                case QOP_UNPACK_8A:
                case QOP_UNPACK_8B:
                case QOP_UNPACK_8C:
                case QOP_UNPACK_8D: {
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're setting the pack bits, a destination
                         * in A would get re-packed, so route the result
                         * through an accumulator instead.
                         */
                        struct qpu_reg orig_dst = dst;
                        if (orig_dst.mux == QPU_MUX_A)
                                dst = qpu_rn(3);

                        queue(c, qpu_a_FMAX(dst, src[0], src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_UNPACK_8A),
                                                       QPU_UNPACK);

                        if (orig_dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(orig_dst, dst));
                        }
                }
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        /* Ops without a table entry read as 0 (a NOP) and
                         * should have been handled above.
                         */
                        assert(translate[qinst->op].op != 0);

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, &src[0], &src[1]);

                        if (translate[qinst->op].is_mul) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        }
                        break;
                }
        }

        serialize_insts(c);

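        /* The instruction that ends up with the thread-end signal has
         * several restrictions; pad with a NOP whenever the last real
         * instruction would violate one.
         */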
        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                serialize_one_inst(c, qpu_NOP());

        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
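        /* PROG_END is a signal, so the two following instructions still
         * execute as its delay slots.
         */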
        serialize_one_inst(c, qpu_NOP());
        serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
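                /* Now that the explicit scoreboard wait is gone, fragment
                 * shaders rely on the implicit wait at the first tile-buffer
                 * access, but they must still release the scoreboard when
                 * the thread ends.
                 */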
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}