vc4: Don't forget to do initial tile clearing for depth/stencil.
[mesa.git] src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"

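/* Dumps the raw hex and disassembly of each generated QPU instruction to
 * stderr, for when QPU debugging is enabled.
 */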
static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s:\n", qir_get_stage_name(c->stage));

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

struct queued_qpu_inst {
        struct simple_node link;
        uint64_t inst;
};

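/* Adds an instruction to the tail of the compile's instruction list.  The
 * queued instructions become the final program in serialize_insts(), which
 * gets a chance to insert NOPs for hardware hazards first.
 */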
static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = calloc(1, sizeof(*q));
        q->inst = inst;
        insert_at_tail(&c->qpu_inst_list, &q->link);
}

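/* Returns a pointer to the most recently queued instruction, so that its
 * condition, signal, or pack fields can be patched after the fact.
 */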
static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
        return &q->inst;
}

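/* Rewrites the condition code of the last queued add-pipe operation. */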
static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead.
 */
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg src0, struct qpu_reg *src1)
{
        /* Only reads from the same regfile share a raddr field; A-vs-B
         * reads use separate fields and don't conflict.
         */
        if ((src0.mux == QPU_MUX_A || src0.mux == QPU_MUX_B) &&
            src0.mux == src1->mux &&
            src0.addr != src1->addr) {
                queue(c, qpu_a_MOV(qpu_r3(), *src1));
                *src1 = qpu_r3();
        }
}

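/* Appends a single instruction to the final program array, growing it as
 * needed.
 */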
static void
serialize_one_inst(struct vc4_compile *c, uint64_t inst)
{
        if (c->qpu_inst_count >= c->qpu_inst_size) {
                c->qpu_inst_size = MAX2(16, c->qpu_inst_size * 2);
                c->qpu_insts = realloc(c->qpu_insts,
                                       c->qpu_inst_size * sizeof(uint64_t));
        }
        c->qpu_insts[c->qpu_inst_count++] = inst;
}

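/* Walks the queued instruction list and emits the final program, inserting
 * NOPs to satisfy the hardware's hazard rules: the regfile write-then-read
 * delay slot, the r4 latency after an SFU lookup, and the requirement that
 * the scoreboard wait not land in the first two fragment shader
 * instructions.
 */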
static void
serialize_insts(struct vc4_compile *c)
{
        int last_sfu_write = -10;
        bool scoreboard_wait_emitted = false;

        while (!is_empty_list(&c->qpu_inst_list)) {
                struct queued_qpu_inst *q =
                        (struct queued_qpu_inst *)first_elem(&c->qpu_inst_list);
                uint32_t last_waddr_a = QPU_W_NOP, last_waddr_b = QPU_W_NOP;
                uint32_t raddr_a = QPU_GET_FIELD(q->inst, QPU_RADDR_A);
                uint32_t raddr_b = QPU_GET_FIELD(q->inst, QPU_RADDR_B);

                if (c->qpu_inst_count > 0) {
                        uint64_t last_inst = c->qpu_insts[c->qpu_inst_count -
                                                          1];
                        uint32_t last_waddr_add = QPU_GET_FIELD(last_inst,
                                                                QPU_WADDR_ADD);
                        uint32_t last_waddr_mul = QPU_GET_FIELD(last_inst,
                                                                QPU_WADDR_MUL);

                        if (last_inst & QPU_WS) {
                                last_waddr_a = last_waddr_mul;
                                last_waddr_b = last_waddr_add;
                        } else {
                                last_waddr_a = last_waddr_add;
                                last_waddr_b = last_waddr_mul;
                        }
                }

                uint32_t src_muxes[] = {
                        QPU_GET_FIELD(q->inst, QPU_ADD_A),
                        QPU_GET_FIELD(q->inst, QPU_ADD_B),
                        QPU_GET_FIELD(q->inst, QPU_MUL_A),
                        QPU_GET_FIELD(q->inst, QPU_MUL_B),
                };

                /* "An instruction must not read from a location in physical
                 * regfile A or B that was written to by the previous
                 * instruction."
                 */
                bool needs_raddr_vs_waddr_nop = false;
                bool reads_r4 = false;
                for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
                        if ((raddr_a < 32 &&
                             src_muxes[i] == QPU_MUX_A &&
                             last_waddr_a == raddr_a) ||
                            (raddr_b < 32 &&
                             src_muxes[i] == QPU_MUX_B &&
                             last_waddr_b == raddr_b)) {
                                needs_raddr_vs_waddr_nop = true;
                        }
                        if (src_muxes[i] == QPU_MUX_R4)
                                reads_r4 = true;
                }

                if (needs_raddr_vs_waddr_nop) {
                        serialize_one_inst(c, qpu_NOP());
                }

                /* "After an SFU lookup instruction, accumulator r4 must not
                 * be read in the following two instructions. Any other
                 * instruction that results in r4 being written (that is, TMU
                 * read, TLB read, SFU lookup) cannot occur in the two
                 * instructions following an SFU lookup."
                 */
                if (reads_r4) {
                        while (c->qpu_inst_count - last_sfu_write < 3) {
                                serialize_one_inst(c, qpu_NOP());
                        }
                }

                uint32_t waddr_a = QPU_GET_FIELD(q->inst, QPU_WADDR_ADD);
                uint32_t waddr_m = QPU_GET_FIELD(q->inst, QPU_WADDR_MUL);
                if ((waddr_a >= QPU_W_SFU_RECIP && waddr_a <= QPU_W_SFU_LOG) ||
                    (waddr_m >= QPU_W_SFU_RECIP && waddr_m <= QPU_W_SFU_LOG)) {
                        last_sfu_write = c->qpu_inst_count;
                }

                /* "A scoreboard wait must not occur in the first two
                 * instructions of a fragment shader. This is either the
                 * explicit Wait for Scoreboard signal or an implicit wait
                 * with the first tile-buffer read or write instruction."
                 */
                if (!scoreboard_wait_emitted &&
                    (waddr_a == QPU_W_TLB_Z || waddr_m == QPU_W_TLB_Z ||
                     waddr_a == QPU_W_TLB_COLOR_MS ||
                     waddr_m == QPU_W_TLB_COLOR_MS ||
                     waddr_a == QPU_W_TLB_COLOR_ALL ||
                     waddr_m == QPU_W_TLB_COLOR_ALL ||
                     QPU_GET_FIELD(q->inst, QPU_SIG) == QPU_SIG_COLOR_LOAD)) {
                        while (c->qpu_inst_count < 3 ||
                               QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                                             QPU_SIG) != QPU_SIG_NONE) {
                                serialize_one_inst(c, qpu_NOP());
                        }
                        c->qpu_insts[c->qpu_inst_count - 1] =
                                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                            QPU_SIG_WAIT_FOR_SCOREBOARD);
                        scoreboard_wait_emitted = true;
                }

                serialize_one_inst(c, q->inst);

                remove_from_list(&q->link);
                free(q);
        }
}

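/* Translates the QIR instruction list into QPU instructions, using a trivial
 * register allocator: each temporary gets the first free register
 * (accumulators r0-r2 first, then regfile A, then regfile B) at its first
 * write, and the register is recycled after the temporary's last use, based
 * on the use counts gathered below.
 */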
void
vc4_generate_code(struct vc4_compile *c)
{
        struct qpu_reg allocate_to_qpu_reg[3 + 32 + 32];
        bool reg_in_use[ARRAY_SIZE(allocate_to_qpu_reg)];
        int *reg_allocated = calloc(c->num_temps, sizeof(*reg_allocated));
        int *reg_uses_remaining =
                calloc(c->num_temps, sizeof(*reg_uses_remaining));
        bool discard = false;

        for (int i = 0; i < ARRAY_SIZE(reg_in_use); i++)
                reg_in_use[i] = false;
        for (int i = 0; i < c->num_temps; i++)
                reg_allocated[i] = -1;
        for (int i = 0; i < 3; i++)
                allocate_to_qpu_reg[i] = qpu_rn(i);
        for (int i = 0; i < 32; i++)
                allocate_to_qpu_reg[i + 3] = qpu_ra(i);
        for (int i = 0; i < 32; i++)
                allocate_to_qpu_reg[i + 3 + 32] = qpu_rb(i);

        make_empty_list(&c->qpu_inst_list);

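        /* Count the uses of each temporary so its register can be recycled
         * once the last use has been emitted.  Also reserve the Z/W payload
         * register up front if anything will read it.
         */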
        struct simple_node *node;
        foreach(node, &c->instructions) {
                struct qinst *qinst = (struct qinst *)node;

                if (qinst->dst.file == QFILE_TEMP)
                        reg_uses_remaining[qinst->dst.index]++;
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        if (qinst->src[i].file == QFILE_TEMP)
                                reg_uses_remaining[qinst->src[i].index]++;
                }
                if (qinst->op == QOP_TLB_PASSTHROUGH_Z_WRITE ||
                    qinst->op == QOP_FRAG_Z)
                        reg_in_use[3 + 32 + QPU_R_FRAG_PAYLOAD_ZW] = true;
        }

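        /* For vertex-pipeline shaders, configure the VPM before any reads or
         * writes happen.  Per the VideoCore IV VPM setup word encoding,
         * 0x00001a00 selects horizontal, 32-bit, stride-1 accesses starting
         * at address 0, and the read setup additionally carries the vertex
         * attribute count in the NUM field (0x00100000 per element).
         */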
        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                         (0x00001a00 +
                                          0x00100000 * c->num_inputs)));
                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        foreach(node, &c->instructions) {
                struct qinst *qinst = (struct qinst *)node;

#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

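                /* Mapping from simple QIR ALU opcodes to the corresponding
                 * QPU opcode, along with whether the operation runs on the
                 * add or the mul pipeline.
                 */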
                static const struct {
                        uint32_t op;
                        bool is_mul;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name, false}
#define M(name) [QOP_##name] = {QPU_M_##name, true}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(MUL24),
                };

                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                if (reg_allocated[index] == -1) {
                                        fprintf(stderr, "undefined reg use: ");
                                        qir_dump_inst(qinst);
                                        fprintf(stderr, "\n");

                                        src[i] = qpu_rn(0);
                                } else {
                                        src[i] = allocate_to_qpu_reg[reg_allocated[index]];
                                        reg_uses_remaining[index]--;
                                        if (reg_uses_remaining[index] == 0)
                                                reg_in_use[reg_allocated[index]] = false;
                                }
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;

                case QFILE_TEMP:
                        if (reg_allocated[qinst->dst.index] == -1) {
                                int alloc;
                                for (alloc = 0;
                                     alloc < ARRAY_SIZE(reg_in_use);
                                     alloc++) {
                                        /* The pack flags require an A-file register. */
                                        if (qinst->op == QOP_PACK_SCALED &&
                                            allocate_to_qpu_reg[alloc].mux != QPU_MUX_A) {
                                                continue;
                                        }

                                        if (!reg_in_use[alloc])
                                                break;
                                }
                                assert(alloc != ARRAY_SIZE(reg_in_use) && "need better reg alloc");
                                reg_in_use[alloc] = true;
                                reg_allocated[qinst->dst.index] = alloc;
                        }

                        dst = allocate_to_qpu_reg[reg_allocated[qinst->dst.index]];

                        reg_uses_remaining[qinst->dst.index]--;
                        if (reg_uses_remaining[qinst->dst.index] == 0) {
                                reg_in_use[reg_allocated[qinst->dst.index]] =
                                        false;
                        }
                        break;

                case QFILE_VARY:
                case QFILE_UNIF:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op. */
                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                        }
                        break;

                case QOP_SF:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
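                        /* Write the source under the select's condition, then
                         * zero the destination under the opposite one.
                         * XORing the low bit of (op - QOP_SEL_X_0_ZS) flips
                         * ZS<->ZC and NS<->NC.
                         */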
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_VPM_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_VPM), src[0]));
                        break;

                case QOP_VPM_READ:
                        queue(c, qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)));
                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_PACK_COLORS:
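                        /* Pack each of the four channels into one byte of r3
                         * using the MUL-pipe 8A..8D PM pack modes, then copy
                         * the assembled word to the destination.
                         */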
                        for (int i = 0; i < 4; i++) {
                                queue(c, qpu_m_MOV(qpu_r3(), src[i]));
                                *last_inst(c) |= QPU_PM;
                                *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
                                                               QPU_PACK);
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r3()));

                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Z:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_FRAG_PAYLOAD_ZW)));
                        break;

                case QOP_FRAG_RCP_W:
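                        /* Feed W from the fragment Z/W payload through the
                         * SFU reciprocal.  The result lands in r4 two
                         * instructions later; serialize_insts() inserts the
                         * NOPs needed before the read.
                         */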
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                           qpu_ra(QPU_R_FRAG_PAYLOAD_ZW)));

                        queue(c, qpu_a_MOV(dst, qpu_r4()));
                        break;

                case QOP_TLB_DISCARD_SETUP:
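                        /* Set the SF flags from the discard condition.  The
                         * TLB writes below are then made conditional on ZS,
                         * so discarded pixels leave the tile buffer
                         * untouched.
                         */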
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_PASSTHROUGH_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z),
                                           qpu_rb(QPU_R_FRAG_PAYLOAD_ZW)));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

                case QOP_PACK_SCALED: {
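                        /* Pack the two 16-bit halves using the regfile-A pack
                         * modes.  If dst aliases src[1], emit the 16B half
                         * first so that packing src[0] into dst doesn't
                         * clobber src[1] before it is read.
                         */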
                        uint64_t a = (qpu_a_MOV(dst, src[0]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16A,
                                                    QPU_PACK));
                        uint64_t b = (qpu_a_MOV(dst, src[1]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16B,
                                                    QPU_PACK));

                        if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
                                queue(c, b);
                                queue(c, a);
                        } else {
                                queue(c, a);
                                queue(c, b);
                        }
                        break;
                }

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);

                        break;

                case QOP_R4_UNPACK_A:
                case QOP_R4_UNPACK_B:
                case QOP_R4_UNPACK_C:
                case QOP_R4_UNPACK_D:
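                        /* Use the PM unpack path to extract byte A..D of r4
                         * into the destination.
                         */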
                        queue(c, qpu_a_MOV(dst, qpu_r4()));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_R4_8A +
                                                       (qinst->op -
                                                        QOP_R4_UNPACK_A),
                                                       QPU_UNPACK);

                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, src[0], &src[1]);

                        if (translate[qinst->op].is_mul) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        }
                        break;
                }
        }

        serialize_insts(c);

        /* thread end can't have VPM write */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM) {
                serialize_one_inst(c, qpu_NOP());
        }

        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        serialize_one_inst(c, qpu_NOP());
        serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        free(reg_allocated);
        free(reg_uses_remaining);

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);
}