vc4: Add support for stencil operations.
[mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_emit.c
1 /*
2 * Copyright © 2014 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <inttypes.h>
25
26 #include "vc4_context.h"
27 #include "vc4_qir.h"
28 #include "vc4_qpu.h"
29
30 static void
31 vc4_dump_program(struct vc4_compile *c)
32 {
33 fprintf(stderr, "%s:\n", qir_get_stage_name(c->stage));
34
35 for (int i = 0; i < c->qpu_inst_count; i++) {
36 fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
37 vc4_qpu_disasm(&c->qpu_insts[i], 1);
38 fprintf(stderr, "\n");
39 }
40 }
41
/**
 * A single encoded QPU instruction waiting on c->qpu_inst_list until
 * serialize_insts() copies it into the final instruction array.
 */
struct queued_qpu_inst {
        /* List linkage.  Must stay the first member: last_inst() and
         * serialize_insts() cast list nodes directly to this struct.
         */
        struct simple_node link;
        /* The 64-bit encoded instruction word. */
        uint64_t inst;
};
46
47 static void
48 queue(struct vc4_compile *c, uint64_t inst)
49 {
50 struct queued_qpu_inst *q = calloc(1, sizeof(*q));
51 q->inst = inst;
52 insert_at_tail(&c->qpu_inst_list, &q->link);
53 }
54
55 static uint64_t *
56 last_inst(struct vc4_compile *c)
57 {
58 struct queued_qpu_inst *q =
59 (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
60 return &q->inst;
61 }
62
/**
 * Rewrites the most recently queued instruction by passing it through
 * qpu_set_cond_add() with the given condition code.
 */
static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        uint64_t *inst = last_inst(c);

        *inst = qpu_set_cond_add(*inst, cond);
}
68
69 /**
70 * This is used to resolve the fact that we might register-allocate two
71 * different operands of an instruction to the same physical register file
72 * even though instructions have only one field for the register file source
73 * address.
74 *
75 * In that case, we need to move one to a temporary that can be used in the
76 * instruction, instead.
77 */
78 static void
79 fixup_raddr_conflict(struct vc4_compile *c,
80 struct qpu_reg src0, struct qpu_reg *src1)
81 {
82 if ((src0.mux == QPU_MUX_A || src0.mux == QPU_MUX_B) &&
83 (src1->mux == QPU_MUX_A || src1->mux == QPU_MUX_B) &&
84 src0.addr != src1->addr) {
85 queue(c, qpu_a_MOV(qpu_r3(), *src1));
86 *src1 = qpu_r3();
87 }
88 }
89
90 static void
91 serialize_one_inst(struct vc4_compile *c, uint64_t inst)
92 {
93 if (c->qpu_inst_count >= c->qpu_inst_size) {
94 c->qpu_inst_size = MAX2(16, c->qpu_inst_size * 2);
95 c->qpu_insts = realloc(c->qpu_insts,
96 c->qpu_inst_size * sizeof(uint64_t));
97 }
98 c->qpu_insts[c->qpu_inst_count++] = inst;
99 }
100
101 static void
102 serialize_insts(struct vc4_compile *c)
103 {
104 int last_sfu_write = -10;
105 bool scoreboard_wait_emitted = false;
106
107 while (!is_empty_list(&c->qpu_inst_list)) {
108 struct queued_qpu_inst *q =
109 (struct queued_qpu_inst *)first_elem(&c->qpu_inst_list);
110 uint32_t last_waddr_a = QPU_W_NOP, last_waddr_b = QPU_W_NOP;
111 uint32_t raddr_a = QPU_GET_FIELD(q->inst, QPU_RADDR_A);
112 uint32_t raddr_b = QPU_GET_FIELD(q->inst, QPU_RADDR_B);
113
114 if (c->qpu_inst_count > 0) {
115 uint64_t last_inst = c->qpu_insts[c->qpu_inst_count -
116 1];
117 uint32_t last_waddr_add = QPU_GET_FIELD(last_inst,
118 QPU_WADDR_ADD);
119 uint32_t last_waddr_mul = QPU_GET_FIELD(last_inst,
120 QPU_WADDR_MUL);
121
122 if (last_inst & QPU_WS) {
123 last_waddr_a = last_waddr_mul;
124 last_waddr_b = last_waddr_add;
125 } else {
126 last_waddr_a = last_waddr_add;
127 last_waddr_b = last_waddr_mul;
128 }
129 }
130
131 uint32_t src_muxes[] = {
132 QPU_GET_FIELD(q->inst, QPU_ADD_A),
133 QPU_GET_FIELD(q->inst, QPU_ADD_B),
134 QPU_GET_FIELD(q->inst, QPU_MUL_A),
135 QPU_GET_FIELD(q->inst, QPU_MUL_B),
136 };
137
138 /* "An instruction must not read from a location in physical
139 * regfile A or B that was written to by the previous
140 * instruction."
141 */
142 bool needs_raddr_vs_waddr_nop = false;
143 bool reads_r4 = false;
144 for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
145 if ((raddr_a < 32 &&
146 src_muxes[i] == QPU_MUX_A &&
147 last_waddr_a == raddr_a) ||
148 (raddr_b < 32 &&
149 src_muxes[i] == QPU_MUX_B &&
150 last_waddr_b == raddr_b)) {
151 needs_raddr_vs_waddr_nop = true;
152 }
153 if (src_muxes[i] == QPU_MUX_R4)
154 reads_r4 = true;
155 }
156
157 if (needs_raddr_vs_waddr_nop) {
158 serialize_one_inst(c, qpu_NOP());
159 }
160
161 /* "After an SFU lookup instruction, accumulator r4 must not
162 * be read in the following two instructions. Any other
163 * instruction that results in r4 being written (that is, TMU
164 * read, TLB read, SFU lookup) cannot occur in the two
165 * instructions following an SFU lookup."
166 */
167 if (reads_r4) {
168 while (c->qpu_inst_count - last_sfu_write < 3) {
169 serialize_one_inst(c, qpu_NOP());
170 }
171 }
172
173 uint32_t waddr_a = QPU_GET_FIELD(q->inst, QPU_WADDR_ADD);
174 uint32_t waddr_m = QPU_GET_FIELD(q->inst, QPU_WADDR_MUL);
175 if ((waddr_a >= QPU_W_SFU_RECIP && waddr_a <= QPU_W_SFU_LOG) ||
176 (waddr_m >= QPU_W_SFU_RECIP && waddr_m <= QPU_W_SFU_LOG)) {
177 last_sfu_write = c->qpu_inst_count;
178 }
179
180 /* "A scoreboard wait must not occur in the first two
181 * instructions of a fragment shader. This is either the
182 * explicit Wait for Scoreboard signal or an implicit wait
183 * with the first tile-buffer read or write instruction."
184 */
185 if (!scoreboard_wait_emitted &&
186 (waddr_a == QPU_W_TLB_Z || waddr_m == QPU_W_TLB_Z ||
187 waddr_a == QPU_W_TLB_COLOR_MS ||
188 waddr_m == QPU_W_TLB_COLOR_MS ||
189 waddr_a == QPU_W_TLB_COLOR_ALL ||
190 waddr_m == QPU_W_TLB_COLOR_ALL ||
191 QPU_GET_FIELD(q->inst, QPU_SIG) == QPU_SIG_COLOR_LOAD)) {
192 while (c->qpu_inst_count < 3 ||
193 QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
194 QPU_SIG) != QPU_SIG_NONE) {
195 serialize_one_inst(c, qpu_NOP());
196 }
197 c->qpu_insts[c->qpu_inst_count - 1] =
198 qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
199 QPU_SIG_WAIT_FOR_SCOREBOARD);
200 scoreboard_wait_emitted = true;
201 }
202
203 serialize_one_inst(c, q->inst);
204
205 remove_from_list(&q->link);
206 free(q);
207 }
208 }
209
210 void
211 vc4_generate_code(struct vc4_compile *c)
212 {
213 struct qpu_reg *temp_registers = vc4_register_allocate(c);
214 bool discard = false;
215
216 make_empty_list(&c->qpu_inst_list);
217
218 switch (c->stage) {
219 case QSTAGE_VERT:
220 case QSTAGE_COORD:
221 queue(c, qpu_load_imm_ui(qpu_vrsetup(),
222 (0x00001a00 +
223 0x00100000 * c->num_inputs)));
224 queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
225 break;
226 case QSTAGE_FRAG:
227 break;
228 }
229
230 struct simple_node *node;
231 foreach(node, &c->instructions) {
232 struct qinst *qinst = (struct qinst *)node;
233
234 #if 0
235 fprintf(stderr, "translating qinst to qpu: ");
236 qir_dump_inst(qinst);
237 fprintf(stderr, "\n");
238 #endif
239
240 static const struct {
241 uint32_t op;
242 bool is_mul;
243 } translate[] = {
244 #define A(name) [QOP_##name] = {QPU_A_##name, false}
245 #define M(name) [QOP_##name] = {QPU_M_##name, true}
246 A(FADD),
247 A(FSUB),
248 A(FMIN),
249 A(FMAX),
250 A(FMINABS),
251 A(FMAXABS),
252 A(FTOI),
253 A(ITOF),
254 A(ADD),
255 A(SUB),
256 A(SHL),
257 A(SHR),
258 A(ASR),
259 A(MIN),
260 A(MAX),
261 A(AND),
262 A(OR),
263 A(XOR),
264 A(NOT),
265
266 M(FMUL),
267 M(MUL24),
268 };
269
270 struct qpu_reg src[4];
271 for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
272 int index = qinst->src[i].index;
273 switch (qinst->src[i].file) {
274 case QFILE_NULL:
275 src[i] = qpu_rn(0);
276 break;
277 case QFILE_TEMP:
278 src[i] = temp_registers[index];
279 break;
280 case QFILE_UNIF:
281 src[i] = qpu_unif();
282 break;
283 case QFILE_VARY:
284 src[i] = qpu_vary();
285 break;
286 }
287 }
288
289 struct qpu_reg dst;
290 switch (qinst->dst.file) {
291 case QFILE_NULL:
292 dst = qpu_ra(QPU_W_NOP);
293 break;
294 case QFILE_TEMP:
295 dst = temp_registers[qinst->dst.index];
296 break;
297 case QFILE_VARY:
298 case QFILE_UNIF:
299 assert(!"not reached");
300 break;
301 }
302
303 switch (qinst->op) {
304 case QOP_MOV:
305 /* Skip emitting the MOV if it's a no-op. */
306 if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
307 dst.mux != src[0].mux || dst.addr != src[0].addr) {
308 queue(c, qpu_a_MOV(dst, src[0]));
309 }
310 break;
311
312 case QOP_SF:
313 queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
314 *last_inst(c) |= QPU_SF;
315 break;
316
317 case QOP_SEL_X_0_ZS:
318 case QOP_SEL_X_0_ZC:
319 case QOP_SEL_X_0_NS:
320 case QOP_SEL_X_0_NC:
321 queue(c, qpu_a_MOV(dst, src[0]));
322 set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
323 QPU_COND_ZS);
324
325 queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
326 set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
327 1) + QPU_COND_ZS);
328 break;
329
330 case QOP_SEL_X_Y_ZS:
331 case QOP_SEL_X_Y_ZC:
332 case QOP_SEL_X_Y_NS:
333 case QOP_SEL_X_Y_NC:
334 queue(c, qpu_a_MOV(dst, src[0]));
335 set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
336 QPU_COND_ZS);
337
338 queue(c, qpu_a_MOV(dst, src[1]));
339 set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
340 1) + QPU_COND_ZS);
341
342 break;
343
344 case QOP_VPM_WRITE:
345 queue(c, qpu_a_MOV(qpu_ra(QPU_W_VPM), src[0]));
346 break;
347
348 case QOP_VPM_READ:
349 queue(c, qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)));
350 break;
351
352 case QOP_RCP:
353 case QOP_RSQ:
354 case QOP_EXP2:
355 case QOP_LOG2:
356 switch (qinst->op) {
357 case QOP_RCP:
358 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
359 src[0]));
360 break;
361 case QOP_RSQ:
362 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
363 src[0]));
364 break;
365 case QOP_EXP2:
366 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
367 src[0]));
368 break;
369 case QOP_LOG2:
370 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
371 src[0]));
372 break;
373 default:
374 abort();
375 }
376
377 queue(c, qpu_a_MOV(dst, qpu_r4()));
378
379 break;
380
381 case QOP_PACK_COLORS:
382 for (int i = 0; i < 4; i++) {
383 queue(c, qpu_m_MOV(qpu_r3(), src[i]));
384 *last_inst(c) |= QPU_PM;
385 *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
386 QPU_PACK);
387 }
388
389 queue(c, qpu_a_MOV(dst, qpu_r3()));
390
391 break;
392
393 case QOP_FRAG_X:
394 queue(c, qpu_a_ITOF(dst,
395 qpu_ra(QPU_R_XY_PIXEL_COORD)));
396 break;
397
398 case QOP_FRAG_Y:
399 queue(c, qpu_a_ITOF(dst,
400 qpu_rb(QPU_R_XY_PIXEL_COORD)));
401 break;
402
403 case QOP_FRAG_Z:
404 /* QOP_FRAG_Z doesn't emit instructions, just
405 * allocates the register to the Z payload.
406 */
407 break;
408
409 case QOP_FRAG_RCP_W:
410 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
411 qpu_ra(QPU_R_FRAG_PAYLOAD_ZW)));
412
413 queue(c, qpu_a_MOV(dst, qpu_r4()));
414 break;
415
416 case QOP_TLB_DISCARD_SETUP:
417 discard = true;
418 queue(c, qpu_a_MOV(src[0], src[0]));
419 *last_inst(c) |= QPU_SF;
420 break;
421
422 case QOP_TLB_STENCIL_SETUP:
423 queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
424 break;
425
426 case QOP_TLB_Z_WRITE:
427 queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
428 if (discard) {
429 set_last_cond_add(c, QPU_COND_ZS);
430 }
431 break;
432
433 case QOP_TLB_COLOR_READ:
434 queue(c, qpu_NOP());
435 *last_inst(c) = qpu_set_sig(*last_inst(c),
436 QPU_SIG_COLOR_LOAD);
437
438 break;
439
440 case QOP_TLB_COLOR_WRITE:
441 queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
442 if (discard) {
443 set_last_cond_add(c, QPU_COND_ZS);
444 }
445 break;
446
447 case QOP_VARY_ADD_C:
448 queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
449 break;
450
451 case QOP_PACK_SCALED: {
452 uint64_t a = (qpu_a_MOV(dst, src[0]) |
453 QPU_SET_FIELD(QPU_PACK_A_16A,
454 QPU_PACK));
455 uint64_t b = (qpu_a_MOV(dst, src[1]) |
456 QPU_SET_FIELD(QPU_PACK_A_16B,
457 QPU_PACK));
458
459 if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
460 queue(c, b);
461 queue(c, a);
462 } else {
463 queue(c, a);
464 queue(c, b);
465 }
466 break;
467 }
468
469 case QOP_TEX_S:
470 case QOP_TEX_T:
471 case QOP_TEX_R:
472 case QOP_TEX_B:
473 queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
474 (qinst->op - QOP_TEX_S)),
475 src[0]));
476 break;
477
478 case QOP_TEX_RESULT:
479 queue(c, qpu_NOP());
480 *last_inst(c) = qpu_set_sig(*last_inst(c),
481 QPU_SIG_LOAD_TMU0);
482
483 break;
484
485 case QOP_R4_UNPACK_A:
486 case QOP_R4_UNPACK_B:
487 case QOP_R4_UNPACK_C:
488 case QOP_R4_UNPACK_D:
489 assert(src[0].mux == QPU_MUX_R4);
490 queue(c, qpu_a_MOV(dst, src[0]));
491 *last_inst(c) |= QPU_PM;
492 *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_R4_8A +
493 (qinst->op -
494 QOP_R4_UNPACK_A),
495 QPU_UNPACK);
496
497 break;
498
499 default:
500 assert(qinst->op < ARRAY_SIZE(translate));
501 assert(translate[qinst->op].op != 0); /* NOPs */
502
503 /* If we have only one source, put it in the second
504 * argument slot as well so that we don't take up
505 * another raddr just to get unused data.
506 */
507 if (qir_get_op_nsrc(qinst->op) == 1)
508 src[1] = src[0];
509
510 fixup_raddr_conflict(c, src[0], &src[1]);
511
512 if (translate[qinst->op].is_mul) {
513 queue(c, qpu_m_alu2(translate[qinst->op].op,
514 dst,
515 src[0], src[1]));
516 } else {
517 queue(c, qpu_a_alu2(translate[qinst->op].op,
518 dst,
519 src[0], src[1]));
520 }
521 break;
522 }
523 }
524
525 serialize_insts(c);
526
527 /* thread end can't have VPM write */
528 if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
529 QPU_WADDR_ADD) == QPU_W_VPM ||
530 QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
531 QPU_WADDR_MUL) == QPU_W_VPM) {
532 serialize_one_inst(c, qpu_NOP());
533 }
534
535 /* thread end can't have uniform read */
536 if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
537 QPU_RADDR_A) == QPU_R_UNIF ||
538 QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
539 QPU_RADDR_B) == QPU_R_UNIF) {
540 serialize_one_inst(c, qpu_NOP());
541 }
542
543 c->qpu_insts[c->qpu_inst_count - 1] =
544 qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
545 QPU_SIG_PROG_END);
546 serialize_one_inst(c, qpu_NOP());
547 serialize_one_inst(c, qpu_NOP());
548
549 switch (c->stage) {
550 case QSTAGE_VERT:
551 case QSTAGE_COORD:
552 break;
553 case QSTAGE_FRAG:
554 c->qpu_insts[c->qpu_inst_count - 1] =
555 qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
556 QPU_SIG_SCOREBOARD_UNLOCK);
557 break;
558 }
559
560 if (vc4_debug & VC4_DEBUG_QPU)
561 vc4_dump_program(c);
562
563 vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);
564
565 free(temp_registers);
566 }