vc4: Add support for 8-bit unorm/snorm vertex inputs.
src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s:\n", qir_get_stage_name(c->stage));

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

struct queued_qpu_inst {
        struct simple_node link;
        uint64_t inst;
};

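/*
 * Instructions are first queued on c->qpu_inst_list so that later fixups
 * (condition codes, signals, pack/unpack flags) can be applied to the most
 * recently emitted instruction through last_inst(); serialize_insts() then
 * flushes the list into the final c->qpu_insts array.
 */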
static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = calloc(1, sizeof(*q));
        q->inst = inst;
        insert_at_tail(&c->qpu_inst_list, &q->link);
}

static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
        return &q->inst;
}

static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

/**
 * Resolves the case where register allocation has assigned two different
 * operands of an instruction to the same physical register file: an
 * instruction has only one raddr field per register file, so two distinct
 * addresses in the same file can't both be read directly.
 *
 * In that case, we move one operand to the r3 accumulator so it can be used
 * in the instruction instead.  (The check below is conservative: it also
 * fires when the two operands are in different register files, which merely
 * costs an extra MOV.)
 */
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg src0, struct qpu_reg *src1)
{
        if ((src0.mux == QPU_MUX_A || src0.mux == QPU_MUX_B) &&
            (src1->mux == QPU_MUX_A || src1->mux == QPU_MUX_B) &&
            src0.addr != src1->addr) {
                queue(c, qpu_a_MOV(qpu_r3(), *src1));
                *src1 = qpu_r3();
        }
}

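/*
 * Appends one 64-bit QPU instruction to the final program, doubling the
 * backing array whenever it fills up.
 */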
static void
serialize_one_inst(struct vc4_compile *c, uint64_t inst)
{
        if (c->qpu_inst_count >= c->qpu_inst_size) {
                c->qpu_inst_size = MAX2(16, c->qpu_inst_size * 2);
                c->qpu_insts = realloc(c->qpu_insts,
                                       c->qpu_inst_size * sizeof(uint64_t));
        }
        c->qpu_insts[c->qpu_inst_count++] = inst;
}

static void
serialize_insts(struct vc4_compile *c)
{
        int last_sfu_write = -10;
        bool scoreboard_wait_emitted = false;

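        /* Drain the queued instruction list in order, inserting NOPs where
         * needed to satisfy the hardware hazards handled below: regfile
         * write-then-read, SFU/r4 latency, and the scoreboard wait rules.
         * Note that the QPU_WS bit swaps which regfile the add and mul
         * pipelines write to, which is accounted for when computing
         * last_waddr_a/b.
         */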
        while (!is_empty_list(&c->qpu_inst_list)) {
                struct queued_qpu_inst *q =
                        (struct queued_qpu_inst *)first_elem(&c->qpu_inst_list);
                uint32_t last_waddr_a = QPU_W_NOP, last_waddr_b = QPU_W_NOP;
                uint32_t raddr_a = QPU_GET_FIELD(q->inst, QPU_RADDR_A);
                uint32_t raddr_b = QPU_GET_FIELD(q->inst, QPU_RADDR_B);

                if (c->qpu_inst_count > 0) {
                        uint64_t last_inst = c->qpu_insts[c->qpu_inst_count -
                                                          1];
                        uint32_t last_waddr_add = QPU_GET_FIELD(last_inst,
                                                                QPU_WADDR_ADD);
                        uint32_t last_waddr_mul = QPU_GET_FIELD(last_inst,
                                                                QPU_WADDR_MUL);

                        if (last_inst & QPU_WS) {
                                last_waddr_a = last_waddr_mul;
                                last_waddr_b = last_waddr_add;
                        } else {
                                last_waddr_a = last_waddr_add;
                                last_waddr_b = last_waddr_mul;
                        }
                }

                uint32_t src_muxes[] = {
                        QPU_GET_FIELD(q->inst, QPU_ADD_A),
                        QPU_GET_FIELD(q->inst, QPU_ADD_B),
                        QPU_GET_FIELD(q->inst, QPU_MUL_A),
                        QPU_GET_FIELD(q->inst, QPU_MUL_B),
                };

                /* "An instruction must not read from a location in physical
                 * regfile A or B that was written to by the previous
                 * instruction."
                 */
                bool needs_raddr_vs_waddr_nop = false;
                bool reads_r4 = false;
                for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
                        if ((raddr_a < 32 &&
                             src_muxes[i] == QPU_MUX_A &&
                             last_waddr_a == raddr_a) ||
                            (raddr_b < 32 &&
                             src_muxes[i] == QPU_MUX_B &&
                             last_waddr_b == raddr_b)) {
                                needs_raddr_vs_waddr_nop = true;
                        }
                        if (src_muxes[i] == QPU_MUX_R4)
                                reads_r4 = true;
                }

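                /* A single NOP is enough: the write-then-read hazard only
                 * spans adjacent instructions, and raddrs of 32 and up name
                 * special registers rather than physical regfile locations
                 * (hence the raddr < 32 checks above).
                 */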
                if (needs_raddr_vs_waddr_nop) {
                        serialize_one_inst(c, qpu_NOP());
                }

                /* "After an SFU lookup instruction, accumulator r4 must not
                 * be read in the following two instructions. Any other
                 * instruction that results in r4 being written (that is, TMU
                 * read, TLB read, SFU lookup) cannot occur in the two
                 * instructions following an SFU lookup."
                 */
                if (reads_r4) {
                        while (c->qpu_inst_count - last_sfu_write < 3) {
                                serialize_one_inst(c, qpu_NOP());
                        }
                }

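                /* Track where the last SFU write was issued, so the r4-read
                 * padding above knows how many instructions have gone by.
                 */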
                uint32_t waddr_a = QPU_GET_FIELD(q->inst, QPU_WADDR_ADD);
                uint32_t waddr_m = QPU_GET_FIELD(q->inst, QPU_WADDR_MUL);
                if ((waddr_a >= QPU_W_SFU_RECIP && waddr_a <= QPU_W_SFU_LOG) ||
                    (waddr_m >= QPU_W_SFU_RECIP && waddr_m <= QPU_W_SFU_LOG)) {
                        last_sfu_write = c->qpu_inst_count;
                }

                /* "A scoreboard wait must not occur in the first two
                 * instructions of a fragment shader. This is either the
                 * explicit Wait for Scoreboard signal or an implicit wait
                 * with the first tile-buffer read or write instruction."
                 */
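                /* To satisfy that rule, pad with NOPs until the program is at
                 * least three instructions long and the most recent
                 * instruction has a free signal field, then tag that
                 * instruction with the explicit scoreboard wait.
                 */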
                if (!scoreboard_wait_emitted &&
                    (waddr_a == QPU_W_TLB_Z || waddr_m == QPU_W_TLB_Z ||
                     waddr_a == QPU_W_TLB_COLOR_MS ||
                     waddr_m == QPU_W_TLB_COLOR_MS ||
                     waddr_a == QPU_W_TLB_COLOR_ALL ||
                     waddr_m == QPU_W_TLB_COLOR_ALL ||
                     QPU_GET_FIELD(q->inst, QPU_SIG) == QPU_SIG_COLOR_LOAD)) {
                        while (c->qpu_inst_count < 3 ||
                               QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                                             QPU_SIG) != QPU_SIG_NONE) {
                                serialize_one_inst(c, qpu_NOP());
                        }
                        c->qpu_insts[c->qpu_inst_count - 1] =
                                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                            QPU_SIG_WAIT_FOR_SCOREBOARD);
                        scoreboard_wait_emitted = true;
                }

                serialize_one_inst(c, q->inst);

                remove_from_list(&q->link);
                free(q);
        }
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;

        make_empty_list(&c->qpu_inst_list);

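        /* Program the VPM read/write setup registers for vertex and coord
         * shaders, which stream their inputs and outputs through the VPM.
         * Decoding the magic values against the VideoCore IV VPMVCD setup
         * register layout (my reading of the spec, not stated in this code):
         * 0x00001a00 selects 32-bit horizontal vectors with a stride of 1
         * starting at VPM address 0, and the 0x00100000 multiplier sets the
         * number-of-vectors field to c->num_inputs.
         */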
        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                         (0x00001a00 +
                                          0x00100000 * c->num_inputs)));
                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        struct simple_node *node;
        foreach(node, &c->instructions) {
                struct qinst *qinst = (struct qinst *)node;

#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

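                /* Table mapping the simple one- and two-source QIR ALU ops
                 * to the corresponding QPU opcode and the pipeline (add or
                 * mul) that executes it; these are handled by the default
                 * case of the big switch below.
                 */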
                static const struct {
                        uint32_t op;
                        bool is_mul;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name, false}
#define M(name) [QOP_##name] = {QPU_M_##name, true}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(MUL24),
                };

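                /* Translate the QIR source and destination registers to QPU
                 * registers: temporaries come from the register allocator,
                 * while uniforms and varyings are read through their special
                 * QPU read addresses.
                 */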
                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op: an
                         * accumulator moved onto itself.  Moves with a
                         * regfile destination are always emitted.
                         */
                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                        }
                        break;

                case QOP_SF:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

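                /* The SEL ops emit two conditional MOVs with opposite
                 * conditions: XORing the QOP offset with 1 flips set/clear
                 * (ZS<->ZC, NS<->NC), relying on the QOP and QPU_COND enums
                 * being laid out in the same order.  For SEL_X_0, the "else"
                 * value of zero is produced by XORing a register with itself.
                 */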
                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_VPM_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_VPM), src[0]));
                        break;

                case QOP_VPM_READ:
                        queue(c, qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)));
                        break;

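                /* SFU lookups are started by writing the argument to the
                 * unit's register and deliver their result in the r4
                 * accumulator; the two-instruction delay before r4 may be
                 * read is enforced later by the padding in serialize_insts().
                 */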
                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

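                /* Pack the four float color components into r3 one byte at a
                 * time: with QPU_PM set, the MUL-pipeline 8A-8D pack modes
                 * convert the float result to an 8-bit unorm value and write
                 * only that byte lane of the destination (my reading of the
                 * pack-mode table in the VideoCore IV spec).
                 */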
                case QOP_PACK_COLORS:
                        for (int i = 0; i < 4; i++) {
                                queue(c, qpu_m_MOV(qpu_r3(), src[i]));
                                *last_inst(c) |= QPU_PM;
                                *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
                                                               QPU_PACK);
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r3()));

                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

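                /* Discard setup sets the condition flags from the discard
                 * value (via an otherwise useless self-MOV with SF); the TLB
                 * Z and color writes below are then made conditional on ZS so
                 * that discarded pixels don't update the tile buffer.
                 */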
                case QOP_TLB_DISCARD_SETUP:
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

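                /* Pack two 16-bit scaled values into one register using the
                 * A-regfile 16A/16B pack modes.  If dst aliases src[1], the
                 * 16B half is written first so that the first pack doesn't
                 * clobber the second pack's source.
                 */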
                case QOP_PACK_SCALED: {
                        uint64_t a = (qpu_a_MOV(dst, src[0]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16A,
                                                    QPU_PACK));
                        uint64_t b = (qpu_a_MOV(dst, src[1]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16B,
                                                    QPU_PACK));

                        if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
                                queue(c, b);
                                queue(c, a);
                        } else {
                                queue(c, a);
                                queue(c, b);
                        }
                        break;
                }

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);

                        break;

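                /* With QPU_PM set, the unpack field applies to reads of the
                 * r4 accumulator (where TMU and TLB color loads land),
                 * extracting byte A-D of the 32-bit value.
                 */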
                case QOP_R4_UNPACK_A:
                case QOP_R4_UNPACK_B:
                case QOP_R4_UNPACK_C:
                case QOP_R4_UNPACK_D:
                        assert(src[0].mux == QPU_MUX_R4);
                        queue(c, qpu_a_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_R4_UNPACK_A),
                                                       QPU_UNPACK);

                        break;

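                /* Without QPU_PM, the unpack field instead applies to reads
                 * from physical regfile A, so the source must live there.
                 * FMAX(x, x) rather than a plain MOV is presumably what makes
                 * the 8-bit unpack convert to float: per my reading of the
                 * spec, the color unpack modes produce a float only when
                 * consumed by a floating-point ALU op.
                 */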
                case QOP_UNPACK_8A:
                case QOP_UNPACK_8B:
                case QOP_UNPACK_8C:
                case QOP_UNPACK_8D: {
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're setting the pack/unpack flags, if the
                         * destination is in A it would get re-packed, so
                         * route the result through r3 instead.
                         */
                        struct qpu_reg orig_dst = dst;
                        if (orig_dst.mux == QPU_MUX_A)
                                dst = qpu_rn(3);

                        queue(c, qpu_a_FMAX(dst, src[0], src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_UNPACK_8A),
                                                       QPU_UNPACK);

                        if (orig_dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(orig_dst, dst));
                        }
                }
                break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, src[0], &src[1]);

                        if (translate[qinst->op].is_mul) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        }
                        break;
                }
        }

        serialize_insts(c);

        /* thread end can't have VPM write */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM) {
                serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                serialize_one_inst(c, qpu_NOP());
        }

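        /* Tag the last instruction with the program-end signal; the thread
         * actually ends two instructions later, so follow it with two NOPs
         * to fill the delay slots.
         */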
        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        serialize_one_inst(c, qpu_NOP());
        serialize_one_inst(c, qpu_NOP());

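        /* Fragment shaders release the tile with a scoreboard unlock, which
         * is signaled on the last of the two delay-slot instructions.
         */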
        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}