3f30f2cd3c586159df90eb7f1e6aaa494381883f
[mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_emit.c
1 /*
2 * Copyright © 2014 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <inttypes.h>
25
26 #include "vc4_context.h"
27 #include "vc4_qir.h"
28 #include "vc4_qpu.h"
29
30 static void
31 vc4_dump_program(struct vc4_compile *c)
32 {
33 fprintf(stderr, "%s:\n", qir_get_stage_name(c->stage));
34
35 for (int i = 0; i < c->qpu_inst_count; i++) {
36 fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
37 vc4_qpu_disasm(&c->qpu_insts[i], 1);
38 fprintf(stderr, "\n");
39 }
40 }
41
42 struct queued_qpu_inst {
43 struct simple_node link;
44 uint64_t inst;
45 };
46
47 static void
48 queue(struct vc4_compile *c, uint64_t inst)
49 {
50 struct queued_qpu_inst *q = calloc(1, sizeof(*q));
51 q->inst = inst;
52 insert_at_tail(&c->qpu_inst_list, &q->link);
53 }
54
55 static uint64_t *
56 last_inst(struct vc4_compile *c)
57 {
58 struct queued_qpu_inst *q =
59 (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
60 return &q->inst;
61 }
62
/**
 * Rewrites the most recently queued instruction with qpu_set_cond_add()
 * applied to it, using the given condition code.
 */
static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        uint64_t *inst = last_inst(c);

        *inst = qpu_set_cond_add(*inst, cond);
}
68
69 /**
70 * This is used to resolve the fact that we might register-allocate two
71 * different operands of an instruction to the same physical register file
72 * even though instructions have only one field for the register file source
73 * address.
74 *
75 * In that case, we need to move one to a temporary that can be used in the
76 * instruction, instead.
77 */
78 static void
79 fixup_raddr_conflict(struct vc4_compile *c,
80 struct qpu_reg src0, struct qpu_reg *src1)
81 {
82 if ((src0.mux == QPU_MUX_A || src0.mux == QPU_MUX_B) &&
83 (src1->mux == QPU_MUX_A || src1->mux == QPU_MUX_B) &&
84 src0.addr != src1->addr) {
85 queue(c, qpu_a_MOV(qpu_r3(), *src1));
86 *src1 = qpu_r3();
87 }
88 }
89
90 static void
91 serialize_one_inst(struct vc4_compile *c, uint64_t inst)
92 {
93 if (c->qpu_inst_count >= c->qpu_inst_size) {
94 c->qpu_inst_size = MAX2(16, c->qpu_inst_size * 2);
95 c->qpu_insts = realloc(c->qpu_insts,
96 c->qpu_inst_size * sizeof(uint64_t));
97 }
98 c->qpu_insts[c->qpu_inst_count++] = inst;
99 }
100
101 static void
102 serialize_insts(struct vc4_compile *c)
103 {
104 int last_sfu_write = -10;
105 bool scoreboard_wait_emitted = false;
106
107 while (!is_empty_list(&c->qpu_inst_list)) {
108 struct queued_qpu_inst *q =
109 (struct queued_qpu_inst *)first_elem(&c->qpu_inst_list);
110 uint32_t last_waddr_a = QPU_W_NOP, last_waddr_b = QPU_W_NOP;
111 uint32_t raddr_a = QPU_GET_FIELD(q->inst, QPU_RADDR_A);
112 uint32_t raddr_b = QPU_GET_FIELD(q->inst, QPU_RADDR_B);
113
114 if (c->qpu_inst_count > 0) {
115 uint64_t last_inst = c->qpu_insts[c->qpu_inst_count -
116 1];
117 uint32_t last_waddr_add = QPU_GET_FIELD(last_inst,
118 QPU_WADDR_ADD);
119 uint32_t last_waddr_mul = QPU_GET_FIELD(last_inst,
120 QPU_WADDR_MUL);
121
122 if (last_inst & QPU_WS) {
123 last_waddr_a = last_waddr_mul;
124 last_waddr_b = last_waddr_add;
125 } else {
126 last_waddr_a = last_waddr_add;
127 last_waddr_b = last_waddr_mul;
128 }
129 }
130
131 uint32_t src_muxes[] = {
132 QPU_GET_FIELD(q->inst, QPU_ADD_A),
133 QPU_GET_FIELD(q->inst, QPU_ADD_B),
134 QPU_GET_FIELD(q->inst, QPU_MUL_A),
135 QPU_GET_FIELD(q->inst, QPU_MUL_B),
136 };
137
138 /* "An instruction must not read from a location in physical
139 * regfile A or B that was written to by the previous
140 * instruction."
141 */
142 bool needs_raddr_vs_waddr_nop = false;
143 bool reads_r4 = false;
144 for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
145 if ((raddr_a < 32 &&
146 src_muxes[i] == QPU_MUX_A &&
147 last_waddr_a == raddr_a) ||
148 (raddr_b < 32 &&
149 src_muxes[i] == QPU_MUX_B &&
150 last_waddr_b == raddr_b)) {
151 needs_raddr_vs_waddr_nop = true;
152 }
153 if (src_muxes[i] == QPU_MUX_R4)
154 reads_r4 = true;
155 }
156
157 if (needs_raddr_vs_waddr_nop) {
158 serialize_one_inst(c, qpu_NOP());
159 }
160
161 /* "After an SFU lookup instruction, accumulator r4 must not
162 * be read in the following two instructions. Any other
163 * instruction that results in r4 being written (that is, TMU
164 * read, TLB read, SFU lookup) cannot occur in the two
165 * instructions following an SFU lookup."
166 */
167 if (reads_r4) {
168 while (c->qpu_inst_count - last_sfu_write < 3) {
169 serialize_one_inst(c, qpu_NOP());
170 }
171 }
172
173 uint32_t waddr_a = QPU_GET_FIELD(q->inst, QPU_WADDR_ADD);
174 uint32_t waddr_m = QPU_GET_FIELD(q->inst, QPU_WADDR_MUL);
175 if ((waddr_a >= QPU_W_SFU_RECIP && waddr_a <= QPU_W_SFU_LOG) ||
176 (waddr_m >= QPU_W_SFU_RECIP && waddr_m <= QPU_W_SFU_LOG)) {
177 last_sfu_write = c->qpu_inst_count;
178 }
179
180 /* "A scoreboard wait must not occur in the first two
181 * instructions of a fragment shader. This is either the
182 * explicit Wait for Scoreboard signal or an implicit wait
183 * with the first tile-buffer read or write instruction."
184 */
185 if (!scoreboard_wait_emitted &&
186 (waddr_a == QPU_W_TLB_Z || waddr_m == QPU_W_TLB_Z ||
187 waddr_a == QPU_W_TLB_COLOR_MS ||
188 waddr_m == QPU_W_TLB_COLOR_MS ||
189 waddr_a == QPU_W_TLB_COLOR_ALL ||
190 waddr_m == QPU_W_TLB_COLOR_ALL ||
191 QPU_GET_FIELD(q->inst, QPU_SIG) == QPU_SIG_COLOR_LOAD)) {
192 while (c->qpu_inst_count < 3 ||
193 QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
194 QPU_SIG) != QPU_SIG_NONE) {
195 serialize_one_inst(c, qpu_NOP());
196 }
197 c->qpu_insts[c->qpu_inst_count - 1] =
198 qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
199 QPU_SIG_WAIT_FOR_SCOREBOARD);
200 scoreboard_wait_emitted = true;
201 }
202
203 serialize_one_inst(c, q->inst);
204
205 remove_from_list(&q->link);
206 free(q);
207 }
208 }
209
210 void
211 vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
212 {
213 struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
214 bool discard = false;
215
216 make_empty_list(&c->qpu_inst_list);
217
218 switch (c->stage) {
219 case QSTAGE_VERT:
220 case QSTAGE_COORD:
221 queue(c, qpu_load_imm_ui(qpu_vrsetup(),
222 (0x00001a00 +
223 0x00100000 * c->num_inputs)));
224 queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
225 break;
226 case QSTAGE_FRAG:
227 break;
228 }
229
230 struct simple_node *node;
231 foreach(node, &c->instructions) {
232 struct qinst *qinst = (struct qinst *)node;
233
234 #if 0
235 fprintf(stderr, "translating qinst to qpu: ");
236 qir_dump_inst(qinst);
237 fprintf(stderr, "\n");
238 #endif
239
240 static const struct {
241 uint32_t op;
242 bool is_mul;
243 } translate[] = {
244 #define A(name) [QOP_##name] = {QPU_A_##name, false}
245 #define M(name) [QOP_##name] = {QPU_M_##name, true}
246 A(FADD),
247 A(FSUB),
248 A(FMIN),
249 A(FMAX),
250 A(FMINABS),
251 A(FMAXABS),
252 A(FTOI),
253 A(ITOF),
254 A(ADD),
255 A(SUB),
256 A(SHL),
257 A(SHR),
258 A(ASR),
259 A(MIN),
260 A(MAX),
261 A(AND),
262 A(OR),
263 A(XOR),
264 A(NOT),
265
266 M(FMUL),
267 M(MUL24),
268 };
269
270 struct qpu_reg src[4];
271 for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
272 int index = qinst->src[i].index;
273 switch (qinst->src[i].file) {
274 case QFILE_NULL:
275 src[i] = qpu_rn(0);
276 break;
277 case QFILE_TEMP:
278 src[i] = temp_registers[index];
279 break;
280 case QFILE_UNIF:
281 src[i] = qpu_unif();
282 break;
283 case QFILE_VARY:
284 src[i] = qpu_vary();
285 break;
286 }
287 }
288
289 struct qpu_reg dst;
290 switch (qinst->dst.file) {
291 case QFILE_NULL:
292 dst = qpu_ra(QPU_W_NOP);
293 break;
294 case QFILE_TEMP:
295 dst = temp_registers[qinst->dst.index];
296 break;
297 case QFILE_VARY:
298 case QFILE_UNIF:
299 assert(!"not reached");
300 break;
301 }
302
303 switch (qinst->op) {
304 case QOP_MOV:
305 /* Skip emitting the MOV if it's a no-op. */
306 if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
307 dst.mux != src[0].mux || dst.addr != src[0].addr) {
308 queue(c, qpu_a_MOV(dst, src[0]));
309 }
310 break;
311
312 case QOP_SF:
313 queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
314 *last_inst(c) |= QPU_SF;
315 break;
316
317 case QOP_SEL_X_0_ZS:
318 case QOP_SEL_X_0_ZC:
319 case QOP_SEL_X_0_NS:
320 case QOP_SEL_X_0_NC:
321 queue(c, qpu_a_MOV(dst, src[0]));
322 set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
323 QPU_COND_ZS);
324
325 queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
326 set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
327 1) + QPU_COND_ZS);
328 break;
329
330 case QOP_SEL_X_Y_ZS:
331 case QOP_SEL_X_Y_ZC:
332 case QOP_SEL_X_Y_NS:
333 case QOP_SEL_X_Y_NC:
334 queue(c, qpu_a_MOV(dst, src[0]));
335 set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
336 QPU_COND_ZS);
337
338 queue(c, qpu_a_MOV(dst, src[1]));
339 set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
340 1) + QPU_COND_ZS);
341
342 break;
343
344 case QOP_VPM_WRITE:
345 queue(c, qpu_a_MOV(qpu_ra(QPU_W_VPM), src[0]));
346 break;
347
348 case QOP_VPM_READ:
349 queue(c, qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)));
350 break;
351
352 case QOP_RCP:
353 case QOP_RSQ:
354 case QOP_EXP2:
355 case QOP_LOG2:
356 switch (qinst->op) {
357 case QOP_RCP:
358 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
359 src[0]));
360 break;
361 case QOP_RSQ:
362 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
363 src[0]));
364 break;
365 case QOP_EXP2:
366 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
367 src[0]));
368 break;
369 case QOP_LOG2:
370 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
371 src[0]));
372 break;
373 default:
374 abort();
375 }
376
377 queue(c, qpu_a_MOV(dst, qpu_r4()));
378
379 break;
380
381 case QOP_PACK_COLORS:
382 for (int i = 0; i < 4; i++) {
383 queue(c, qpu_m_MOV(qpu_r3(), src[i]));
384 *last_inst(c) |= QPU_PM;
385 *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
386 QPU_PACK);
387 }
388
389 queue(c, qpu_a_MOV(dst, qpu_r3()));
390
391 break;
392
393 case QOP_FRAG_X:
394 queue(c, qpu_a_ITOF(dst,
395 qpu_ra(QPU_R_XY_PIXEL_COORD)));
396 break;
397
398 case QOP_FRAG_Y:
399 queue(c, qpu_a_ITOF(dst,
400 qpu_rb(QPU_R_XY_PIXEL_COORD)));
401 break;
402
403 case QOP_FRAG_Z:
404 case QOP_FRAG_W:
405 /* QOP_FRAG_Z/W don't emit instructions, just allocate
406 * the register to the Z/W payload.
407 */
408 break;
409
410 case QOP_TLB_DISCARD_SETUP:
411 discard = true;
412 queue(c, qpu_a_MOV(src[0], src[0]));
413 *last_inst(c) |= QPU_SF;
414 break;
415
416 case QOP_TLB_STENCIL_SETUP:
417 queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
418 break;
419
420 case QOP_TLB_Z_WRITE:
421 queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
422 if (discard) {
423 set_last_cond_add(c, QPU_COND_ZS);
424 }
425 break;
426
427 case QOP_TLB_COLOR_READ:
428 queue(c, qpu_NOP());
429 *last_inst(c) = qpu_set_sig(*last_inst(c),
430 QPU_SIG_COLOR_LOAD);
431
432 break;
433
434 case QOP_TLB_COLOR_WRITE:
435 queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
436 if (discard) {
437 set_last_cond_add(c, QPU_COND_ZS);
438 }
439 break;
440
441 case QOP_VARY_ADD_C:
442 queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
443 break;
444
445 case QOP_PACK_SCALED: {
446 uint64_t a = (qpu_a_MOV(dst, src[0]) |
447 QPU_SET_FIELD(QPU_PACK_A_16A,
448 QPU_PACK));
449 uint64_t b = (qpu_a_MOV(dst, src[1]) |
450 QPU_SET_FIELD(QPU_PACK_A_16B,
451 QPU_PACK));
452
453 if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
454 queue(c, b);
455 queue(c, a);
456 } else {
457 queue(c, a);
458 queue(c, b);
459 }
460 break;
461 }
462
463 case QOP_TEX_S:
464 case QOP_TEX_T:
465 case QOP_TEX_R:
466 case QOP_TEX_B:
467 queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
468 (qinst->op - QOP_TEX_S)),
469 src[0]));
470 break;
471
472 case QOP_TEX_RESULT:
473 queue(c, qpu_NOP());
474 *last_inst(c) = qpu_set_sig(*last_inst(c),
475 QPU_SIG_LOAD_TMU0);
476
477 break;
478
479 case QOP_R4_UNPACK_A:
480 case QOP_R4_UNPACK_B:
481 case QOP_R4_UNPACK_C:
482 case QOP_R4_UNPACK_D:
483 assert(src[0].mux == QPU_MUX_R4);
484 queue(c, qpu_a_MOV(dst, src[0]));
485 *last_inst(c) |= QPU_PM;
486 *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
487 (qinst->op -
488 QOP_R4_UNPACK_A),
489 QPU_UNPACK);
490
491 break;
492
493 default:
494 assert(qinst->op < ARRAY_SIZE(translate));
495 assert(translate[qinst->op].op != 0); /* NOPs */
496
497 /* If we have only one source, put it in the second
498 * argument slot as well so that we don't take up
499 * another raddr just to get unused data.
500 */
501 if (qir_get_op_nsrc(qinst->op) == 1)
502 src[1] = src[0];
503
504 fixup_raddr_conflict(c, src[0], &src[1]);
505
506 if (translate[qinst->op].is_mul) {
507 queue(c, qpu_m_alu2(translate[qinst->op].op,
508 dst,
509 src[0], src[1]));
510 } else {
511 queue(c, qpu_a_alu2(translate[qinst->op].op,
512 dst,
513 src[0], src[1]));
514 }
515 break;
516 }
517 }
518
519 serialize_insts(c);
520
521 /* thread end can't have VPM write */
522 if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
523 QPU_WADDR_ADD) == QPU_W_VPM ||
524 QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
525 QPU_WADDR_MUL) == QPU_W_VPM) {
526 serialize_one_inst(c, qpu_NOP());
527 }
528
529 /* thread end can't have uniform read */
530 if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
531 QPU_RADDR_A) == QPU_R_UNIF ||
532 QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
533 QPU_RADDR_B) == QPU_R_UNIF) {
534 serialize_one_inst(c, qpu_NOP());
535 }
536
537 c->qpu_insts[c->qpu_inst_count - 1] =
538 qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
539 QPU_SIG_PROG_END);
540 serialize_one_inst(c, qpu_NOP());
541 serialize_one_inst(c, qpu_NOP());
542
543 switch (c->stage) {
544 case QSTAGE_VERT:
545 case QSTAGE_COORD:
546 break;
547 case QSTAGE_FRAG:
548 c->qpu_insts[c->qpu_inst_count - 1] =
549 qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
550 QPU_SIG_SCOREBOARD_UNLOCK);
551 break;
552 }
553
554 if (vc4_debug & VC4_DEBUG_QPU)
555 vc4_dump_program(c);
556
557 vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);
558
559 free(temp_registers);
560 }