vc4: Reserve rb31 instead of r3 for raddr conflict spills.
mesa.git: src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = calloc(1, sizeof(*q));
        q->inst = inst;
        insert_at_tail(&c->qpu_inst_list, &q->link);
}

static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
        return &q->inst;
}

static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
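 *
 * For example, a uniform read returns the same FIFO value whether it comes
 * through raddr_a or raddr_b, so flipping its mux from one file to the other
 * costs nothing and frees the contested raddr field for the other operand.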
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_A)
                        src->mux = QPU_MUX_B;
                else
                        src->mux = QPU_MUX_A;
                return true;

        default:
                return false;
        }
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead.
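 *
 * For example, "fadd dst, ra1, ra2" has no encoding, since the instruction
 * reads the A regfile through its single raddr_a field.  Given an A-file
 * conflict we rewrite it as:
 *
 *         mov rb31, ra2
 *         fadd dst, ra1, rb31
 *
 * For a B-file conflict the temporary must not be another B register, so we
 * go through r3 instead, saving and restoring it via rb31 when it's live.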
 */
static bool
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1,
                     bool r3_live)
{
        if ((src0->mux != QPU_MUX_A && src0->mux != QPU_MUX_B) ||
            src0->mux != src1->mux ||
            src0->addr == src1->addr) {
                return false;
        }

        if (swap_file(src0) || swap_file(src1))
                return false;

        if (src0->mux == QPU_MUX_A) {
                /* If we're conflicting over the A regfile, then we can just
                 * use the reserved rb31.
                 */
                queue(c, qpu_a_MOV(qpu_rb(31), *src1));
                *src1 = qpu_rb(31);
                return false;
        } else {
                /* Otherwise the conflict is over the B regfile, and the
                 * temporary has to come from somewhere else.  So, we spill
                 * r3 out to rb31, store our desired value in r3, and tell
                 * the caller to move rb31 back into r3 when we're done.
                 */
                if (r3_live)
                        queue(c, qpu_a_MOV(qpu_rb(31), qpu_r3()));
                queue(c, qpu_a_MOV(qpu_r3(), *src1));

                *src1 = qpu_r3();

                return r3_live && dst.mux != QPU_MUX_R3;
        }
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        bool written_r3 = false;
        bool needs_restore;

        make_empty_list(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
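                /* A sketch of the read-setup encoding, going by the
                 * published VideoCore IV docs: bits 7:0 are the VPM address,
                 * bit 11 selects horizontal layout, bits 17:12 the stride,
                 * and bits 23:20 the number of vectors, with 0 meaning 16
                 * (hence the "& 0xf" below).  0x00001a00 is then a
                 * horizontal, 32-bit, stride-1 read at the current offset.
                 */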
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        struct simple_node *node;
        foreach(node, &c->instructions) {
                struct qinst *qinst = (struct qinst *)node;

#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                static const struct {
                        uint32_t op;
                        bool is_mul;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name, false}
#define M(name) [QOP_##name] = {QPU_M_##name, true}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(MUL24),
                };

                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op. */
                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                        }
                        break;

                case QOP_SF:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

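                /* The SEL cases lean on enum layout: the QOP_SEL_* ops are
                 * declared in the same ZS, ZC, NS, NC order as
                 * QPU_COND_ZS..QPU_COND_NC, so the subtraction maps the op
                 * to its condition code, and XORing the offset with 1 yields
                 * the complementary condition (ZS<->ZC, NS<->NC) for the
                 * instruction writing the other half of the select.
                 */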
                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_VPM_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_VPM), src[0]));
                        break;

                case QOP_VPM_READ:
                        queue(c, qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)));
                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_PACK_COLORS: {
                        /* We have to be careful not to start writing over one
                         * of our source values when incrementally writing the
                         * destination.  So, if the dst is one of the srcs, we
                         * pack that one first (and we pack 4 channels at once
                         * for the first pack).
                         */
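                        /* E.g. if dst == src[2], packing src[0] first would
                         * overwrite bytes that src[2] still has to supply.
                         * Replicating src[2] into all four channels first
                         * leaves the other sources intact, and any channel
                         * whose source matches first_pack already holds its
                         * value, so the loop below skips it.
                         */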
                        struct qpu_reg first_pack = src[0];
                        for (int i = 0; i < 4; i++) {
                                if (src[i].mux == dst.mux &&
                                    src[i].addr == dst.addr) {
                                        first_pack = dst;
                                        break;
                                }
                        }
                        queue(c, qpu_m_MOV(dst, first_pack));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8888,
                                                       QPU_PACK);

                        for (int i = 0; i < 4; i++) {
                                if (src[i].mux == first_pack.mux &&
                                    src[i].addr == first_pack.addr) {
                                        continue;
                                }

                                queue(c, qpu_m_MOV(dst, src[i]));
                                *last_inst(c) |= QPU_PM;
                                *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
                                                               QPU_PACK);
                        }

                        break;
                }

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_DISCARD_SETUP:
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

                case QOP_PACK_SCALED: {
                        uint64_t a = (qpu_a_MOV(dst, src[0]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16A,
                                                    QPU_PACK));
                        uint64_t b = (qpu_a_MOV(dst, src[1]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16B,
                                                    QPU_PACK));

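                        /* If dst aliases src[1], emitting "a" first would
                         * clobber src[1] before "b" reads it, so the order is
                         * flipped.  dst aliasing src[0] is fine either way,
                         * since "a" reads src[0] in the same instruction that
                         * writes it.
                         */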
                        if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
                                queue(c, b);
                                queue(c, a);
                        } else {
                                queue(c, a);
                                queue(c, b);
                        }
                        break;
                }

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_DIRECT:
                        needs_restore = fixup_raddr_conflict(c, dst,
                                                             &src[0], &src[1],
                                                             written_r3);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
                        if (needs_restore)
                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);

                        break;

                case QOP_R4_UNPACK_A:
                case QOP_R4_UNPACK_B:
                case QOP_R4_UNPACK_C:
                case QOP_R4_UNPACK_D:
                        assert(src[0].mux == QPU_MUX_R4);
                        queue(c, qpu_a_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_R4_UNPACK_A),
                                                       QPU_UNPACK);

                        break;

                case QOP_UNPACK_8A:
                case QOP_UNPACK_8B:
                case QOP_UNPACK_8C:
                case QOP_UNPACK_8D: {
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're also setting the unpack bits, a
                         * destination in the A regfile would get re-packed.
                         */
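                        /* r3 serves as the temporary here; the write is
                         * picked up by the dst.mux == QPU_MUX_R3 check at
                         * the bottom of the loop, keeping written_r3
                         * accurate for later raddr-conflict spills.
                         */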
                        struct qpu_reg orig_dst = dst;
                        if (orig_dst.mux == QPU_MUX_A)
                                dst = qpu_rn(3);

                        queue(c, qpu_a_FMAX(dst, src[0], src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_UNPACK_8A),
                                                       QPU_UNPACK);

                        if (orig_dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(orig_dst, dst));
                        }
                }
                break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        needs_restore = fixup_raddr_conflict(c, dst,
                                                             &src[0], &src[1],
                                                             written_r3);

                        if (translate[qinst->op].is_mul) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        }
                        if (needs_restore)
                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));

                        break;
                }

                if (dst.mux == QPU_MUX_R3)
                        written_r3 = true;
        }

        qpu_schedule_instructions(c);

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
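        /* The thread-end signal has two delay slots that still execute
         * (going by the VideoCore IV docs), so pad them with NOPs.
         */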
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}