vc4: Fix use of r3 as a temp in 8-bit unpacking.
mesa.git: src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"
static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
        q->inst = inst;
        insert_at_tail(&c->qpu_inst_list, &q->link);
}

static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
        return &q->inst;
}

static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_A)
                        src->mux = QPU_MUX_B;
                else
                        src->mux = QPU_MUX_A;
                return true;

        default:
                return false;
        }
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead.
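 *
 * Returns true if the caller needs to restore r3 from rb31 after the queued
 * instruction, because a live r3 was spilled to make room for the temporary.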
 */
static bool
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1,
                     bool r3_live)
{
        if ((src0->mux != QPU_MUX_A && src0->mux != QPU_MUX_B) ||
            src0->mux != src1->mux ||
            src0->addr == src1->addr) {
                return false;
        }

        if (swap_file(src0) || swap_file(src1))
                return false;

        if (src0->mux == QPU_MUX_A) {
                /* If we're conflicting over the A regfile, then we can just
                 * use the reserved rb31.
                 */
                queue(c, qpu_a_MOV(qpu_rb(31), *src1));
                *src1 = qpu_rb(31);
                return false;
        } else {
                /* Otherwise, we need a non-B regfile.  So, we spill r3 out to
                 * rb31, then store our desired value in r3, and tell the
                 * caller to put rb31 back into r3 when we're done.
                 */
                if (r3_live)
                        queue(c, qpu_a_MOV(qpu_rb(31), qpu_r3()));
                queue(c, qpu_a_MOV(qpu_r3(), *src1));

                *src1 = qpu_r3();

                return r3_live && dst.mux != QPU_MUX_R3;
        }
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        bool written_r3 = false;
        bool needs_restore;

        make_empty_list(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
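                /* The 0x00001a00 in the setup value below would, per our
                 * reading of the VideoCore IV VPM generic block read setup
                 * layout, select 32-bit elements, horizontal layout, and a
                 * stride of 1 starting at VPM address 0 (an interpretation
                 * of the published spec, not confirmed in this file).
                 */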
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

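                /* The write setup appears to use the same 0x00001a00
                 * encoding: 32-bit horizontal accesses with a stride of 1
                 * (again, our interpretation of the spec's setup layout).
                 */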
                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        struct simple_node *node;
        foreach(node, &c->instructions) {
                struct qinst *qinst = (struct qinst *)node;

#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                static const struct {
                        uint32_t op;
                        bool is_mul;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name, false}
#define M(name) [QOP_##name] = {QPU_M_##name, true}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(MUL24),
                };

                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op. */
                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                        }
                        break;

                case QOP_SF:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
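                        /* The SEL ops are laid out in the same ZS/ZC/NS/NC
                         * order as the QPU condition codes, so the offset
                         * from QOP_SEL_X_0_ZS picks the matching QPU_COND_*,
                         * and XOR-ing its low bit picks the inverse
                         * condition for the other write.
                         */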
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_VPM_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_VPM), src[0]));
                        break;

                case QOP_VPM_READ:
                        queue(c, qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)));
                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

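                        /* The SFU delivers its result in the r4
                         * accumulator; the required delay before r4 can be
                         * read is presumably left for the scheduling pass
                         * (qpu_schedule_instructions()) to honor.
                         */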
                        queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_PACK_COLORS: {
                        /* We have to be careful not to start writing over one
                         * of our source values when incrementally writing the
                         * destination.  So, if the dst is one of the srcs, we
                         * pack that one first (and we pack 4 channels at once
                         * for the first pack).
                         */
                        struct qpu_reg first_pack = src[0];
                        for (int i = 0; i < 4; i++) {
                                if (src[i].mux == dst.mux &&
                                    src[i].addr == dst.addr) {
                                        first_pack = dst;
                                        break;
                                }
                        }
                        queue(c, qpu_m_MOV(dst, first_pack));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8888,
                                                       QPU_PACK);

                        for (int i = 0; i < 4; i++) {
                                if (src[i].mux == first_pack.mux &&
                                    src[i].addr == first_pack.addr) {
                                        continue;
                                }

                                queue(c, qpu_m_MOV(dst, src[i]));
                                *last_inst(c) |= QPU_PM;
                                *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
                                                               QPU_PACK);
                        }

                        break;
                }

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_DISCARD_SETUP:
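                        /* Set the flags from the discard condition, so that
                         * the TLB color and Z writes below can be predicated
                         * on it (QPU_COND_ZS).
                         */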
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

                case QOP_PACK_SCALED: {
                        uint64_t a = (qpu_a_MOV(dst, src[0]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16A,
                                                    QPU_PACK));
                        uint64_t b = (qpu_a_MOV(dst, src[1]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16B,
                                                    QPU_PACK));

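                        /* If dst aliases src[1], emit the 16B pack first so
                         * that the 16A write can't clobber src[1] before it
                         * has been read.
                         */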
                        if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
                                queue(c, b);
                                queue(c, a);
                        } else {
                                queue(c, a);
                                queue(c, b);
                        }
                        break;
                }

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_DIRECT:
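                        /* fixup_raddr_conflict() may have spilled a live r3
                         * to rb31; restore it once the ADD has consumed
                         * src[1].
                         */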
                        needs_restore = fixup_raddr_conflict(c, dst,
                                                             &src[0], &src[1],
                                                             written_r3);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
                        if (needs_restore)
                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);

                        break;

                case QOP_R4_UNPACK_A:
                case QOP_R4_UNPACK_B:
                case QOP_R4_UNPACK_C:
                case QOP_R4_UNPACK_D:
                        assert(src[0].mux == QPU_MUX_R4);
                        queue(c, qpu_a_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_R4_UNPACK_A),
                                                       QPU_UNPACK);

                        break;

                case QOP_UNPACK_8A_F:
                case QOP_UNPACK_8B_F:
                case QOP_UNPACK_8C_F:
                case QOP_UNPACK_8D_F:
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're setting the pack bits, if the
                         * destination is in A it would get re-packed.
                         */
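                        /* Bounce through the reserved rb31 in that case,
                         * rather than r3: r3 may hold a live value at this
                         * point, which is the bug this commit fixes.
                         */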
                        queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ?
                                             qpu_rb(31) : dst),
                                            src[0], src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_UNPACK_8A_F),
                                                       QPU_UNPACK);

                        if (dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(dst, qpu_rb(31)));
                        }
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* unfilled entries are 0 == NOP */

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        needs_restore = fixup_raddr_conflict(c, dst,
                                                             &src[0], &src[1],
                                                             written_r3);

                        if (translate[qinst->op].is_mul) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        }
                        if (needs_restore)
                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));

                        break;
                }

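                /* written_r3 conservatively tracks whether r3 might hold a
                 * live value, so fixup_raddr_conflict() knows when it has
                 * to be preserved.
                 */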
                if (dst.mux == QPU_MUX_R3)
                        written_r3 = true;
        }

        qpu_schedule_instructions(c);

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
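        /* The PROG_END signal has two delay slots, so pad with NOPs that
         * will execute before the thread actually ends.
         */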
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
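                /* A fragment shader has to release its TLB scoreboard lock
                 * as it ends, so fold the unlock signal into the last
                 * instruction.
                 */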
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}