src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

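/**
 * Appends an instruction to the tail of c->qpu_inst_list, to be scheduled
 * and serialized later.
 */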
static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
        q->inst = inst;
        list_addtail(&q->link, &c->qpu_inst_list);
}

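/**
 * Returns a pointer to the most recently queued instruction, so that
 * condition, pack, or signal bits can be ORed in after the fact.
 */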
static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)c->qpu_inst_list.prev;
        return &q->inst;
}

static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_SMALL_IMM) {
                        return false;
                } else {
                        if (src->mux == QPU_MUX_A)
                                src->mux = QPU_MUX_B;
                        else
                                src->mux = QPU_MUX_A;
                        return true;
                }

        default:
                return false;
        }
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead. We reserve ra31/rb31 for this purpose.
 */
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1)
{
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

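        /* No conflict if src0 is an accumulator, if the two sources use
         * different files, or if they read the same file at the same
         * address (sharing one raddr).
         */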
        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

        if (mux0 == QPU_MUX_A) {
                queue(c, qpu_a_MOV(qpu_rb(31), *src0));
                *src0 = qpu_rb(31);
        } else {
                queue(c, qpu_a_MOV(qpu_ra(31), *src0));
                *src0 = qpu_ra(31);
        }
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        int last_vpm_read_index = -1;
        /* Map from the QIR ops enum order to QPU unpack bits. */
        static const uint32_t unpack_map[] = {
                QPU_UNPACK_8A,
                QPU_UNPACK_8B,
                QPU_UNPACK_8C,
                QPU_UNPACK_8D,
                QPU_UNPACK_16A_TO_F32,
                QPU_UNPACK_16B_TO_F32,
        };

        list_inithead(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
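                /* 0x1a00 in the setup word selects 32-bit, horizontal,
                 * stride-1 VPM accesses (per the VPMVCD generic block setup
                 * encoding); the element count lands in bits 23:20 and the
                 * start address in the low byte.
                 */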
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        list_for_each_entry(struct qinst, qinst, &c->instructions, link) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

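                /* Maps simple QIR ALU ops to the corresponding QPU add- or
                 * mul-unit opcode.  Missing entries are zero-initialized
                 * (QPU NOP) and caught by the assert in the default case
                 * below.
                 */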
                static const struct {
                        uint32_t op;
                        bool is_mul;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name, false}
#define M(name) [QOP_##name] = {QPU_M_##name, true}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(MUL24),
                };

                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op. */
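                        /* Regfile A/B read and write addresses are separate
                         * namespaces that only coincide for the
                         * general-purpose registers, so only accumulator
                         * self-moves are safely elided.
                         */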
                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                        }
                        break;

                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

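                        /* XORing r0 with itself writes a zero, and "^ 1"
                         * flips the condition to its complement (ZS<->ZC,
                         * NS<->NC), so the remaining lanes get 0.
                         */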
                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

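                        /* The SFU returns its result in r4 rather than at
                         * the written address, so copy it out unless r4 was
                         * the destination all along.
                         */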
                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_PACK_8888_F:
                        queue(c, qpu_m_MOV(dst, src[0]));
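                        /* Setting PM applies the pack field to the MUL unit;
                         * MUL_8888 replicates the converted 8-bit value into
                         * all four bytes of the result.
                         */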
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8888,
                                                       QPU_PACK);
                        break;

                case QOP_PACK_8A_F:
                case QOP_PACK_8B_F:
                case QOP_PACK_8C_F:
                case QOP_PACK_8D_F:
                        /* If dst doesn't happen to already contain src[0],
                         * then we have to move it in.
                         */
                        if (qinst->src[0].file != QFILE_NULL &&
                            (src[0].mux != dst.mux || src[0].addr != dst.addr)) {
                                /* Don't overwrite src1 while setting up
                                 * the dst!
                                 */
                                if (dst.mux == src[1].mux &&
                                    dst.addr == src[1].addr) {
                                        queue(c, qpu_m_MOV(qpu_rb(31), src[1]));
                                        src[1] = qpu_rb(31);
                                }

                                queue(c, qpu_m_MOV(dst, src[0]));
                        }

                        queue(c, qpu_m_MOV(dst, src[1]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A +
                                                       qinst->op - QOP_PACK_8A_F,
                                                       QPU_PACK);
                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_DISCARD_SETUP:
                        discard = true;
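                        /* A self-MOV with SF updates the Z flags from the
                         * discard condition, so the TLB writes below can be
                         * made conditional on ZS and skip discarded pixels.
                         */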
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

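                        /* The color load signal returns the current TLB
                         * color in r4.
                         */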
                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

                case QOP_PACK_SCALED: {
                        uint64_t a = (qpu_a_MOV(dst, src[0]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16A,
                                                    QPU_PACK));
                        uint64_t b = (qpu_a_MOV(dst, src[1]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16B,
                                                    QPU_PACK));

                        if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
                                queue(c, b);
                                queue(c, a);
                        } else {
                                queue(c, a);
                                queue(c, b);
                        }
                        break;
                }

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_DIRECT:
                        fixup_raddr_conflict(c, dst, &src[0], &src[1]);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);
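                        /* The TMU0 load signal latches the texture fetch
                         * result into r4.
                         */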
                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                        break;

                case QOP_UNPACK_8A_F:
                case QOP_UNPACK_8B_F:
                case QOP_UNPACK_8C_F:
                case QOP_UNPACK_8D_F:
                case QOP_UNPACK_16A_F:
                case QOP_UNPACK_16B_F: {
                        if (src[0].mux == QPU_MUX_R4) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                                *last_inst(c) |= QPU_PM;
                                *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                               (qinst->op -
                                                                QOP_UNPACK_8A_F),
                                                               QPU_UNPACK);
                        } else {
                                assert(src[0].mux == QPU_MUX_A);

                                /* Since we're setting the pack bits, if the
                                 * destination is in A it would get re-packed.
                                 */
                                queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ?
                                                     qpu_rb(31) : dst),
                                                    src[0], src[0]));
                                *last_inst(c) |=
                                        QPU_SET_FIELD(unpack_map[qinst->op -
                                                                 QOP_UNPACK_8A_F],
                                                      QPU_UNPACK);

                                if (dst.mux == QPU_MUX_A) {
                                        queue(c, qpu_a_MOV(dst, qpu_rb(31)));
                                }
                        }
                }
                        break;

                case QOP_UNPACK_8A_I:
                case QOP_UNPACK_8B_I:
                case QOP_UNPACK_8C_I:
                case QOP_UNPACK_8D_I:
                case QOP_UNPACK_16A_I:
                case QOP_UNPACK_16B_I: {
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're setting the pack bits, if the
                         * destination is in A it would get re-packed.
                         */
                        queue(c, qpu_a_MOV((dst.mux == QPU_MUX_A ?
                                            qpu_rb(31) : dst), src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op -
                                                                  QOP_UNPACK_8A_I],
                                                       QPU_UNPACK);

                        if (dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(dst, qpu_rb(31)));
                        }
                }
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, dst, &src[0], &src[1]);

                        if (translate[qinst->op].is_mul) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        }

                        break;
                }

                if (qinst->sf) {
                        assert(!qir_is_multi_instruction(qinst));
                        *last_inst(c) |= QPU_SF;
                }
        }

        qpu_schedule_instructions(c);

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
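        /* The thread-end signal takes effect after two delay slots, so pad
         * them out with NOPs.
         */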
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
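                /* Fragment shaders have to release the per-tile scoreboard
                 * lock as they end, so fold the unlock signal into the
                 * final (delay slot) instruction.
                 */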
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}