src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

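/* Instructions are queued onto a list rather than emitted directly, so that
 * qpu_schedule_instructions() can reorder them later.  last_inst() lets the
 * emit code below patch condition, signal, and pack bits onto the most
 * recently queued instruction.
 */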
static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
        q->inst = inst;
        list_addtail(&q->link, &c->qpu_inst_list);
}

static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)c->qpu_inst_list.prev;
        return &q->inst;
}

static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_SMALL_IMM) {
                        return false;
                } else {
                        if (src->mux == QPU_MUX_A)
                                src->mux = QPU_MUX_B;
                        else
                                src->mux = QPU_MUX_A;
                        return true;
                }

        default:
                return false;
        }
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead.  We reserve ra31/rb31 for this purpose.
 */
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1)
{
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

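        /* There's no conflict if src0 is an accumulator (which doesn't use a
         * raddr), if the two sources come from different register files, or
         * if they're literally the same register.
         */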
        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

        if (mux0 == QPU_MUX_A) {
                queue(c, qpu_a_MOV(qpu_rb(31), *src0));
                *src0 = qpu_rb(31);
        } else {
                queue(c, qpu_a_MOV(qpu_ra(31), *src0));
                *src0 = qpu_ra(31);
        }
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        int last_vpm_read_index = -1;

        list_inithead(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
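                /* 0x1a00 appears to select 32-bit, horizontal, stride-1
                 * vectors in the VPM read setup word; the number of vectors
                 * to read goes in bits 23:20.
                 */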
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

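                /* Set up VPM writes with what should be the matching 32-bit,
                 * horizontal, stride-1 layout for the shaded vertex output.
                 */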
                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        list_for_each_entry(struct qinst, qinst, &c->instructions, link) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

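                /* QIR opcodes that map one-to-one onto a QPU add- or
                 * mul-unit opcode.  Ops needing special handling are covered
                 * by the switch cases below instead and are left as 0 here.
                 */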
                static const struct {
                        uint32_t op;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name}
#define M(name) [QOP_##name] = {QPU_M_##name}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(V8MULD),
                        M(V8MIN),
                        M(V8MAX),
                        M(V8ADDS),
                        M(V8SUBS),
                        M(MUL24),

                        /* If we replicate src[0] out to src[1], this works
                         * out the same as a MOV.
                         */
                        [QOP_MOV] = { QPU_A_OR },
                        [QOP_FMOV] = { QPU_A_FMAX },
                };

                uint64_t unpack = 0;
                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                if (qinst->src[i].pack) {
                                        assert(!unpack ||
                                               unpack == qinst->src[i].pack);
                                        unpack = QPU_SET_FIELD(qinst->src[i].pack,
                                                               QPU_UNPACK);
                                        if (src[i].mux == QPU_MUX_R4)
                                                unpack |= QPU_PM;
                                }
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
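                                /* VPM reads pop sequentially from the read
                                 * FIFO, so they must be emitted in order.
                                 */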
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
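                        /* Select-or-zero: conditionally MOV src[0] in, then
                         * zero the destination under the opposite condition
                         * (XOR of a register with itself gives 0).  The ^1
                         * relies on the QPU_COND_* values coming in
                         * true/false pairs.
                         */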
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
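                        /* Same pattern as select-or-zero, except the false
                         * case moves src[1] instead of zeroing.
                         */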
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
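                        /* SFU operations are started by writing the argument
                         * to a magic register, and the result lands in the
                         * r4 accumulator, so copy it out if dst isn't r4.
                         */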
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_PACK_8888_F:
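                        /* Use the MUL unit's pack mode (PM set) to convert
                         * the float to 8 bits, replicated across all four
                         * byte lanes.
                         */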
                        queue(c, qpu_m_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8888,
                                                       QPU_PACK);
                        break;

                case QOP_PACK_8A_F:
                case QOP_PACK_8B_F:
                case QOP_PACK_8C_F:
                case QOP_PACK_8D_F:
                        queue(c,
                              qpu_m_MOV(dst, src[0]) |
                              QPU_PM |
                              QPU_SET_FIELD(QPU_PACK_MUL_8A +
                                            qinst->op - QOP_PACK_8A_F,
                                            QPU_PACK));
                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_DISCARD_SETUP:
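                        /* MOVing the discard condition onto itself with SF
                         * updates the Z flags, so the TLB writes below can
                         * be made conditional on the pixel not being
                         * discarded.
                         */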
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
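                        /* A NOP carrying the COLOR_LOAD signal reads the
                         * current tile buffer color into r4.
                         */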
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
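                        /* Reading a varying leaves its C coefficient in r5;
                         * adding it in finishes the interpolation.
                         */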
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
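                        /* Texture coordinate writes; this relies on
                         * QOP_TEX_S..B being in the same order as the
                         * QPU_W_TMU0_S..B write addresses.
                         */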
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_DIRECT:
                        fixup_raddr_conflict(c, dst, &src[0], &src[1]);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
                        break;

                case QOP_TEX_RESULT:
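                        /* As with the TLB color load, the LOAD_TMU0 signal
                         * returns the texture result in r4.
                         */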
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);
                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* Skip emitting the MOV if it's a no-op. */
                        if (qir_is_raw_mov(qinst) &&
                            dst.mux == src[0].mux && dst.addr == src[0].addr) {
                                break;
                        }

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, dst, &src[0], &src[1]);

                        if (qir_is_mul(qinst)) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]) | unpack);
                                if (qinst->dst.pack) {
                                        *last_inst(c) |= QPU_PM;
                                        *last_inst(c) |= QPU_SET_FIELD(qinst->dst.pack,
                                                                       QPU_PACK);
                                }
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]) | unpack);
                                if (qinst->dst.pack) {
                                        assert(dst.mux == QPU_MUX_A);
                                        *last_inst(c) |= QPU_SET_FIELD(qinst->dst.pack,
                                                                       QPU_PACK);
                                }
                        }

                        break;
                }

                if (qinst->sf) {
                        assert(!qir_is_multi_instruction(qinst));
                        *last_inst(c) |= QPU_SF;
                }
        }

        qpu_schedule_instructions(c);

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
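        /* The PROG_END signal is followed by two delay slots before the
         * thread actually ends, so pad them with NOPs.
         */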
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
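                /* Fragment shaders must release the tile's scoreboard so the
                 * next thread covering these pixels can proceed; fold the
                 * signal into the final delay-slot NOP.
                 */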
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}