/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

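/* Emission is deferred: instructions are appended to c->qpu_inst_list so
 * that qpu_schedule_instructions() can reorder them before they are
 * serialized into c->qpu_insts.
 */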
static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
        q->inst = inst;
        list_addtail(&q->link, &c->qpu_inst_list);
}

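/* Returns a pointer to the most recently queued instruction, so the helpers
 * below can patch condition codes, pack modes, and signals into it.
 */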
static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)c->qpu_inst_list.prev;
        return &q->inst;
}

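/* Makes the last queued add-pipe instruction execute conditionally, based on
 * the flags set by a previous instruction carrying QPU_SF.
 */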
static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_SMALL_IMM) {
                        return false;
                } else {
                        if (src->mux == QPU_MUX_A)
                                src->mux = QPU_MUX_B;
                        else
                                src->mux = QPU_MUX_A;
                        return true;
                }

        default:
                return false;
        }
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead. We reserve ra31/rb31 for this purpose.
 */
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1)
{
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

        if (mux0 == QPU_MUX_A) {
                queue(c, qpu_a_MOV(qpu_rb(31), *src0));
                *src0 = qpu_rb(31);
        } else {
                queue(c, qpu_a_MOV(qpu_ra(31), *src0));
                *src0 = qpu_ra(31);
        }
}

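/* Applies the destination pack mode from the QIR instruction to the last
 * queued QPU instruction.  MUL-pipe packs are requested with the PM bit;
 * add-pipe packs go through the A-file pack unit, so the destination must
 * not have been swapped to the B file (hence the WS assert below).
 */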
static void
set_last_dst_pack(struct vc4_compile *c, struct qinst *inst)
{
        bool had_pm = *last_inst(c) & QPU_PM;
        bool had_ws = *last_inst(c) & QPU_WS;
        uint32_t unpack = QPU_GET_FIELD(*last_inst(c), QPU_UNPACK);

        if (!inst->dst.pack)
                return;

        *last_inst(c) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);

        if (qir_is_mul(inst)) {
                assert(!unpack || had_pm);
                *last_inst(c) |= QPU_PM;
        } else {
                assert(!unpack || !had_pm);
                assert(!had_ws); /* dst must be a-file to pack. */
        }
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        int last_vpm_read_index = -1;

        list_inithead(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
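                /* The 0x1a00 base value appears to select horizontal, 32-bit,
                 * stride-1 accesses in the VPM read setup word, with the
                 * element count going into bits 23:20 (per the VideoCore IV
                 * reference documentation).
                 */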
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

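                /* The write setup seems to reuse the same
                 * horizontal/32-bit/stride-1 encoding (the same 0x1a00 base
                 * value) for the VPM output.
                 */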
                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        list_for_each_entry(struct qinst, qinst, &c->instructions, link) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                static const struct {
                        uint32_t op;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name}
#define M(name) [QOP_##name] = {QPU_M_##name}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(V8MULD),
                        M(V8MIN),
                        M(V8MAX),
                        M(V8ADDS),
                        M(V8SUBS),
                        M(MUL24),

                        /* If we replicate src[0] out to src[1], this works
                         * out the same as a MOV.
                         */
                        [QOP_MOV] = { QPU_A_OR },
                        [QOP_FMOV] = { QPU_A_FMAX },
                };

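                /* Map each QIR source onto a QPU mux/raddr pair, recording
                 * any requested unpack mode.  Only one unpack field exists
                 * per instruction, so all sources must agree on it, and
                 * unpacking from r4 has to go through the PM path.
                 */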
                uint64_t unpack = 0;
                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                if (qinst->src[i].pack) {
                                        assert(!unpack ||
                                               unpack == qinst->src[i].pack);
                                        unpack = QPU_SET_FIELD(qinst->src[i].pack,
                                                               QPU_UNPACK);
                                        if (src[i].mux == QPU_MUX_R4)
                                                unpack |= QPU_PM;
                                }
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
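                /* The SEL operations lower to a pair of conditional MOVs:
                 * one predicated on the requested condition, one on its
                 * inverse (the ^1 below flips ZS<->ZC and NS<->NC).
                 */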
                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

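                /* The SFU ops work by writing the argument to a special
                 * function register; the result then appears in r4 (the
                 * scheduler is assumed to enforce the required latency
                 * before r4 is read back).
                 */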
                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

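                /* The MUL-pipe 8888 pack converts the float result to an
                 * 8-bit value replicated across all four bytes of the
                 * destination.
                 */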
                case QOP_PACK_8888_F:
                        queue(c, qpu_m_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8888,
                                                       QPU_PACK);
                        break;

                case QOP_PACK_8A_F:
                case QOP_PACK_8B_F:
                case QOP_PACK_8C_F:
                case QOP_PACK_8D_F:
                        queue(c,
                              qpu_m_MOV(dst, src[0]) |
                              QPU_PM |
                              QPU_SET_FIELD(QPU_PACK_MUL_8A +
                                            qinst->op - QOP_PACK_8A_F,
                                            QPU_PACK));
                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

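                /* The discard condition value is MOVed onto itself purely to
                 * set the flags (SF): non-discarded pixels have a zero
                 * value, so the later TLB writes can be predicated on ZS.
                 */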
                case QOP_TLB_DISCARD_SETUP:
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

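                /* Reading the varying FIFO loads r5 with the varying's C
                 * coefficient as a side effect; this FADD is assumed to run
                 * while that value is still live, completing the
                 * interpolation.
                 */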
                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

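                /* Texture coordinates go to the TMU's S/T/R/B registers;
                 * writing the S register is what appears to fire off the
                 * actual texture request.
                 */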
                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_DIRECT:
                        fixup_raddr_conflict(c, dst, &src[0], &src[1]);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);
                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        /* A zero entry means the QOP has no translation
                         * here (0 is also QPU_A_NOP).
                         */
                        assert(translate[qinst->op].op != 0);

                        /* Skip emitting the MOV if it's a no-op. */
                        if (qir_is_raw_mov(qinst) &&
                            dst.mux == src[0].mux && dst.addr == src[0].addr) {
                                break;
                        }

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, dst, &src[0], &src[1]);

                        if (qir_is_mul(qinst)) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        }
                        set_last_dst_pack(c, qinst);

                        break;
                }

                if (qinst->sf) {
                        assert(!qir_is_multi_instruction(qinst));
                        *last_inst(c) |= QPU_SF;
                }
        }

        qpu_schedule_instructions(c);

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
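        /* The thread-end signal has two delay slots; pad them with NOPs. */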
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
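                /* Fragment shaders implicitly hold the TLB scoreboard lock
                 * once they touch the TLB; release it on the final
                 * instruction.
                 */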
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}