e0d3633da4268b22233ae2527143a7684dae4d79
[mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_emit.c
1 /*
2 * Copyright © 2014 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <inttypes.h>
25
26 #include "vc4_context.h"
27 #include "vc4_qir.h"
28 #include "vc4_qpu.h"
29 #include "util/ralloc.h"
30
31 static void
32 vc4_dump_program(struct vc4_compile *c)
33 {
34 fprintf(stderr, "%s prog %d/%d QPU:\n",
35 qir_get_stage_name(c->stage),
36 c->program_id, c->variant_id);
37
38 for (int i = 0; i < c->qpu_inst_count; i++) {
39 fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
40 vc4_qpu_disasm(&c->qpu_insts[i], 1);
41 fprintf(stderr, "\n");
42 }
43 }
44
45 static void
46 queue(struct vc4_compile *c, uint64_t inst)
47 {
48 struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
49 q->inst = inst;
50 list_addtail(&q->link, &c->qpu_inst_list);
51 }
52
53 static uint64_t *
54 last_inst(struct vc4_compile *c)
55 {
56 struct queued_qpu_inst *q =
57 (struct queued_qpu_inst *)c->qpu_inst_list.prev;
58 return &q->inst;
59 }
60
/* Sets the add-pipe condition field on the last queued instruction. */
static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        uint64_t *inst = last_inst(c);

        *inst = qpu_set_cond_add(*inst, cond);
}
66
67 /**
68 * Some special registers can be read from either file, which lets us resolve
69 * raddr conflicts without extra MOVs.
70 */
71 static bool
72 swap_file(struct qpu_reg *src)
73 {
74 switch (src->addr) {
75 case QPU_R_UNIF:
76 case QPU_R_VARY:
77 if (src->mux == QPU_MUX_SMALL_IMM) {
78 return false;
79 } else {
80 if (src->mux == QPU_MUX_A)
81 src->mux = QPU_MUX_B;
82 else
83 src->mux = QPU_MUX_A;
84 return true;
85 }
86
87 default:
88 return false;
89 }
90 }
91
/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead.  We reserve ra31/rb31 for this purpose.
 */
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1,
                     struct qinst *inst, uint64_t *unpack)
{
        /* Small immediates are encoded in the B-file raddr, so treat them
         * as B-file reads for conflict detection.
         */
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

        /* No conflict when src0 is an accumulator, the two sources come
         * from different files, or both name the exact same register (a
         * single raddr then serves both operands).
         */
        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return;
        }

        /* Prefer flipping one source to the other file over emitting an
         * extra MOV, when its special register allows it.
         */
        if (swap_file(src0) || swap_file(src1))
                return;

        if (mux0 == QPU_MUX_A) {
                /* Make sure we use the same type of MOV as the instruction,
                 * in case of unpacks.
                 */
                if (qir_is_float_input(inst))
                        queue(c, qpu_a_FMAX(qpu_rb(31), *src0, *src0));
                else
                        queue(c, qpu_a_MOV(qpu_rb(31), *src0));

                /* If we had an unpack on this A-file source, we need to put
                 * it into this MOV, not into the later move from regfile B.
                 */
                if (inst->src[0].pack) {
                        *last_inst(c) |= *unpack;
                        *unpack = 0;
                }
                *src0 = qpu_rb(31);
        } else {
                queue(c, qpu_a_MOV(qpu_ra(31), *src0));
                *src0 = qpu_ra(31);
        }
}
142
143 static void
144 set_last_dst_pack(struct vc4_compile *c, struct qinst *inst)
145 {
146 bool had_pm = *last_inst(c) & QPU_PM;
147 bool had_ws = *last_inst(c) & QPU_WS;
148 uint32_t unpack = QPU_GET_FIELD(*last_inst(c), QPU_UNPACK);
149
150 if (!inst->dst.pack)
151 return;
152
153 *last_inst(c) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);
154
155 if (qir_is_mul(inst)) {
156 assert(!unpack || had_pm);
157 *last_inst(c) |= QPU_PM;
158 } else {
159 assert(!unpack || !had_pm);
160 assert(!had_ws); /* dst must be a-file to pack. */
161 }
162 }
163
/**
 * Translates the compile's QIR instruction list into QPU instructions,
 * schedules them, and appends the thread-end epilogue (program-end signal,
 * trailing NOPs, and scoreboard unlock for fragment shaders).
 */
void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        int last_vpm_read_index = -1;

        list_inithead(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

                /* Set up the VPM write FIFO as well. */
                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        list_for_each_entry(struct qinst, qinst, &c->instructions, link) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                /* Maps simple one-to-one QIR ALU ops to their QPU opcode.
                 * Ops needing multi-instruction sequences are handled in the
                 * switch below instead.
                 */
                static const struct {
                        uint32_t op;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name}
#define M(name) [QOP_##name] = {QPU_M_##name}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(V8MULD),
                        M(V8MIN),
                        M(V8MAX),
                        M(V8ADDS),
                        M(V8SUBS),
                        M(MUL24),

                        /* If we replicate src[0] out to src[1], this works
                         * out the same as a MOV.
                         */
                        [QOP_MOV] = { QPU_A_OR },
                        [QOP_FMOV] = { QPU_A_FMAX },
                        [QOP_MMOV] = { QPU_M_V8MIN },
                };

                /* Translate each QIR source into a QPU register, collecting
                 * any unpack mode requested on the source along the way.
                 */
                uint64_t unpack = 0;
                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                if (qinst->src[i].pack) {
                                        /* Only one unpack mode fits in the
                                         * instruction, so all unpacked
                                         * sources must agree.
                                         */
                                        assert(!unpack ||
                                               unpack == qinst->src[i].pack);
                                        unpack = QPU_SET_FIELD(qinst->src[i].pack,
                                                               QPU_UNPACK);
                                        /* r4 unpacks go through the PM path. */
                                        if (src[i].mux == QPU_MUX_R4)
                                                unpack |= QPU_PM;
                                }
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                /* VPM reads must be in nondecreasing order. */
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;
                        }
                }

                /* Translate the QIR destination into a QPU register. */
                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
                case QOP_SEL_X_0_CS:
                case QOP_SEL_X_0_CC:
                        /* Select between src[0] and 0: a conditional MOV of
                         * src[0], then a conditional zeroing XOR using the
                         * complementary condition (op offset XOR 1).
                         */
                        queue(c, qpu_a_MOV(dst, src[0]) | unpack);
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                case QOP_SEL_X_Y_CS:
                case QOP_SEL_X_Y_CC:
                        /* Select between two sources: two conditional MOVs
                         * with complementary conditions, each picking up the
                         * unpack only if that source requested one.
                         */
                        queue(c, qpu_a_MOV(dst, src[0]));
                        if (qinst->src[0].pack)
                                *(last_inst(c)) |= unpack;
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        if (qinst->src[1].pack)
                                *(last_inst(c)) |= unpack;
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        /* SFU operations: write the operand to the matching
                         * SFU register; the result appears in r4, so copy it
                         * out unless dst already is r4.
                         */
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]) | unpack);
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]) | unpack);
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]) | unpack);
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]) | unpack);
                                break;
                        default:
                                abort();
                        }

                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_DISCARD_SETUP:
                        /* A self-MOV with SF set establishes the flags that
                         * later TLB writes are conditionalized on.
                         */
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]) | unpack);
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        /* NOTE(review): unpack is asserted zero just above,
                         * so the `| unpack` here is a no-op.
                         */
                        assert(!unpack);
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP),
                                           src[0]) | unpack);
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z),
                                           src[0]) | unpack);
                        /* Under discard, only write Z where the discard
                         * flags allow it.
                         */
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        /* The color load arrives in r4 via the signal. */
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]) | unpack);
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        /* Varying interpolation: add the C coefficient
                         * delivered in r5.
                         */
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
                        break;

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        /* Texture coordinate writes go to consecutive TMU0
                         * setup registers, indexed by the op.
                         */
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]) | unpack);
                        break;

                case QOP_TEX_DIRECT:
                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S),
                                           src[0], src[1]) | unpack);
                        break;

                case QOP_TEX_RESULT:
                        /* The texture result arrives in r4 via the signal. */
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);
                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                        break;

                default:
                        /* Everything else goes through the one-to-one
                         * translate[] table above.
                         */
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* Skip emitting the MOV if it's a no-op. */
                        if (qir_is_raw_mov(qinst) &&
                            dst.mux == src[0].mux && dst.addr == src[0].addr) {
                                break;
                        }

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        if (qir_is_mul(qinst)) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]) | unpack);
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]) | unpack);
                        }
                        set_last_dst_pack(c, qinst);

                        break;
                }

                /* Propagate a requested set-flags onto the (single) emitted
                 * instruction.
                 */
                if (qinst->sf) {
                        assert(!qir_is_multi_instruction(qinst));
                        *last_inst(c) |= QPU_SF;
                }
        }

        qpu_schedule_instructions(c);

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        /* Set the program-end signal on the last instruction and append two
         * NOPs behind it for the signal's delay slots.
         */
        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                /* Fragment threads release the tile scoreboard on the final
                 * instruction.
                 */
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}