vc4: Convert from simple_list.h to list.h
src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

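/*
 * Appends one raw 64-bit QPU instruction to the tail of the compile's list.
 * Ordering and any NOP insertion are handled later, when the queued
 * instructions go through qpu_schedule_instructions().
 */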
static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
        q->inst = inst;
        list_addtail(&q->link, &c->qpu_inst_list);
}

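/*
 * Returns a pointer to the most recently queued instruction, so callers can
 * OR in condition, pack/unpack, or signal bits after emitting it.
 */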
static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)c->qpu_inst_list.prev;
        return &q->inst;
}

static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_SMALL_IMM) {
                        return false;
                } else {
                        if (src->mux == QPU_MUX_A)
                                src->mux = QPU_MUX_B;
                        else
                                src->mux = QPU_MUX_A;
                        return true;
                }

        default:
                return false;
        }
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead.  We reserve ra31/rb31 for this purpose.
 */
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1)
{
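        /* Small immediates are encoded in the raddr_b field, so for conflict
         * detection they count as file-B reads.  There is no conflict when
         * src0 comes from an accumulator, the two sources use different
         * files, or both sources are literally the same register read.
         */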
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

        if (mux0 == QPU_MUX_A) {
                queue(c, qpu_a_MOV(qpu_rb(31), *src1));
                *src1 = qpu_rb(31);
        } else {
                queue(c, qpu_a_MOV(qpu_ra(31), *src1));
                *src1 = qpu_ra(31);
        }
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        int last_vpm_read_index = -1;
        /* Map from the QIR ops enum order to QPU unpack bits. */
        static const uint32_t unpack_map[] = {
                QPU_UNPACK_8A,
                QPU_UNPACK_8B,
                QPU_UNPACK_8C,
                QPU_UNPACK_8D,
                QPU_UNPACK_16A_TO_F32,
                QPU_UNPACK_16B_TO_F32,
        };

        list_inithead(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
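                /* The 0x00001a00 in the setup words below is, per the
                 * VideoCore IV VPM setup encoding, believed to select
                 * horizontal 32-bit vectors with a stride of 1; the element
                 * count goes in bits 23:20 and the VPM address in the low
                 * byte.
                 */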
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

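        /* Walk the QIR instruction stream, emitting one or more QPU
         * instructions per QIR instruction.
         */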
        list_for_each_entry(struct qinst, qinst, &c->instructions, link) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

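                /* Direct QIR-to-QPU mappings for the simple ALU ops: A()
                 * entries execute on the add pipeline, M() entries on the
                 * mul pipeline.
                 */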
                static const struct {
                        uint32_t op;
                        bool is_mul;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name, false}
#define M(name) [QOP_##name] = {QPU_M_##name, true}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(MUL24),
                };

                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op: a copy from
                         * an accumulator back to the same accumulator.
                         * Register-file destinations are always written,
                         * since a write address isn't directly comparable to
                         * a read address.
                         */
                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                        }
                        break;

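                /* The SEL_X_0 and SEL_X_Y families are built from two
                 * conditional instructions: a MOV predicated on the
                 * condition, then a write of the other value (zero via
                 * XOR x,x, or src[1]) predicated on the inverse.  XOR-ing
                 * the low bit of the op offset flips ZS<->ZC and NS<->NC,
                 * relying on how those conditions pair up in the enum.
                 */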
                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

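                /* The SFU ops are started by writing the operand to the
                 * matching SFU register; the result then shows up in r4.
                 * The required latency between the SFU write and the r4
                 * read is presumably enforced later, when the instructions
                 * are scheduled.
                 */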
                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

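                /* Byte packing goes through the mul pipeline: QPU_PM routes
                 * the pack field to the mul unit, and PACK_MUL_8888
                 * replicates the converted 8-bit value into all four bytes
                 * of the destination.
                 */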
                case QOP_PACK_8888_F:
                        queue(c, qpu_m_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8888,
                                                       QPU_PACK);
                        break;

                case QOP_PACK_8A_F:
                case QOP_PACK_8B_F:
                case QOP_PACK_8C_F:
                case QOP_PACK_8D_F:
                        /* If dst doesn't happen to already contain src[0],
                         * then we have to move it in.
                         */
                        if (qinst->src[0].file != QFILE_NULL &&
                            (src[0].mux != dst.mux || src[0].addr != dst.addr)) {
                                /* Don't overwrite src1 while setting up
                                 * the dst!
                                 */
                                if (dst.mux == src[1].mux &&
                                    dst.addr == src[1].addr) {
                                        queue(c, qpu_m_MOV(qpu_rb(31), src[1]));
                                        src[1] = qpu_rb(31);
                                }

                                queue(c, qpu_m_MOV(dst, src[0]));
                        }

                        queue(c, qpu_m_MOV(dst, src[1]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A +
                                                       qinst->op - QOP_PACK_8A_F,
                                                       QPU_PACK);
                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

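                /* Discard setup: a flag-setting self-MOV of the discard
                 * value, so the later TLB Z/color writes can be predicated
                 * on ZS and only land for undiscarded pixels.
                 */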
                case QOP_TLB_DISCARD_SETUP:
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

                case QOP_PACK_SCALED: {
                        uint64_t a = (qpu_a_MOV(dst, src[0]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16A,
                                                    QPU_PACK));
                        uint64_t b = (qpu_a_MOV(dst, src[1]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16B,
                                                    QPU_PACK));

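                        /* If dst aliases src[1], emit the 16B half first so
                         * that the 16A MOV doesn't overwrite src[1] before
                         * it has been read.
                         */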
                        if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
                                queue(c, b);
                                queue(c, a);
                        } else {
                                queue(c, a);
                                queue(c, b);
                        }
                        break;
                }

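                /* Texture setup: QPU_W_TMU0_S/T/R/B are consecutive write
                 * addresses, and the write to the S register is what
                 * actually fires off the TMU request.
                 */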
                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_DIRECT:
                        fixup_raddr_conflict(c, dst, &src[0], &src[1]);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);

                        break;

                case QOP_R4_UNPACK_A:
                case QOP_R4_UNPACK_B:
                case QOP_R4_UNPACK_C:
                case QOP_R4_UNPACK_D:
                        assert(src[0].mux == QPU_MUX_R4);
                        queue(c, qpu_a_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_R4_UNPACK_A),
                                                       QPU_UNPACK);

                        break;

                case QOP_UNPACK_8A_F:
                case QOP_UNPACK_8B_F:
                case QOP_UNPACK_8C_F:
                case QOP_UNPACK_8D_F:
                case QOP_UNPACK_16A_F:
                case QOP_UNPACK_16B_F: {
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're setting the pack bits, if the
                         * destination is in A it would get re-packed.
                         */
                        queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ?
                                             qpu_rb(31) : dst),
                                            src[0], src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op -
                                                                  QOP_UNPACK_8A_F],
                                                       QPU_UNPACK);

                        if (dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(dst, qpu_rb(31)));
                        }
                }
                        break;

                case QOP_UNPACK_8A_I:
                case QOP_UNPACK_8B_I:
                case QOP_UNPACK_8C_I:
                case QOP_UNPACK_8D_I:
                case QOP_UNPACK_16A_I:
                case QOP_UNPACK_16B_I: {
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're setting the pack bits, if the
                         * destination is in A it would get re-packed.
                         */
                        queue(c, qpu_a_MOV((dst.mux == QPU_MUX_A ?
                                            qpu_rb(31) : dst), src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op -
                                                                  QOP_UNPACK_8A_I],
                                                       QPU_UNPACK);

                        if (dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(dst, qpu_rb(31)));
                        }
                }
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, dst, &src[0], &src[1]);

                        if (translate[qinst->op].is_mul) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        }

                        break;
                }

                if (qinst->sf) {
                        assert(!qir_is_multi_instruction(qinst));
                        *last_inst(c) |= QPU_SF;
                }
        }

        qpu_schedule_instructions(c);

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

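        /* The program-end signal takes effect after two delay slots, so pad
         * with two NOPs; for fragment shaders the scoreboard unlock then
         * rides on the last of them.
         */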
        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}