vc4: Introduce scheduling of QPU instructions.
[mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

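/*
 * Generated instructions aren't emitted directly; they're appended to
 * c->qpu_inst_list so that qpu_schedule_instructions() can reorder them
 * into the final program at the end of code generation.
 */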
static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = calloc(1, sizeof(*q));
        q->inst = inst;
        insert_at_tail(&c->qpu_inst_list, &q->link);
}

static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
        return &q->inst;
}

static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_A)
                        src->mux = QPU_MUX_B;
                else
                        src->mux = QPU_MUX_A;
                return true;

        default:
                return false;
        }
}

/**
 * Resolves the case where we've register-allocated two different operands
 * of an instruction to the same physical register file, even though the
 * instruction has only one raddr field for that file.
 *
 * In that case, we move one of the operands to a temporary that the
 * instruction can read instead.
 */
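/*
 * Example (with a hypothetical allocation): if src0 and src1 both landed in
 * regfile A at different addresses, and neither is a special register that
 * swap_file() can flip to file B, then src1 is copied through the
 * accumulator r3 and the instruction reads r3 instead.
 */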
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg *src0, struct qpu_reg *src1)
{
        if ((src0->mux != QPU_MUX_A && src0->mux != QPU_MUX_B) ||
            src0->mux != src1->mux ||
            src0->addr == src1->addr) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

        queue(c, qpu_a_MOV(qpu_r3(), *src1));
        *src1 = qpu_r3();
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;

        make_empty_list(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
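                        /* VPM read setup immediate.  A sketch of the
                         * encoding as we understand it: the low bits hold
                         * the starting VPM address, 0x1a00 selects
                         * horizontal 32-bit accesses with a stride of 1,
                         * and bits 23:20 hold the element count, where 0
                         * encodes a full 16 (hence the "& 0xf").
                         */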
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

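                /* Matching VPM write setup for the vertex outputs:
                 * presumably horizontal 32-bit accesses with stride 1,
                 * starting at VPM address 0.
                 */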
                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        struct simple_node *node;
        foreach(node, &c->instructions) {
                struct qinst *qinst = (struct qinst *)node;

#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                static const struct {
                        uint32_t op;
                        bool is_mul;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name, false}
#define M(name) [QOP_##name] = {QPU_M_##name, true}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(MUL24),
                };

                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op. */
                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                        }
                        break;

                case QOP_SF:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
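                        /* Select between src[0] and 0 using conditional
                         * execution: the MOV writes src[0] where the op's
                         * condition holds, and the XOR (a constant 0)
                         * writes where the inverted condition holds.  The
                         * "^ 1" relies on QPU_COND_ZS/ZC and NS/NC being
                         * adjacent pairs.
                         */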
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_VPM_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_VPM), src[0]));
                        break;

                case QOP_VPM_READ:
                        queue(c, qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)));
                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
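                        /* Writing the operand to an SFU register kicks off
                         * the special-function computation, and the result
                         * lands in the accumulator r4 a couple of cycles
                         * later; the required delay before the MOV out of
                         * r4 is presumably left to the scheduler.
                         */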
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_PACK_COLORS:
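                        /* Pack the four float channels into an RGBA8 value
                         * in r3: with PM set, each MUL-unit MOV packs one
                         * source into one byte of r3 via the
                         * QPU_PACK_MUL_8A..8D modes.
                         */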
                        for (int i = 0; i < 4; i++) {
                                queue(c, qpu_m_MOV(qpu_r3(), src[i]));
                                *last_inst(c) |= QPU_PM;
                                *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
                                                               QPU_PACK);
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r3()));

                        break;

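                /* The pixel X/Y coordinates come from a single payload
                 * register: reading it through regfile A yields X, and
                 * through regfile B yields Y.
                 */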
                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z and QOP_FRAG_W emit no instructions;
                         * the register allocator just pins them to the Z/W
                         * payload registers.
                         */
                        break;

                case QOP_TLB_DISCARD_SETUP:
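                        /* Note that the shader discards, and set the Z/N
                         * flags from the discard condition in src[0].  The
                         * TLB Z and color writes below are then made
                         * conditional on ZS so discarded pixels leave the
                         * tile buffer untouched.
                         */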
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
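                        /* A tile-buffer color read is requested by a NOP
                         * carrying the COLOR_LOAD signal; the loaded color
                         * shows up in r4.
                         */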
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

                case QOP_PACK_SCALED: {
                        uint64_t a = (qpu_a_MOV(dst, src[0]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16A,
                                                    QPU_PACK));
                        uint64_t b = (qpu_a_MOV(dst, src[1]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16B,
                                                    QPU_PACK));

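                        /* If writing the 16A half would clobber src[1]
                         * (dst and src[1] are the same register), emit the
                         * 16B half first.
                         */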
                        if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
                                queue(c, b);
                                queue(c, a);
                        } else {
                                queue(c, a);
                                queue(c, b);
                        }
                        break;
                }

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
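                        /* Write the coordinate to the corresponding TMU0
                         * register.  The offset arithmetic relies on the
                         * QOP_TEX_* enums being in the same order as
                         * QPU_W_TMU0_S..B.
                         */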
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_DIRECT:
                        fixup_raddr_conflict(c, &src[0], &src[1]);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);

                        break;

                case QOP_R4_UNPACK_A:
                case QOP_R4_UNPACK_B:
                case QOP_R4_UNPACK_C:
                case QOP_R4_UNPACK_D:
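                        /* With the PM bit set, the unpack field applies to
                         * reads of r4, so a plain MOV extracts the selected
                         * 8-bit channel.
                         */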
                        assert(src[0].mux == QPU_MUX_R4);
                        queue(c, qpu_a_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_R4_UNPACK_A),
                                                       QPU_UNPACK);

                        break;

                case QOP_UNPACK_8A:
                case QOP_UNPACK_8B:
                case QOP_UNPACK_8C:
                case QOP_UNPACK_8D: {
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're setting the pack bits, a destination
                         * in regfile A would get re-packed, so unpack into
                         * r3 instead and MOV to the real destination
                         * afterward.
                         */
                        struct qpu_reg orig_dst = dst;
                        if (orig_dst.mux == QPU_MUX_A)
                                dst = qpu_rn(3);

                        queue(c, qpu_a_FMAX(dst, src[0], src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_UNPACK_8A),
                                                       QPU_UNPACK);

                        if (orig_dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(orig_dst, dst));
                        }
                        break;
                }

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, &src[0], &src[1]);

                        if (translate[qinst->op].is_mul) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        }
                        break;
                }
        }

        qpu_schedule_instructions(c);

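        /* At this point the scheduled program lives in c->qpu_insts[] /
         * c->qpu_inst_count, and the thread-end fixups below append to it
         * one instruction at a time.
         */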
        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

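        /* Attach the program-end signal to the last instruction.  The two
         * NOPs queued after it are its delay slots: the QPU executes two
         * more instructions after the PROG_END signal is issued.
         */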
        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
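                /* Fragment shaders also have to release the tile-buffer
                 * scoreboard, so tack the unlock signal onto the final
                 * instruction (one of the trailing NOPs).
                 */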
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}