vc4: Drop dependency on r3 for color packing.
[mesa.git] src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

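/* Generated instructions are built up in a simple list and then patched in
 * place (condition codes, signals, pack/unpack bits) through last_inst()
 * before being handed to the scheduler.
 */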
static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = calloc(1, sizeof(*q));
        q->inst = inst;
        insert_at_tail(&c->qpu_inst_list, &q->link);
}

static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
        return &q->inst;
}

static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_A)
                        src->mux = QPU_MUX_B;
                else
                        src->mux = QPU_MUX_A;
                return true;

        default:
                return false;
        }
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead.
 */
static void
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg *src0, struct qpu_reg *src1)
{
        if ((src0->mux != QPU_MUX_A && src0->mux != QPU_MUX_B) ||
            src0->mux != src1->mux ||
            src0->addr == src1->addr) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

        queue(c, qpu_a_MOV(qpu_r3(), *src1));
        *src1 = qpu_r3();
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;

        make_empty_list(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
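                        /* Per the VideoCore IV VPM read setup layout,
                         * 0x1a00 selects horizontal 32-bit accesses with a
                         * stride of 1; the element count goes in bits 23:20
                         * and the starting VPM address in the low byte.
                         */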
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

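                /* Set up VPM writes the same way: horizontal 32-bit
                 * accesses with a stride of 1, starting at VPM address 0.
                 */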
                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        struct simple_node *node;
        foreach(node, &c->instructions) {
                struct qinst *qinst = (struct qinst *)node;

#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

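                /* QIR opcodes that map 1:1 onto a QPU ALU op, used by the
                 * default case below.  A() entries run on the add pipeline,
                 * M() entries on the mul pipeline.
                 */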
                static const struct {
                        uint32_t op;
                        bool is_mul;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name, false}
#define M(name) [QOP_##name] = {QPU_M_##name, true}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(MUL24),
                };

                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op. */
                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                        }
                        break;

                case QOP_SF:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
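                        /* Conditionally MOV src[0] in, then conditionally
                         * zero dst under the opposite condition.  The ^ 1
                         * relies on the QPU_COND_* encodings pairing each
                         * condition with its inverse (ZS/ZC, NS/NC) at
                         * adjacent values.
                         */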
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_VPM_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_VPM), src[0]));
                        break;

                case QOP_VPM_READ:
                        queue(c, qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)));
                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

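                        /* The SFU result lands in the r4 accumulator a
                         * couple of cycles after the write; the required
                         * delay is enforced when the instructions are
                         * scheduled, not here.
                         */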
                        queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_PACK_COLORS: {
                        /* We have to be careful not to start writing over one
                         * of our source values when incrementally writing the
                         * destination.  So, if the dst is one of the srcs, we
                         * pack that one first (and we pack 4 channels at once
                         * for the first pack).
                         */
                        struct qpu_reg first_pack = src[0];
                        for (int i = 0; i < 4; i++) {
                                if (src[i].mux == dst.mux &&
                                    src[i].addr == dst.addr) {
                                        first_pack = dst;
                                        break;
                                }
                        }
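                        /* The MUL-pipeline 8888 pack replicates the
                         * converted 8-bit value of first_pack into all four
                         * bytes of dst, so channels sourced from first_pack
                         * are already correct and get skipped below; the
                         * per-byte 8A..8D packs overwrite one channel each.
                         */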
                        queue(c, qpu_m_MOV(dst, first_pack));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8888,
                                                       QPU_PACK);

                        for (int i = 0; i < 4; i++) {
                                if (src[i].mux == first_pack.mux &&
                                    src[i].addr == first_pack.addr) {
                                        continue;
                                }

                                queue(c, qpu_m_MOV(dst, src[i]));
                                *last_inst(c) |= QPU_PM;
                                *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
                                                               QPU_PACK);
                        }

                        break;
                }

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_DISCARD_SETUP:
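                        /* MOV the discard condition onto itself purely to
                         * set the flags, so that the TLB Z and color writes
                         * below can be made conditional on it (ZS: only
                         * write when the discard value is zero).
                         */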
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

                case QOP_PACK_SCALED: {
                        uint64_t a = (qpu_a_MOV(dst, src[0]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16A,
                                                    QPU_PACK));
                        uint64_t b = (qpu_a_MOV(dst, src[1]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16B,
                                                    QPU_PACK));

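                        /* 16A packs into the low half of dst and 16B into
                         * the high half.  If dst aliases src[1], emit the
                         * 16B half first so the 16A write doesn't clobber
                         * src[1] before it's read.
                         */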
                        if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
                                queue(c, b);
                                queue(c, a);
                        } else {
                                queue(c, a);
                                queue(c, b);
                        }
                        break;
                }

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_DIRECT:
                        fixup_raddr_conflict(c, &src[0], &src[1]);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);

                        break;

                case QOP_R4_UNPACK_A:
                case QOP_R4_UNPACK_B:
                case QOP_R4_UNPACK_C:
                case QOP_R4_UNPACK_D:
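                        /* With the PM bit set, the UNPACK field applies to
                         * the r4 accumulator rather than to the regfile-A
                         * read, so the requested byte is unpacked straight
                         * out of r4.
                         */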
                        assert(src[0].mux == QPU_MUX_R4);
                        queue(c, qpu_a_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_R4_UNPACK_A),
                                                       QPU_UNPACK);

                        break;

                case QOP_UNPACK_8A:
                case QOP_UNPACK_8B:
                case QOP_UNPACK_8C:
                case QOP_UNPACK_8D: {
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're setting the unpack bits, a
                         * destination in regfile A would get re-packed,
                         * so bounce through r3 in that case.
                         */
                        struct qpu_reg orig_dst = dst;
                        if (orig_dst.mux == QPU_MUX_A)
                                dst = qpu_rn(3);

                        queue(c, qpu_a_FMAX(dst, src[0], src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_UNPACK_8A),
                                                       QPU_UNPACK);

                        if (orig_dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(orig_dst, dst));
                        }
                        break;
                }

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        /* A zero entry (QPU NOP) means the op is missing
                         * from the translate table above.
                         */
                        assert(translate[qinst->op].op != 0);

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, &src[0], &src[1]);

                        if (translate[qinst->op].is_mul) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        }
                        break;
                }
        }

        qpu_schedule_instructions(c);

        /* The thread-end instruction can't have a VPM write or read. */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* The thread-end instruction can't have a uniform read. */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* The thread-end instruction can't do TLB operations. */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
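        /* The PROG_END signal takes effect after two delay slots, so pad
         * with two NOPs.
         */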
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}