vc4: Correct typo setting 'handled_qinst_cond'
[mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_emit.c
1 /*
2 * Copyright © 2014 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <inttypes.h>
25
26 #include "vc4_context.h"
27 #include "vc4_qir.h"
28 #include "vc4_qpu.h"
29 #include "util/ralloc.h"
30
31 static void
32 vc4_dump_program(struct vc4_compile *c)
33 {
34 fprintf(stderr, "%s prog %d/%d QPU:\n",
35 qir_get_stage_name(c->stage),
36 c->program_id, c->variant_id);
37
38 for (int i = 0; i < c->qpu_inst_count; i++) {
39 fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
40 vc4_qpu_disasm(&c->qpu_insts[i], 1);
41 fprintf(stderr, "\n");
42 }
43 }
44
45 static void
46 queue(struct vc4_compile *c, uint64_t inst)
47 {
48 struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
49 q->inst = inst;
50 list_addtail(&q->link, &c->qpu_inst_list);
51 }
52
53 static uint64_t *
54 last_inst(struct vc4_compile *c)
55 {
56 struct queued_qpu_inst *q =
57 (struct queued_qpu_inst *)c->qpu_inst_list.prev;
58 return &q->inst;
59 }
60
/* Sets the add-unit condition field on the last queued instruction. */
static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        uint64_t *inst = last_inst(c);

        *inst = qpu_set_cond_add(*inst, cond);
}
66
/* Sets the mul-unit condition field on the last queued instruction. */
static void
set_last_cond_mul(struct vc4_compile *c, uint32_t cond)
{
        uint64_t *inst = last_inst(c);

        *inst = qpu_set_cond_mul(*inst, cond);
}
72
73 /**
74 * Some special registers can be read from either file, which lets us resolve
75 * raddr conflicts without extra MOVs.
76 */
77 static bool
78 swap_file(struct qpu_reg *src)
79 {
80 switch (src->addr) {
81 case QPU_R_UNIF:
82 case QPU_R_VARY:
83 if (src->mux == QPU_MUX_SMALL_IMM) {
84 return false;
85 } else {
86 if (src->mux == QPU_MUX_A)
87 src->mux = QPU_MUX_B;
88 else
89 src->mux = QPU_MUX_A;
90 return true;
91 }
92
93 default:
94 return false;
95 }
96 }
97
98 /**
99 * This is used to resolve the fact that we might register-allocate two
100 * different operands of an instruction to the same physical register file
101 * even though instructions have only one field for the register file source
102 * address.
103 *
104 * In that case, we need to move one to a temporary that can be used in the
105 * instruction, instead. We reserve ra31/rb31 for this purpose.
106 */
107 static void
108 fixup_raddr_conflict(struct vc4_compile *c,
109 struct qpu_reg dst,
110 struct qpu_reg *src0, struct qpu_reg *src1,
111 struct qinst *inst, uint64_t *unpack)
112 {
113 uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
114 uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;
115
116 if (mux0 <= QPU_MUX_R5 ||
117 mux0 != mux1 ||
118 (src0->addr == src1->addr &&
119 src0->mux == src1->mux)) {
120 return;
121 }
122
123 if (swap_file(src0) || swap_file(src1))
124 return;
125
126 if (mux0 == QPU_MUX_A) {
127 /* Make sure we use the same type of MOV as the instruction,
128 * in case of unpacks.
129 */
130 if (qir_is_float_input(inst))
131 queue(c, qpu_a_FMAX(qpu_rb(31), *src0, *src0));
132 else
133 queue(c, qpu_a_MOV(qpu_rb(31), *src0));
134
135 /* If we had an unpack on this A-file source, we need to put
136 * it into this MOV, not into the later move from regfile B.
137 */
138 if (inst->src[0].pack) {
139 *last_inst(c) |= *unpack;
140 *unpack = 0;
141 }
142 *src0 = qpu_rb(31);
143 } else {
144 queue(c, qpu_a_MOV(qpu_ra(31), *src0));
145 *src0 = qpu_ra(31);
146 }
147 }
148
149 static void
150 set_last_dst_pack(struct vc4_compile *c, struct qinst *inst)
151 {
152 bool had_pm = *last_inst(c) & QPU_PM;
153 bool had_ws = *last_inst(c) & QPU_WS;
154 uint32_t unpack = QPU_GET_FIELD(*last_inst(c), QPU_UNPACK);
155
156 if (!inst->dst.pack)
157 return;
158
159 *last_inst(c) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);
160
161 if (qir_is_mul(inst)) {
162 assert(!unpack || had_pm);
163 *last_inst(c) |= QPU_PM;
164 } else {
165 assert(!unpack || !had_pm);
166 assert(!had_ws); /* dst must be a-file to pack. */
167 }
168 }
169
/**
 * Emits the final QPU instruction stream for the program: translates
 * each QIR instruction into QPU instructions, schedules them, then
 * appends the thread-end sequence and validates the result.
 *
 * \param vc4  context, used for register allocation and debug flags.
 * \param c    compile job; on return c->qpu_insts / c->qpu_inst_count
 *             hold the finished instruction stream.
 */
void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        int last_vpm_read_index = -1;

        list_inithead(&c->qpu_inst_list);

        /* Per-stage prologue: vertex/coord shaders program the VPM read
         * and write setup before any instructions run.
         */
        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        list_for_each_entry(struct qinst, qinst, &c->instructions, link) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                /* Direct QIR-opcode -> QPU ALU-opcode table for the simple
                 * operations handled by the default case below.
                 */
                static const struct {
                        uint32_t op;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name}
#define M(name) [QOP_##name] = {QPU_M_##name}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(V8MULD),
                        M(V8MIN),
                        M(V8MAX),
                        M(V8ADDS),
                        M(V8SUBS),
                        M(MUL24),

                        /* If we replicate src[0] out to src[1], this works
                         * out the same as a MOV.
                         */
                        [QOP_MOV] = { QPU_A_OR },
                        [QOP_FMOV] = { QPU_A_FMAX },
                        [QOP_MMOV] = { QPU_M_V8MIN },
                };

                /* Translate the QIR sources into QPU registers, collecting
                 * any unpack mode requested on a source into 'unpack'.
                 */
                uint64_t unpack = 0;
                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                if (qinst->src[i].pack) {
                                        /* Only one unpack mode can be in
                                         * flight per instruction.
                                         */
                                        assert(!unpack ||
                                               unpack == qinst->src[i].pack);
                                        unpack = QPU_SET_FIELD(qinst->src[i].pack,
                                                               QPU_UNPACK);
                                        if (src[i].mux == QPU_MUX_R4)
                                                unpack |= QPU_PM;
                                }
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                /* VPM reads must arrive in nondecreasing
                                 * index order; the (void) keeps the
                                 * variable "used" in NDEBUG builds.
                                 */
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;
                        }
                }

                /* Translate the QIR destination into a QPU register. */
                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                        assert(!"not reached");
                        break;
                }

                /* Set by a case below once it has applied qinst->cond to
                 * the emitted instruction; the assert after the switch
                 * catches a conditional qinst going through a case that
                 * ignored the condition.
                 */
                bool handled_qinst_cond = false;

                switch (qinst->op) {
                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        /* SFU ops: write the argument to the matching SFU
                         * register, then the result shows up in r4.
                         */
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]) | unpack);
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]) | unpack);
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]) | unpack);
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]) | unpack);
                                break;
                        default:
                                abort();
                        }

                        /* Copy the r4 result out unless the dst was
                         * register-allocated to r4 itself.
                         */
                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_MS_MASK:
                        src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(c, qpu_a_AND(qpu_ra(QPU_W_MS_FLAGS),
                                           src[0], src[1]) | unpack);
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_DISCARD_SETUP:
                        /* MOV with SF sets the flags from the discard
                         * value; later TLB writes are then made conditional
                         * on QPU_COND_ZS while 'discard' is set.
                         */
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]) | unpack);
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        /* NOTE(review): unpack is asserted zero here, so
                         * the "| unpack" below is a no-op kept for
                         * uniformity with the other TLB cases.
                         */
                        assert(!unpack);
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP),
                                           src[0]) | unpack);
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z),
                                           src[0]) | unpack);
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        /* The color load signal delivers its result in r4. */
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]) | unpack);
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_WRITE_MS:
                        queue(c, qpu_a_MOV(qpu_tlbc_ms(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        /* Varying interpolation: add the C coefficient
                         * (delivered in r5) to the varying value.
                         */
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
                        break;

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        /* The TMU0 coordinate registers are consecutive, in
                         * the same order as the QOP_TEX_* opcodes.
                         */
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]) | unpack);
                        break;

                case QOP_TEX_DIRECT:
                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S),
                                           src[0], src[1]) | unpack);
                        break;

                case QOP_TEX_RESULT:
                        /* The TMU load signal delivers its result in r4. */
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);
                        if (dst.mux != QPU_MUX_R4)
                                queue(c, qpu_a_MOV(dst, qpu_r4()));
                        break;

                default:
                        /* Generic ALU path, driven by the translate[]
                         * table above.
                         */
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* Skip emitting the MOV if it's a no-op. */
                        if (qir_is_raw_mov(qinst) &&
                            dst.mux == src[0].mux && dst.addr == src[0].addr) {
                                break;
                        }

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        if (qir_is_mul(qinst)) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]) | unpack);
                                set_last_cond_mul(c, qinst->cond);
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]) | unpack);
                                set_last_cond_add(c, qinst->cond);
                        }
                        handled_qinst_cond = true;
                        set_last_dst_pack(c, qinst);

                        break;
                }

                /* Instruction conditions are only applied on the generic
                 * ALU path above.
                 */
                assert(qinst->cond == QPU_COND_ALWAYS ||
                       handled_qinst_cond);

                if (qinst->sf) {
                        assert(!qir_is_multi_instruction(qinst));
                        *last_inst(c) |= QPU_SF;
                }
        }

        uint32_t cycles = qpu_schedule_instructions(c);
        uint32_t inst_count_at_schedule_time = c->qpu_inst_count;

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        /* Tag the last instruction as the program end, followed by two
         * NOPs.  NOTE(review): presumably the required thread-end delay
         * slots -- confirm against the VC4 architecture docs.
         */
        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                /* Fragment shaders release the scoreboard lock on their
                 * final instruction.
                 */
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        /* Account for the epilogue instructions added after scheduling. */
        cycles += c->qpu_inst_count - inst_count_at_schedule_time;

        if (vc4_debug & VC4_DEBUG_SHADERDB) {
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id,
                        cycles);
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}