Merge remote-tracking branch 'origin/master' into vulkan
[mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_emit.c
1 /*
2 * Copyright © 2014 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <inttypes.h>
25
26 #include "vc4_context.h"
27 #include "vc4_qir.h"
28 #include "vc4_qpu.h"
29 #include "util/ralloc.h"
30
31 static void
32 vc4_dump_program(struct vc4_compile *c)
33 {
34 fprintf(stderr, "%s prog %d/%d QPU:\n",
35 qir_get_stage_name(c->stage),
36 c->program_id, c->variant_id);
37
38 for (int i = 0; i < c->qpu_inst_count; i++) {
39 fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
40 vc4_qpu_disasm(&c->qpu_insts[i], 1);
41 fprintf(stderr, "\n");
42 }
43 }
44
45 static void
46 queue(struct vc4_compile *c, uint64_t inst)
47 {
48 struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
49 q->inst = inst;
50 list_addtail(&q->link, &c->qpu_inst_list);
51 }
52
53 static uint64_t *
54 last_inst(struct vc4_compile *c)
55 {
56 struct queued_qpu_inst *q =
57 (struct queued_qpu_inst *)c->qpu_inst_list.prev;
58 return &q->inst;
59 }
60
/* Patches the condition code of the last queued instruction's ADD op. */
static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        uint64_t *inst = last_inst(c);

        *inst = qpu_set_cond_add(*inst, cond);
}
66
/* Patches the condition code of the last queued instruction's MUL op. */
static void
set_last_cond_mul(struct vc4_compile *c, uint32_t cond)
{
        uint64_t *inst = last_inst(c);

        *inst = qpu_set_cond_mul(*inst, cond);
}
72
73 /**
74 * Some special registers can be read from either file, which lets us resolve
75 * raddr conflicts without extra MOVs.
76 */
77 static bool
78 swap_file(struct qpu_reg *src)
79 {
80 switch (src->addr) {
81 case QPU_R_UNIF:
82 case QPU_R_VARY:
83 if (src->mux == QPU_MUX_SMALL_IMM) {
84 return false;
85 } else {
86 if (src->mux == QPU_MUX_A)
87 src->mux = QPU_MUX_B;
88 else
89 src->mux = QPU_MUX_A;
90 return true;
91 }
92
93 default:
94 return false;
95 }
96 }
97
98 /**
99 * This is used to resolve the fact that we might register-allocate two
100 * different operands of an instruction to the same physical register file
101 * even though instructions have only one field for the register file source
102 * address.
103 *
104 * In that case, we need to move one to a temporary that can be used in the
105 * instruction, instead. We reserve ra31/rb31 for this purpose.
106 */
107 static void
108 fixup_raddr_conflict(struct vc4_compile *c,
109 struct qpu_reg dst,
110 struct qpu_reg *src0, struct qpu_reg *src1,
111 struct qinst *inst, uint64_t *unpack)
112 {
113 uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
114 uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;
115
116 if (mux0 <= QPU_MUX_R5 ||
117 mux0 != mux1 ||
118 (src0->addr == src1->addr &&
119 src0->mux == src1->mux)) {
120 return;
121 }
122
123 if (swap_file(src0) || swap_file(src1))
124 return;
125
126 if (mux0 == QPU_MUX_A) {
127 /* Make sure we use the same type of MOV as the instruction,
128 * in case of unpacks.
129 */
130 if (qir_is_float_input(inst))
131 queue(c, qpu_a_FMAX(qpu_rb(31), *src0, *src0));
132 else
133 queue(c, qpu_a_MOV(qpu_rb(31), *src0));
134
135 /* If we had an unpack on this A-file source, we need to put
136 * it into this MOV, not into the later move from regfile B.
137 */
138 if (inst->src[0].pack) {
139 *last_inst(c) |= *unpack;
140 *unpack = 0;
141 }
142 *src0 = qpu_rb(31);
143 } else {
144 queue(c, qpu_a_MOV(qpu_ra(31), *src0));
145 *src0 = qpu_ra(31);
146 }
147 }
148
149 static void
150 set_last_dst_pack(struct vc4_compile *c, struct qinst *inst)
151 {
152 bool had_pm = *last_inst(c) & QPU_PM;
153 bool had_ws = *last_inst(c) & QPU_WS;
154 uint32_t unpack = QPU_GET_FIELD(*last_inst(c), QPU_UNPACK);
155
156 if (!inst->dst.pack)
157 return;
158
159 *last_inst(c) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);
160
161 if (qir_is_mul(inst)) {
162 assert(!unpack || had_pm);
163 *last_inst(c) |= QPU_PM;
164 } else {
165 assert(!unpack || !had_pm);
166 assert(!had_ws); /* dst must be a-file to pack. */
167 }
168 }
169
/**
 * vc4_generate_code() - lowers the compile's QIR instruction list into the
 * final stream of 64-bit QPU instructions in c->qpu_insts.
 *
 * Walks c->instructions translating each qinst into one or more queued QPU
 * instructions (using the physical registers assigned by
 * vc4_register_allocate()), runs the scheduler, then patches in the
 * mandatory program-end epilogue and validates the result.
 */
170 void
171 vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
172 {
173 struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
174 uint32_t inputs_remaining = c->num_inputs;
175 uint32_t vpm_read_fifo_count = 0;
176 uint32_t vpm_read_offset = 0;
177 int last_vpm_read_index = -1;
178
179 list_inithead(&c->qpu_inst_list);
180
/* Vertex/coord shaders read their attributes from the VPM, so emit the
 * VPM read-FIFO setup writes before any translated instructions.
 */
181 switch (c->stage) {
182 case QSTAGE_VERT:
183 case QSTAGE_COORD:
184 /* There's a 4-entry FIFO for VPMVCD reads, each of which can
185 * load up to 16 dwords (4 vec4s) per vertex.
186 */
187 while (inputs_remaining) {
188 uint32_t num_entries = MIN2(inputs_remaining, 16);
189 queue(c, qpu_load_imm_ui(qpu_vrsetup(),
190 vpm_read_offset |
191 0x00001a00 |
192 ((num_entries & 0xf) << 20)));
193 inputs_remaining -= num_entries;
194 vpm_read_offset += num_entries;
195 vpm_read_fifo_count++;
196 }
197 assert(vpm_read_fifo_count <= 4);
198
/* Also program the VPM write setup register for the vertex outputs. */
199 queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
200 break;
201 case QSTAGE_FRAG:
202 break;
203 }
204
205 list_for_each_entry(struct qinst, qinst, &c->instructions, link) {
206 #if 0
207 fprintf(stderr, "translating qinst to qpu: ");
208 qir_dump_inst(qinst);
209 fprintf(stderr, "\n");
210 #endif
211
/* QOP -> QPU opcode table for the ALU operations that map 1:1;
 * everything else is handled by explicit cases in the switch below.
 */
212 static const struct {
213 uint32_t op;
214 } translate[] = {
215 #define A(name) [QOP_##name] = {QPU_A_##name}
216 #define M(name) [QOP_##name] = {QPU_M_##name}
217 A(FADD),
218 A(FSUB),
219 A(FMIN),
220 A(FMAX),
221 A(FMINABS),
222 A(FMAXABS),
223 A(FTOI),
224 A(ITOF),
225 A(ADD),
226 A(SUB),
227 A(SHL),
228 A(SHR),
229 A(ASR),
230 A(MIN),
231 A(MAX),
232 A(AND),
233 A(OR),
234 A(XOR),
235 A(NOT),
236
237 M(FMUL),
238 M(V8MULD),
239 M(V8MIN),
240 M(V8MAX),
241 M(V8ADDS),
242 M(V8SUBS),
243 M(MUL24),
244
245 /* If we replicate src[0] out to src[1], this works
246 * out the same as a MOV.
247 */
248 [QOP_MOV] = { QPU_A_OR },
249 [QOP_FMOV] = { QPU_A_FMAX },
250 [QOP_MMOV] = { QPU_M_V8MIN },
251 };
252
/* Translate each qinst source into a QPU register encoding, collecting
 * any unpack mode that has to be folded into the emitted instruction.
 */
253 uint64_t unpack = 0;
254 struct qpu_reg src[4];
255 for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
256 int index = qinst->src[i].index;
257 switch (qinst->src[i].file) {
258 case QFILE_NULL:
259 src[i] = qpu_rn(0);
260 break;
261 case QFILE_TEMP:
262 src[i] = temp_registers[index];
263 if (qinst->src[i].pack) {
/* Only one unpack mode fits per instruction, so all packed
 * sources must agree on it.
 */
264 assert(!unpack ||
265 unpack == qinst->src[i].pack);
266 unpack = QPU_SET_FIELD(qinst->src[i].pack,
267 QPU_UNPACK);
268 if (src[i].mux == QPU_MUX_R4)
269 unpack |= QPU_PM;
270 }
271 break;
272 case QFILE_UNIF:
273 src[i] = qpu_unif();
274 break;
275 case QFILE_VARY:
276 src[i] = qpu_vary();
277 break;
278 case QFILE_SMALL_IMM:
279 src[i].mux = QPU_MUX_SMALL_IMM;
280 src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
281 /* This should only have returned a valid
282 * small immediate field, not ~0 for failure.
283 */
284 assert(src[i].addr <= 47);
285 break;
286 case QFILE_VPM:
/* VPM reads are FIFO-ordered, so indices must be non-decreasing. */
287 assert((int)qinst->src[i].index >=
288 last_vpm_read_index);
289 (void)last_vpm_read_index;
290 last_vpm_read_index = qinst->src[i].index;
291 src[i] = qpu_ra(QPU_R_VPM);
292 break;
293 }
294 }
295
/* Pick the QPU write address for the destination. */
296 struct qpu_reg dst;
297 switch (qinst->dst.file) {
298 case QFILE_NULL:
299 dst = qpu_ra(QPU_W_NOP);
300 break;
301 case QFILE_TEMP:
302 dst = temp_registers[qinst->dst.index];
303 break;
304 case QFILE_VPM:
305 dst = qpu_ra(QPU_W_VPM);
306 break;
307 case QFILE_VARY:
308 case QFILE_UNIF:
309 case QFILE_SMALL_IMM:
310 assert(!"not reached");
311 break;
312 }
313
/* Tracks whether the case below applied qinst->cond itself; asserted
 * at the bottom of the loop so conditional qinsts can't be dropped
 * silently by ops that don't support conditions.
 */
314 bool handled_qinst_cond = false;
315
316 switch (qinst->op) {
317 case QOP_RCP:
318 case QOP_RSQ:
319 case QOP_EXP2:
320 case QOP_LOG2:
/* SFU ops are started by moving the operand into the matching SFU
 * write address; the result shows up in r4, so copy it out unless
 * dst was register-allocated to r4 already.
 */
321 switch (qinst->op) {
322 case QOP_RCP:
323 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
324 src[0]) | unpack);
325 break;
326 case QOP_RSQ:
327 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
328 src[0]) | unpack);
329 break;
330 case QOP_EXP2:
331 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
332 src[0]) | unpack);
333 break;
334 case QOP_LOG2:
335 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
336 src[0]) | unpack);
337 break;
338 default:
339 abort();
340 }
341
342 if (dst.mux != QPU_MUX_R4)
343 queue(c, qpu_a_MOV(dst, qpu_r4()));
344
345 break;
346
347 case QOP_FRAG_X:
348 queue(c, qpu_a_ITOF(dst,
349 qpu_ra(QPU_R_XY_PIXEL_COORD)));
350 break;
351
352 case QOP_FRAG_Y:
353 queue(c, qpu_a_ITOF(dst,
354 qpu_rb(QPU_R_XY_PIXEL_COORD)));
355 break;
356
357 case QOP_FRAG_REV_FLAG:
358 queue(c, qpu_a_ITOF(dst,
359 qpu_rb(QPU_R_MS_REV_FLAGS)));
360 break;
361
362 case QOP_MS_MASK:
363 src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
364 fixup_raddr_conflict(c, dst, &src[0], &src[1],
365 qinst, &unpack);
366 queue(c, qpu_a_AND(qpu_ra(QPU_W_MS_FLAGS),
367 src[0], src[1]) | unpack);
368 break;
369
370 case QOP_FRAG_Z:
371 case QOP_FRAG_W:
372 /* QOP_FRAG_Z/W don't emit instructions, just allocate
373 * the register to the Z/W payload.
374 */
375 break;
376
377 case QOP_TLB_STENCIL_SETUP:
378 assert(!unpack);
379 queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP),
380 src[0]) | unpack);
381 break;
382
383 case QOP_TLB_Z_WRITE:
384 queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z),
385 src[0]) | unpack);
386 set_last_cond_add(c, qinst->cond);
387 handled_qinst_cond = true;
388 break;
389
390 case QOP_TLB_COLOR_READ:
/* The color load is requested via a signal on a NOP; the loaded
 * value arrives in r4.
 */
391 queue(c, qpu_NOP());
392 *last_inst(c) = qpu_set_sig(*last_inst(c),
393 QPU_SIG_COLOR_LOAD);
394
395 if (dst.mux != QPU_MUX_R4)
396 queue(c, qpu_a_MOV(dst, qpu_r4()));
397 break;
398
399 case QOP_TLB_COLOR_WRITE:
400 queue(c, qpu_a_MOV(qpu_tlbc(), src[0]) | unpack);
401 set_last_cond_add(c, qinst->cond);
402 handled_qinst_cond = true;
403 break;
404
405 case QOP_TLB_COLOR_WRITE_MS:
406 queue(c, qpu_a_MOV(qpu_tlbc_ms(), src[0]));
407 set_last_cond_add(c, qinst->cond);
408 handled_qinst_cond = true;
409 break;
410
/* Varying interpolation: the queued FADD adds the per-varying C
 * coefficient (delivered in r5) to src[0].
 */
411 case QOP_VARY_ADD_C:
412 queue(c, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
413 break;
414
/* The TMU coordinate writes rely on QPU_W_TMU0_S/T/R/B being
 * consecutive write addresses in the same order as the QOP enum.
 */
415 case QOP_TEX_S:
416 case QOP_TEX_T:
417 case QOP_TEX_R:
418 case QOP_TEX_B:
419 queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
420 (qinst->op - QOP_TEX_S)),
421 src[0]) | unpack);
422 break;
423
424 case QOP_TEX_DIRECT:
425 fixup_raddr_conflict(c, dst, &src[0], &src[1],
426 qinst, &unpack);
427 queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S),
428 src[0], src[1]) | unpack);
429 break;
430
431 case QOP_TEX_RESULT:
432 queue(c, qpu_NOP());
433 *last_inst(c) = qpu_set_sig(*last_inst(c),
434 QPU_SIG_LOAD_TMU0);
435 if (dst.mux != QPU_MUX_R4)
436 queue(c, qpu_a_MOV(dst, qpu_r4()));
437 break;
438
439 default:
440 assert(qinst->op < ARRAY_SIZE(translate));
441 assert(translate[qinst->op].op != 0); /* NOPs */
442
443 /* Skip emitting the MOV if it's a no-op. */
444 if (qir_is_raw_mov(qinst) &&
445 dst.mux == src[0].mux && dst.addr == src[0].addr) {
446 break;
447 }
448
449 /* If we have only one source, put it in the second
450 * argument slot as well so that we don't take up
451 * another raddr just to get unused data.
452 */
453 if (qir_get_op_nsrc(qinst->op) == 1)
454 src[1] = src[0];
455
456 fixup_raddr_conflict(c, dst, &src[0], &src[1],
457 qinst, &unpack);
458
459 if (qir_is_mul(qinst)) {
460 queue(c, qpu_m_alu2(translate[qinst->op].op,
461 dst,
462 src[0], src[1]) | unpack);
463 set_last_cond_mul(c, qinst->cond);
464 } else {
465 queue(c, qpu_a_alu2(translate[qinst->op].op,
466 dst,
467 src[0], src[1]) | unpack);
468 set_last_cond_add(c, qinst->cond);
469 }
470 handled_qinst_cond = true;
471 set_last_dst_pack(c, qinst);
472
473 break;
474 }
475
476 assert(qinst->cond == QPU_COND_ALWAYS ||
477 handled_qinst_cond);
478
479 if (qinst->sf) {
480 assert(!qir_is_multi_instruction(qinst));
481 *last_inst(c) |= QPU_SF;
482 }
483 }
484
/* Schedule the queued list into c->qpu_insts; instructions appended
 * after this point go straight into the array via
 * qpu_serialize_one_inst() and are counted separately for the cycle
 * estimate below.
 */
485 uint32_t cycles = qpu_schedule_instructions(c);
486 uint32_t inst_count_at_schedule_time = c->qpu_inst_count;
487
/* The thread-end instruction (the one carrying QPU_SIG_PROG_END) has
 * hardware restrictions; pad with NOPs until the last instruction is
 * legal to end on.
 */
488 /* thread end can't have VPM write or read */
489 if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
490 QPU_WADDR_ADD) == QPU_W_VPM ||
491 QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
492 QPU_WADDR_MUL) == QPU_W_VPM ||
493 QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
494 QPU_RADDR_A) == QPU_R_VPM ||
495 QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
496 QPU_RADDR_B) == QPU_R_VPM) {
497 qpu_serialize_one_inst(c, qpu_NOP());
498 }
499
500 /* thread end can't have uniform read */
501 if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
502 QPU_RADDR_A) == QPU_R_UNIF ||
503 QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
504 QPU_RADDR_B) == QPU_R_UNIF) {
505 qpu_serialize_one_inst(c, qpu_NOP());
506 }
507
508 /* thread end can't have TLB operations */
509 if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
510 qpu_serialize_one_inst(c, qpu_NOP());
511
/* Tag the last instruction as program end, then append the two
 * trailing NOPs that follow it.
 */
512 c->qpu_insts[c->qpu_inst_count - 1] =
513 qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
514 QPU_SIG_PROG_END);
515 qpu_serialize_one_inst(c, qpu_NOP());
516 qpu_serialize_one_inst(c, qpu_NOP());
517
/* Fragment shaders must release the tile scoreboard on their final
 * instruction.
 */
518 switch (c->stage) {
519 case QSTAGE_VERT:
520 case QSTAGE_COORD:
521 break;
522 case QSTAGE_FRAG:
523 c->qpu_insts[c->qpu_inst_count - 1] =
524 qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
525 QPU_SIG_SCOREBOARD_UNLOCK);
526 break;
527 }
528
/* Count each post-schedule instruction as one extra estimated cycle. */
529 cycles += c->qpu_inst_count - inst_count_at_schedule_time;
530
531 if (vc4_debug & VC4_DEBUG_SHADERDB) {
532 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
533 qir_get_stage_name(c->stage),
534 c->program_id, c->variant_id,
535 cycles);
536 }
537
538 if (vc4_debug & VC4_DEBUG_QPU)
539 vc4_dump_program(c);
540
541 vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);
542
/* temp_registers came from vc4_register_allocate(); ownership is ours. */
543 free(temp_registers);
544 }