cc499d8ba09fc66bd95b71ee868f6c00e614fdf4
[mesa.git] / src / broadcom / compiler / vir_to_qpu.c
1 /*
2 * Copyright © 2016 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "compiler/v3d_compiler.h"
25 #include "qpu/qpu_instr.h"
26 #include "qpu/qpu_disasm.h"
27
28 static inline struct qpu_reg
29 qpu_reg(int index)
30 {
31 struct qpu_reg reg = {
32 .magic = false,
33 .index = index,
34 };
35 return reg;
36 }
37
38 static inline struct qpu_reg
39 qpu_magic(enum v3d_qpu_waddr waddr)
40 {
41 struct qpu_reg reg = {
42 .magic = true,
43 .index = waddr,
44 };
45 return reg;
46 }
47
48 static inline struct qpu_reg
49 qpu_acc(int acc)
50 {
51 return qpu_magic(V3D_QPU_WADDR_R0 + acc);
52 }
53
54 struct v3d_qpu_instr
55 v3d_qpu_nop(void)
56 {
57 struct v3d_qpu_instr instr = {
58 .type = V3D_QPU_INSTR_TYPE_ALU,
59 .alu = {
60 .add = {
61 .op = V3D_QPU_A_NOP,
62 .waddr = V3D_QPU_WADDR_NOP,
63 .magic_write = true,
64 },
65 .mul = {
66 .op = V3D_QPU_M_NOP,
67 .waddr = V3D_QPU_WADDR_NOP,
68 .magic_write = true,
69 },
70 }
71 };
72
73 return instr;
74 }
75
76 static struct qinst *
77 vir_nop(void)
78 {
79 struct qreg undef = vir_nop_reg();
80 struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);
81
82 return qinst;
83 }
84
85 static struct qinst *
86 new_qpu_nop_before(struct qinst *inst)
87 {
88 struct qinst *q = vir_nop();
89
90 list_addtail(&q->link, &inst->link);
91
92 return q;
93 }
94
95 static void
96 new_ldunif_instr(struct qinst *inst, int i)
97 {
98 struct qinst *ldunif = new_qpu_nop_before(inst);
99
100 ldunif->qpu.sig.ldunif = true;
101 assert(inst->src[i].file == QFILE_UNIF);
102 ldunif->uniform = inst->src[i].index;
103 }
104
/**
 * Allocates the src register (accumulator or register file) into the RADDR
 * fields of the instruction.
 *
 * A QPU ALU instruction has only two register-file read ports (raddr_a and
 * raddr_b), shared by all four operand muxes, so register-file sources must
 * be packed into whichever port is free or already holds the same register.
 */
static void
set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
{
        /* Small immediates are encoded in raddr_b and read through mux B;
         * the small_imm signal must already be set on the instruction.
         */
        if (src.smimm) {
                assert(instr->sig.small_imm);
                *mux = V3D_QPU_MUX_B;
                return;
        }

        /* Accumulators r0-r5 have dedicated mux encodings and don't consume
         * a register-file read port.
         */
        if (src.magic) {
                assert(src.index >= V3D_QPU_WADDR_R0 &&
                       src.index <= V3D_QPU_WADDR_R5);
                *mux = src.index - V3D_QPU_WADDR_R0 + V3D_QPU_MUX_R0;
                return;
        }

        /* Register-file source: use raddr_a if no operand has claimed mux A
         * yet, otherwise try to share or fall back to raddr_b.
         */
        if (instr->alu.add.a != V3D_QPU_MUX_A &&
            instr->alu.add.b != V3D_QPU_MUX_A &&
            instr->alu.mul.a != V3D_QPU_MUX_A &&
            instr->alu.mul.b != V3D_QPU_MUX_A) {
                instr->raddr_a = src.index;
                *mux = V3D_QPU_MUX_A;
        } else {
                /* raddr_a is in use; reuse it when it already holds the
                 * register we want to read.
                 */
                if (instr->raddr_a == src.index) {
                        *mux = V3D_QPU_MUX_A;
                } else {
                        /* Sanity check before clobbering raddr_b: either not
                         * every operand is reading through mux B, or raddr_b
                         * already holds this register.  NOTE(review): this
                         * doesn't catch a single conflicting mux-B user —
                         * presumably the register allocator guarantees that;
                         * confirm against the scheduler's conflict handling.
                         */
                        assert(!(instr->alu.add.a == V3D_QPU_MUX_B &&
                                 instr->alu.add.b == V3D_QPU_MUX_B &&
                                 instr->alu.mul.a == V3D_QPU_MUX_B &&
                                 instr->alu.mul.b == V3D_QPU_MUX_B) ||
                               src.index == instr->raddr_b);

                        instr->raddr_b = src.index;
                        *mux = V3D_QPU_MUX_B;
                }
        }
}
146
/* Returns true if @qinst is a MOV from a register to itself with no side
 * effects (no signals, packing, condition or flag updates), so it can be
 * deleted without changing program behavior.
 */
static bool
is_no_op_mov(struct qinst *qinst)
{
        static const struct v3d_qpu_sig no_sig = {0};

        /* Make sure it's just a lone MOV: a mul-MOV with a NOP add half and
         * no signal bits set anywhere in the sig struct.
         */
        if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
            qinst->qpu.alu.mul.op != V3D_QPU_M_MOV ||
            qinst->qpu.alu.add.op != V3D_QPU_A_NOP ||
            memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) {
                return false;
        }

        /* Check if it's a MOV from a register to itself. */
        enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
        if (qinst->qpu.alu.mul.magic_write) {
                /* Magic dest: only accumulators r0-r4 can also be read back
                 * through a source mux; any other magic waddr has side
                 * effects or no matching read path.
                 */
                if (waddr < V3D_QPU_WADDR_R0 || waddr > V3D_QPU_WADDR_R4)
                        return false;

                /* The source mux must name the same accumulator. */
                if (qinst->qpu.alu.mul.a !=
                    V3D_QPU_MUX_R0 + (waddr - V3D_QPU_WADDR_R0)) {
                        return false;
                }
        } else {
                int raddr;

                /* Register-file dest: resolve which raddr the source mux
                 * reads and compare it to the write address.
                 */
                switch (qinst->qpu.alu.mul.a) {
                case V3D_QPU_MUX_A:
                        raddr = qinst->qpu.raddr_a;
                        break;
                case V3D_QPU_MUX_B:
                        raddr = qinst->qpu.raddr_b;
                        break;
                default:
                        /* Accumulator source into a register-file dest is
                         * not a self-MOV.
                         */
                        return false;
                }
                if (raddr != waddr)
                        return false;
        }

        /* No packing or flags updates, or we need to execute the
         * instruction.
         */
        if (qinst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
            qinst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE ||
            qinst->qpu.flags.mc != V3D_QPU_COND_NONE ||
            qinst->qpu.flags.mpf != V3D_QPU_PF_NONE ||
            qinst->qpu.flags.muf != V3D_QPU_UF_NONE) {
                return false;
        }

        return true;
}
200
/**
 * Lowers one block's VIR instructions to QPU form in place: maps each
 * instruction's VIR src/dst register files onto physical QPU registers,
 * accumulators, or magic waddrs (using @temp_registers for QFILE_TEMP),
 * inserts ldunif/ldvpm signal instructions where a source needs one, and
 * deletes MOVs that become no-ops after allocation.
 */
static void
v3d_generate_code_block(struct v3d_compile *c,
                        struct qblock *block,
                        struct qpu_reg *temp_registers)
{
        /* Tracks the last VPM source index seen, only to assert that VPM
         * reads happen in non-decreasing order.
         */
        int last_vpm_read_index = -1;

        vir_for_each_inst_safe(qinst, block) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                vir_dump_inst(c, qinst);
                fprintf(stderr, "\n");
#endif

                struct qinst *temp;

                if (vir_has_uniform(qinst))
                        c->num_uniforms++;

                /* Resolve each VIR source into a physical QPU source. */
                int nsrc = vir_get_nsrc(qinst);
                struct qpu_reg src[ARRAY_SIZE(qinst->src)];
                bool emitted_ldunif = false;
                for (int i = 0; i < nsrc; i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_REG:
                                src[i] = qpu_reg(qinst->src[i].index);
                                break;
                        case QFILE_MAGIC:
                                src[i] = qpu_magic(qinst->src[i].index);
                                break;
                        case QFILE_NULL:
                        case QFILE_LOAD_IMM:
                                /* Source value is unused; r0 is a harmless
                                 * placeholder encoding.
                                 */
                                src[i] = qpu_acc(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                break;
                        case QFILE_UNIF:
                                /* XXX perf: If the last ldunif we emitted was
                                 * the same uniform value, skip it. Common
                                 * for multop/umul24 sequences.
                                 */
                                if (!emitted_ldunif) {
                                        new_ldunif_instr(qinst, i);
                                        c->num_uniforms++;
                                        emitted_ldunif = true;
                                }

                                /* The ldunif result is read from r5. */
                                src[i] = qpu_acc(5);
                                break;
                        case QFILE_SMALL_IMM:
                                /* Only smimm is consulted downstream
                                 * (set_src returns early on it), so the
                                 * other qpu_reg fields are left as-is.
                                 */
                                src[i].smimm = true;
                                break;

                        case QFILE_VPM:
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;

                                /* Insert an ldvpm-signaled NOP; its result
                                 * is read from r3.
                                 */
                                temp = new_qpu_nop_before(qinst);
                                temp->qpu.sig.ldvpm = true;

                                src[i] = qpu_acc(3);
                                break;

                        case QFILE_TLB:
                        case QFILE_TLBU:
                                unreachable("bad vir src file");
                        }
                }

                /* Resolve the VIR destination into a physical QPU waddr. */
                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_magic(V3D_QPU_WADDR_NOP);
                        break;

                case QFILE_REG:
                        dst = qpu_reg(qinst->dst.index);
                        break;

                case QFILE_MAGIC:
                        dst = qpu_magic(qinst->dst.index);
                        break;

                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;

                case QFILE_VPM:
                        dst = qpu_magic(V3D_QPU_WADDR_VPM);
                        break;

                case QFILE_TLB:
                        dst = qpu_magic(V3D_QPU_WADDR_TLB);
                        break;

                case QFILE_TLBU:
                        dst = qpu_magic(V3D_QPU_WADDR_TLBU);
                        break;

                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                case QFILE_LOAD_IMM:
                        assert(!"not reached");
                        break;
                }

                if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
                        if (qinst->qpu.sig.ldunif) {
                                assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP);
                                assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);

                                /* ldunif writes r5 implicitly; to land the
                                 * value anywhere else we must switch to
                                 * ldunifrf, which only exists on v4.x.
                                 */
                                if (!dst.magic ||
                                    dst.index != V3D_QPU_WADDR_R5) {
                                        assert(c->devinfo->ver >= 40);

                                        qinst->qpu.sig.ldunif = false;
                                        qinst->qpu.sig.ldunifrf = true;
                                        qinst->qpu.sig_addr = dst.index;
                                        qinst->qpu.sig_magic = dst.magic;
                                }
                        } else if (v3d_qpu_sig_writes_address(c->devinfo,
                                                              &qinst->qpu.sig)) {
                                assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP);
                                assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);

                                /* Signal-driven loads write through the
                                 * instruction's sig_addr/sig_magic fields.
                                 */
                                qinst->qpu.sig_addr = dst.index;
                                qinst->qpu.sig_magic = dst.magic;
                        } else if (qinst->qpu.alu.add.op != V3D_QPU_A_NOP) {
                                /* Add-half op: patch its muxes and waddr. */
                                assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
                                if (nsrc >= 1) {
                                        set_src(&qinst->qpu,
                                                &qinst->qpu.alu.add.a, src[0]);
                                }
                                if (nsrc >= 2) {
                                        set_src(&qinst->qpu,
                                                &qinst->qpu.alu.add.b, src[1]);
                                }

                                qinst->qpu.alu.add.waddr = dst.index;
                                qinst->qpu.alu.add.magic_write = dst.magic;
                        } else {
                                /* Mul-half op (or full NOP): patch its muxes
                                 * and waddr.
                                 */
                                if (nsrc >= 1) {
                                        set_src(&qinst->qpu,
                                                &qinst->qpu.alu.mul.a, src[0]);
                                }
                                if (nsrc >= 2) {
                                        set_src(&qinst->qpu,
                                                &qinst->qpu.alu.mul.b, src[1]);
                                }

                                qinst->qpu.alu.mul.waddr = dst.index;
                                qinst->qpu.alu.mul.magic_write = dst.magic;

                                /* After allocation a MOV may have become a
                                 * self-copy; drop it.
                                 */
                                if (is_no_op_mov(qinst)) {
                                        vir_remove_instruction(c, qinst);
                                        continue;
                                }
                        }
                } else {
                        assert(qinst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
                }
        }
}
368
369 static bool
370 reads_uniform(const struct v3d_device_info *devinfo, uint64_t instruction)
371 {
372 struct v3d_qpu_instr qpu;
373 MAYBE_UNUSED bool ok = v3d_qpu_instr_unpack(devinfo, instruction, &qpu);
374 assert(ok);
375
376 if (qpu.sig.ldunif ||
377 qpu.sig.ldunifrf ||
378 qpu.sig.wrtmuc) {
379 return true;
380 }
381
382 if (qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
383 return true;
384
385 if (qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
386 if (qpu.alu.add.magic_write &&
387 v3d_qpu_magic_waddr_loads_unif(qpu.alu.add.waddr)) {
388 return true;
389 }
390
391 if (qpu.alu.mul.magic_write &&
392 v3d_qpu_magic_waddr_loads_unif(qpu.alu.mul.waddr)) {
393 return true;
394 }
395 }
396
397 return false;
398 }
399
400 static void
401 v3d_dump_qpu(struct v3d_compile *c)
402 {
403 fprintf(stderr, "%s prog %d/%d QPU:\n",
404 vir_get_stage_name(c),
405 c->program_id, c->variant_id);
406
407 int next_uniform = 0;
408 for (int i = 0; i < c->qpu_inst_count; i++) {
409 const char *str = v3d_qpu_disasm(c->devinfo, c->qpu_insts[i]);
410 fprintf(stderr, "0x%016"PRIx64" %s", c->qpu_insts[i], str);
411
412 /* We can only do this on 4.x, because we're not tracking TMU
413 * implicit uniforms here on 3.x.
414 */
415 if (c->devinfo->ver >= 40 &&
416 reads_uniform(c->devinfo, c->qpu_insts[i])) {
417 fprintf(stderr, " (");
418 vir_dump_uniform(c->uniform_contents[next_uniform],
419 c->uniform_data[next_uniform]);
420 fprintf(stderr, ")");
421 next_uniform++;
422 }
423 fprintf(stderr, "\n");
424 ralloc_free((void *)str);
425 }
426
427 /* Make sure our dumping lined up. */
428 if (c->devinfo->ver >= 40)
429 assert(next_uniform == c->num_uniforms);
430
431 fprintf(stderr, "\n");
432 }
433
/**
 * Final backend stage: lowers every VIR block to QPU registers, schedules
 * the instructions, and packs them into the 64-bit encodings stored in
 * c->qpu_insts.  On a pack failure, sets c->failed and returns early.
 * Takes ownership of @temp_registers and frees it.
 */
void
v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers)
{
        /* Reset the uniform count to how many will be actually loaded by the
         * generated QPU code.
         */
        c->num_uniforms = 0;

        vir_for_each_block(block, c)
                v3d_generate_code_block(c, block, temp_registers);

        /* Scheduling sets c->qpu_inst_count and returns a cycle estimate. */
        uint32_t cycles = v3d_qpu_schedule_instructions(c);

        c->qpu_insts = rzalloc_array(c, uint64_t, c->qpu_inst_count);
        int i = 0;
        vir_for_each_inst_inorder(inst, c) {
                bool ok = v3d_qpu_instr_pack(c->devinfo, &inst->qpu,
                                             &c->qpu_insts[i++]);
                if (!ok) {
                        fprintf(stderr, "Failed to pack instruction:\n");
                        vir_dump_inst(c, inst);
                        fprintf(stderr, "\n");
                        c->failed = true;
                        return;
                }
        }
        assert(i == c->qpu_inst_count);

        if (V3D_DEBUG & V3D_DEBUG_SHADERDB) {
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d instructions\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id,
                        c->qpu_inst_count);
        }

        /* The QPU cycle estimates are pretty broken (see waddr_latency()), so
         * don't report them for now.
         */
        if (false) {
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id,
                        cycles);
        }

        if (V3D_DEBUG & (V3D_DEBUG_QPU |
                         v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
                v3d_dump_qpu(c);
        }

        /* Cross-check the generated code against the QPU's encoding rules. */
        qpu_validate(c);

        free(temp_registers);
}