pan/bi: Squash LD_ATTR ops together
[mesa.git] / src / panfrost / bifrost / disassemble.c
1 /*
2 * Copyright (C) 2019 Connor Abbott <cwabbott0@gmail.com>
3 * Copyright (C) 2019 Lyude Paul <thatslyude@gmail.com>
4 * Copyright (C) 2019 Ryan Houdek <Sonicadvance1@gmail.com>
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 * SOFTWARE.
24 */
25
26 #include <stdbool.h>
27 #include <stdio.h>
28 #include <stdint.h>
29 #include <assert.h>
30 #include <inttypes.h>
31 #include <string.h>
32
33 #include "bifrost.h"
34 #include "bifrost_ops.h"
35 #include "disassemble.h"
36 #include "util/macros.h"
37
/* Extract the bit field [lo, high) of a 32-bit word.
 *
 * Bit `lo` of `word` becomes bit 0 of the result, and bit `high` is the first
 * bit that is NOT part of the field. Passing high == 32 takes every bit from
 * `lo` upwards. Callers must keep lo < 32 and high <= 32: larger shift counts
 * are undefined in C.
 */
static uint64_t bits(uint32_t word, unsigned lo, unsigned high)
{
        /* Shifting a 32-bit value by 32 is undefined, so the "no upper
         * bound" case must not go through the mask computation below. */
        if (high == 32)
                return word >> lo;

        /* Build the mask from an unsigned constant: with a plain int,
         * 1 << 31 overflows the signed type. */
        const uint32_t mask = (UINT32_C(1) << high) - 1;

        return (word & mask) >> lo;
}
45
/* One instruction as dispatched in a single cycle.
 *
 * Instructions are packed in unusual ways inside a clause, so they are
 * unpacked into this separate struct before being disassembled.
 */
struct bifrost_alu_inst {
        uint32_t fma_bits; /* raw encoding for the FMA slot */
        uint32_t add_bits; /* raw encoding for the ADD slot */
        uint64_t reg_bits; /* raw encoding of the register block */
};
54
/* Packed register block of an instruction. The raw reg0/reg1 fields are not
 * register numbers by themselves; use get_reg0()/get_reg1() to decode them,
 * and DecodeRegCtrl() to interpret ctrl.
 */
struct bifrost_regs {
        unsigned uniform_const : 8; /* uniform/constant selector */
        unsigned reg2 : 6;          /* register on port 2 */
        unsigned reg3 : 6;          /* register on port 3 */
        unsigned reg0 : 5;          /* encoded port 0 register */
        unsigned reg1 : 6;          /* encoded port 1 register */
        unsigned ctrl : 4;          /* port control mode (0 = mode lives in reg1) */
};
63
64 static unsigned get_reg0(struct bifrost_regs regs)
65 {
66 if (regs.ctrl == 0)
67 return regs.reg0 | ((regs.reg1 & 0x1) << 5);
68
69 return regs.reg0 <= regs.reg1 ? regs.reg0 : 63 - regs.reg0;
70 }
71
72 static unsigned get_reg1(struct bifrost_regs regs)
73 {
74 return regs.reg0 <= regs.reg1 ? regs.reg1 : 63 - regs.reg1;
75 }
76
/* Which register field, if any, a unit's result is written through. */
enum bifrost_reg_write_unit {
        REG_WRITE_NONE  = 0, /* result is not written to a register */
        REG_WRITE_TWO   = 1, /* write using reg2 */
        REG_WRITE_THREE = 2, /* write using reg3 */
};
82
83 // this represents the decoded version of the ctrl register field.
84 struct bifrost_reg_ctrl {
85 bool read_reg0;
86 bool read_reg1;
87 bool read_reg3;
88 enum bifrost_reg_write_unit fma_write_unit;
89 enum bifrost_reg_write_unit add_write_unit;
90 bool clause_start;
91 };
92
/* Operand/modifier layout of an FMA-slot instruction. It selects both the
 * opcode mask used when looking an instruction up and the way its operands
 * are printed.
 */
enum fma_src_type {
        FMA_ONE_SRC     = 0,
        FMA_TWO_SRC     = 1,
        FMA_FADD        = 2,
        FMA_FMINMAX     = 3,
        FMA_FADD16      = 4,
        FMA_FMINMAX16   = 5,
        FMA_FCMP        = 6,
        FMA_FCMP16      = 7,
        FMA_THREE_SRC   = 8,
        FMA_SHIFT       = 9,
        FMA_FMA         = 10,
        FMA_FMA16       = 11,
        FMA_CSEL4       = 12,
        FMA_FMA_MSCALE  = 13,
        FMA_SHIFT_ADD64 = 14,
};
110
111 struct fma_op_info {
112 bool extended;
113 unsigned op;
114 char name[30];
115 enum fma_src_type src_type;
116 };
117
/* Operand/modifier layout of an ADD-slot instruction. */
enum add_src_type {
        ADD_ONE_SRC         = 0,
        ADD_TWO_SRC         = 1,
        ADD_FADD            = 2,
        ADD_FMINMAX         = 3,
        ADD_FADD16          = 4,
        ADD_FMINMAX16       = 5,
        ADD_THREE_SRC       = 6,
        ADD_SHIFT           = 7,
        ADD_FADDMscale      = 8,
        ADD_FCMP            = 9,
        ADD_FCMP16          = 10,
        ADD_TEX_COMPACT     = 11, /* texture instruction with embedded sampler */
        ADD_TEX             = 12, /* texture instruction with sampler/etc. in uniform port */
        ADD_VARYING_INTERP  = 13,
        ADD_BLENDING        = 14,
        ADD_LOAD_ATTR       = 15,
        ADD_VARYING_ADDRESS = 16,
        ADD_BRANCH          = 17,
};
138
139 struct add_op_info {
140 unsigned op;
141 char name[30];
142 enum add_src_type src_type;
143 bool has_data_reg;
144 };
145
/* Bit layout of the control word of a texture instruction. */
struct bifrost_tex_ctrl {
        unsigned sampler_index : 4; /* also used to signal indirects */
        unsigned tex_index : 7;
        bool no_merge_index : 1;    /* whether to merge (direct) sampler & texture indices */
        bool filter : 1;            /* use the usual filtering pipeline
                                     * (0 for texelFetch & textureGather) */
        unsigned unk0 : 2;
        bool texel_offset : 1;      /* *Offset() */
        bool is_shadow : 1;
        bool is_array : 1;
        unsigned tex_type : 2;      /* 2D, 3D, Cube, Buffer */
        bool compute_lod : 1;       /* 0 for *Lod() */
        bool not_supply_lod : 1;    /* 0 for *Lod() or when a bias is applied */
        bool calc_gradients : 1;    /* 0 for *Grad() */
        unsigned unk1 : 1;
        unsigned result_type : 4;   /* integer, unsigned, float
                                     * TODO: why is this 4 bits? */
        unsigned unk2 : 4;
};
163
/* Bit layout of the control word carrying two sampler/texture index pairs. */
struct bifrost_dual_tex_ctrl {
        unsigned sampler_index0 : 2; /* first sampler index */
        unsigned unk0 : 2;
        unsigned tex_index0 : 2;     /* first texture index */
        unsigned sampler_index1 : 2; /* second sampler index */
        unsigned tex_index1 : 2;     /* second texture index */
        unsigned unk1 : 22;
};
172
/* Operand size / component selection of a branch comparison. */
enum branch_bit_size {
        BR_SIZE_32 = 0,
        BR_SIZE_16XX = 1,
        BR_SIZE_16YY = 2,

        /* The three sizes above encode one extra bit through the relative
         * order of the two sources. That could only be ambiguous when both
         * sources are the same, and then the branch condition is always
         * true or always false anyway, so the case can be ignored. The trick
         * breaks down when a y component is compared with an x component,
         * because comparing the y component of a source against its own x
         * component is valid. For that case the extra bit is carried by
         * having two separate size values instead.
         */
        BR_SIZE_16YX0 = 3,
        BR_SIZE_16YX1 = 4,
        BR_SIZE_32_AND_16X = 5,
        BR_SIZE_32_AND_16Y = 6,

        /* Comparisons with zero and always-true branches. Believed to work
         * for integer comparisons only.
         */
        BR_SIZE_ZERO = 7,
};
192
/* Prototypes for the disassembler functions in this file that are not
 * declared static.
 */

/* Print a one-line, human-readable summary of a clause header. */
void dump_header(FILE *fp, struct bifrost_header header, bool verbose);

void dump_instr(FILE *fp,
                const struct bifrost_alu_inst *instr,
                struct bifrost_regs next_regs,
                uint64_t *consts,
                unsigned data_reg,
                unsigned offset,
                bool verbose);

bool dump_clause(FILE *fp,
                 uint32_t *words,
                 unsigned *size,
                 unsigned offset,
                 bool verbose);
198
199 void dump_header(FILE *fp, struct bifrost_header header, bool verbose)
200 {
201 if (header.clause_type != 0) {
202 fprintf(fp, "id(%du) ", header.scoreboard_index);
203 }
204
205 if (header.scoreboard_deps != 0) {
206 fprintf(fp, "next-wait(");
207 bool first = true;
208 for (unsigned i = 0; i < 8; i++) {
209 if (header.scoreboard_deps & (1 << i)) {
210 if (!first) {
211 fprintf(fp, ", ");
212 }
213 fprintf(fp, "%d", i);
214 first = false;
215 }
216 }
217 fprintf(fp, ") ");
218 }
219
220 if (header.datareg_writebarrier)
221 fprintf(fp, "data-reg-barrier ");
222
223 if (!header.no_end_of_shader)
224 fprintf(fp, "eos ");
225
226 if (!header.back_to_back) {
227 fprintf(fp, "nbb ");
228 if (header.branch_cond)
229 fprintf(fp, "branch-cond ");
230 else
231 fprintf(fp, "branch-uncond ");
232 }
233
234 if (header.elide_writes)
235 fprintf(fp, "we ");
236
237 if (header.suppress_inf)
238 fprintf(fp, "suppress-inf ");
239 if (header.suppress_nan)
240 fprintf(fp, "suppress-nan ");
241
242 if (header.unk0)
243 fprintf(fp, "unk0 ");
244 if (header.unk1)
245 fprintf(fp, "unk1 ");
246 if (header.unk2)
247 fprintf(fp, "unk2 ");
248 if (header.unk3)
249 fprintf(fp, "unk3 ");
250 if (header.unk4)
251 fprintf(fp, "unk4 ");
252
253 fprintf(fp, "\n");
254
255 if (verbose) {
256 fprintf(fp, "# clause type %d, next clause type %d\n",
257 header.clause_type, header.next_clause_type);
258 }
259 }
260
261 static struct bifrost_reg_ctrl DecodeRegCtrl(FILE *fp, struct bifrost_regs regs)
262 {
263 struct bifrost_reg_ctrl decoded = {};
264 unsigned ctrl;
265 if (regs.ctrl == 0) {
266 ctrl = regs.reg1 >> 2;
267 decoded.read_reg0 = !(regs.reg1 & 0x2);
268 decoded.read_reg1 = false;
269 } else {
270 ctrl = regs.ctrl;
271 decoded.read_reg0 = decoded.read_reg1 = true;
272 }
273 switch (ctrl) {
274 case 1:
275 decoded.fma_write_unit = REG_WRITE_TWO;
276 break;
277 case 2:
278 case 3:
279 decoded.fma_write_unit = REG_WRITE_TWO;
280 decoded.read_reg3 = true;
281 break;
282 case 4:
283 decoded.read_reg3 = true;
284 break;
285 case 5:
286 decoded.add_write_unit = REG_WRITE_TWO;
287 break;
288 case 6:
289 decoded.add_write_unit = REG_WRITE_TWO;
290 decoded.read_reg3 = true;
291 break;
292 case 8:
293 decoded.clause_start = true;
294 break;
295 case 9:
296 decoded.fma_write_unit = REG_WRITE_TWO;
297 decoded.clause_start = true;
298 break;
299 case 11:
300 break;
301 case 12:
302 decoded.read_reg3 = true;
303 decoded.clause_start = true;
304 break;
305 case 13:
306 decoded.add_write_unit = REG_WRITE_TWO;
307 decoded.clause_start = true;
308 break;
309
310 case 7:
311 case 15:
312 decoded.fma_write_unit = REG_WRITE_THREE;
313 decoded.add_write_unit = REG_WRITE_TWO;
314 break;
315 default:
316 fprintf(fp, "# unknown reg ctrl %d\n", ctrl);
317 }
318
319 return decoded;
320 }
321
322 // Pass in the add_write_unit or fma_write_unit, and this returns which register
323 // the ADD/FMA units are writing to
324 static unsigned GetRegToWrite(enum bifrost_reg_write_unit unit, struct bifrost_regs regs)
325 {
326 switch (unit) {
327 case REG_WRITE_TWO:
328 return regs.reg2;
329 case REG_WRITE_THREE:
330 return regs.reg3;
331 default: /* REG_WRITE_NONE */
332 assert(0);
333 return 0;
334 }
335 }
336
337 static void dump_regs(FILE *fp, struct bifrost_regs srcs)
338 {
339 struct bifrost_reg_ctrl ctrl = DecodeRegCtrl(fp, srcs);
340 fprintf(fp, "# ");
341 if (ctrl.read_reg0)
342 fprintf(fp, "port 0: R%d ", get_reg0(srcs));
343 if (ctrl.read_reg1)
344 fprintf(fp, "port 1: R%d ", get_reg1(srcs));
345
346 if (ctrl.fma_write_unit == REG_WRITE_TWO)
347 fprintf(fp, "port 2: R%d (write FMA) ", srcs.reg2);
348 else if (ctrl.add_write_unit == REG_WRITE_TWO)
349 fprintf(fp, "port 2: R%d (write ADD) ", srcs.reg2);
350
351 if (ctrl.fma_write_unit == REG_WRITE_THREE)
352 fprintf(fp, "port 3: R%d (write FMA) ", srcs.reg3);
353 else if (ctrl.add_write_unit == REG_WRITE_THREE)
354 fprintf(fp, "port 3: R%d (write ADD) ", srcs.reg3);
355 else if (ctrl.read_reg3)
356 fprintf(fp, "port 3: R%d (read) ", srcs.reg3);
357
358 if (srcs.uniform_const) {
359 if (srcs.uniform_const & 0x80) {
360 fprintf(fp, "uniform: U%d", (srcs.uniform_const & 0x7f) * 2);
361 }
362 }
363
364 fprintf(fp, "\n");
365 }
/* Print a 32-bit immediate as zero-padded hex, followed by a C-style
 * comment holding the same bit pattern interpreted as a float.
 */
static void dump_const_imm(FILE *fp, uint32_t imm)
{
        /* Reinterpret the bits through a union; no value conversion. */
        const union {
                uint32_t raw;
                float as_float;
        } pun = { .raw = imm };

        fprintf(fp, "0x%08x /* %f */", imm, pun.as_float);
}
375
376 static uint64_t get_const(uint64_t *consts, struct bifrost_regs srcs)
377 {
378 unsigned low_bits = srcs.uniform_const & 0xf;
379 uint64_t imm;
380 switch (srcs.uniform_const >> 4) {
381 case 4:
382 imm = consts[0];
383 break;
384 case 5:
385 imm = consts[1];
386 break;
387 case 6:
388 imm = consts[2];
389 break;
390 case 7:
391 imm = consts[3];
392 break;
393 case 2:
394 imm = consts[4];
395 break;
396 case 3:
397 imm = consts[5];
398 break;
399 default:
400 assert(0);
401 break;
402 }
403 return imm | low_bits;
404 }
405
406 static void dump_uniform_const_src(FILE *fp, struct bifrost_regs srcs, uint64_t *consts, bool high32)
407 {
408 if (srcs.uniform_const & 0x80) {
409 unsigned uniform = (srcs.uniform_const & 0x7f) * 2;
410 fprintf(fp, "U%d", uniform + (high32 ? 1 : 0));
411 } else if (srcs.uniform_const >= 0x20) {
412 uint64_t imm = get_const(consts, srcs);
413 if (high32)
414 dump_const_imm(fp, imm >> 32);
415 else
416 dump_const_imm(fp, imm);
417 } else {
418 switch (srcs.uniform_const) {
419 case 0:
420 fprintf(fp, "0");
421 break;
422 case 5:
423 fprintf(fp, "atest-data");
424 break;
425 case 6:
426 fprintf(fp, "sample-ptr");
427 break;
428 case 8:
429 case 9:
430 case 10:
431 case 11:
432 case 12:
433 case 13:
434 case 14:
435 case 15:
436 fprintf(fp, "blend-descriptor%u", (unsigned) srcs.uniform_const - 8);
437 break;
438 default:
439 fprintf(fp, "unkConst%u", (unsigned) srcs.uniform_const);
440 break;
441 }
442
443 if (high32)
444 fprintf(fp, ".y");
445 else
446 fprintf(fp, ".x");
447 }
448 }
449
450 static void dump_src(FILE *fp, unsigned src, struct bifrost_regs srcs, uint64_t *consts, bool isFMA)
451 {
452 switch (src) {
453 case 0:
454 fprintf(fp, "R%d", get_reg0(srcs));
455 break;
456 case 1:
457 fprintf(fp, "R%d", get_reg1(srcs));
458 break;
459 case 2:
460 fprintf(fp, "R%d", srcs.reg3);
461 break;
462 case 3:
463 if (isFMA)
464 fprintf(fp, "0");
465 else
466 fprintf(fp, "T"); // i.e. the output of FMA this cycle
467 break;
468 case 4:
469 dump_uniform_const_src(fp, srcs, consts, false);
470 break;
471 case 5:
472 dump_uniform_const_src(fp, srcs, consts, true);
473 break;
474 case 6:
475 fprintf(fp, "T0");
476 break;
477 case 7:
478 fprintf(fp, "T1");
479 break;
480 }
481 }
482
/* Print the suffix for an output modifier. Modifier 0 and any value above
 * 3 print nothing.
 */
static void dump_output_mod(FILE *fp, unsigned mod)
{
        static const char *const suffixes[] = {
                NULL,            /* 0: result is left unmodified */
                ".clamp_0_inf",  /* 1: max(out, 0) */
                ".clamp_m1_1",   /* 2: clamp(out, -1, 1) */
                ".clamp_0_1",    /* 3: clamp(out, 0, 1) */
        };

        if (mod < sizeof(suffixes) / sizeof(suffixes[0]) && suffixes[mod] != NULL)
                fputs(suffixes[mod], fp);
}
501
/* Print the suffix for the NaN / signed-zero handling mode of a min or max
 * instruction. Mode 0 and any value above 3 print nothing.
 */
static void dump_minmax_mode(FILE *fp, unsigned mod)
{
        static const char *const suffixes[] = {
                /* 0: behaves like fmax()/fmin(). If either input is NaN
                 * the other one is returned, and +0 is always returned
                 * when the inputs are +0 and -0.
                 */
                NULL,

                /* 1: a NaN input always produces a NaN result rather than
                 * never producing one. The "greater"/"lesser" NaN is
                 * picked, looking at the sign first and the mantissa bits
                 * second.
                 */
                ".nan_wins",

                /* 2: max is src0 > src1 ? src0 : src1,
                 *    min is src0 < src1 ? src0 : src1.
                 *
                 * NaNs and signed zeros are therefore treated differently
                 * from mode 0: +0 and -0 compare equal, and every
                 * comparison involving a NaN is false. This mode is *not*
                 * commutative.
                 */
                ".src1_wins",

                /* 3: max is src0 < src1 ? src1 : src0,
                 *    min is src0 > src1 ? src1 : src0.
                 */
                ".src0_wins",
        };

        if (mod < sizeof(suffixes) / sizeof(suffixes[0]) && suffixes[mod] != NULL)
                fputs(suffixes[mod], fp);
}
539
/* Print the suffix for a rounding mode. Mode 0 and any value above 3 print
 * nothing.
 */
static void dump_round_mode(FILE *fp, unsigned mod)
{
        static const char *const suffixes[] = {
                NULL,           /* 0: roundTiesToEven, the IEEE default */
                ".round_pos",   /* 1: roundTowardPositive in the IEEE spec */
                ".round_neg",   /* 2: roundTowardNegative in the IEEE spec */
                ".round_zero",  /* 3: roundTowardZero in the IEEE spec */
        };

        if (mod < sizeof(suffixes) / sizeof(suffixes[0]) && suffixes[mod] != NULL)
                fputs(suffixes[mod], fp);
}
562
563 static const char *
564 csel_cond_name(enum bifrost_csel_cond cond)
565 {
566 switch (cond) {
567 case BIFROST_FEQ_F: return "feq.f";
568 case BIFROST_FGT_F: return "fgt.f";
569 case BIFROST_FGE_F: return "fge.f";
570 case BIFROST_IEQ_F: return "ieq.f";
571 case BIFROST_IGT_I: return "igt.i";
572 case BIFROST_IGE_I: return "uge.i";
573 case BIFROST_UGT_I: return "ugt.i";
574 case BIFROST_UGE_I: return "uge.i";
575 default: return "invalid";
576 }
577 }
578
579 static const struct fma_op_info FMAOpInfos[] = {
580 { false, 0x00000, "FMA.f32", FMA_FMA },
581 { false, 0x40000, "MAX.f32", FMA_FMINMAX },
582 { false, 0x44000, "MIN.f32", FMA_FMINMAX },
583 { false, 0x48000, "FCMP.GL", FMA_FCMP },
584 { false, 0x4c000, "FCMP.D3D", FMA_FCMP },
585 { false, 0x4ff98, "ADD.i32", FMA_TWO_SRC },
586 { false, 0x4ffd8, "SUB.i32", FMA_TWO_SRC },
587 { false, 0x4fff0, "SUBB.i32", FMA_TWO_SRC },
588 { false, 0x50000, "FMA_MSCALE", FMA_FMA_MSCALE },
589 { false, 0x58000, "ADD.f32", FMA_FADD },
590 { false, 0x5c000, "CSEL4", FMA_CSEL4 },
591 { false, 0x5d8d0, "ICMP.D3D.GT.v2i16", FMA_TWO_SRC },
592 { false, 0x5d9d0, "UCMP.D3D.GT.v2i16", FMA_TWO_SRC },
593 { false, 0x5dad0, "ICMP.D3D.GE.v2i16", FMA_TWO_SRC },
594 { false, 0x5dbd0, "UCMP.D3D.GE.v2i16", FMA_TWO_SRC },
595 { false, 0x5dcd0, "ICMP.D3D.EQ.v2i16", FMA_TWO_SRC },
596 { false, 0x5de40, "ICMP.GL.GT.i32", FMA_TWO_SRC }, // src0 > src1 ? 1 : 0
597 { false, 0x5de48, "ICMP.GL.GE.i32", FMA_TWO_SRC },
598 { false, 0x5de50, "UCMP.GL.GT.i32", FMA_TWO_SRC },
599 { false, 0x5de58, "UCMP.GL.GE.i32", FMA_TWO_SRC },
600 { false, 0x5de60, "ICMP.GL.EQ.i32", FMA_TWO_SRC },
601 { false, 0x5dec0, "ICMP.D3D.GT.i32", FMA_TWO_SRC }, // src0 > src1 ? ~0 : 0
602 { false, 0x5dec8, "ICMP.D3D.GE.i32", FMA_TWO_SRC },
603 { false, 0x5ded0, "UCMP.D3D.GT.i32", FMA_TWO_SRC },
604 { false, 0x5ded8, "UCMP.D3D.GE.i32", FMA_TWO_SRC },
605 { false, 0x5dee0, "ICMP.D3D.EQ.i32", FMA_TWO_SRC },
606 { false, 0x60000, "RSHIFT_NAND", FMA_SHIFT },
607 { false, 0x61000, "RSHIFT_AND", FMA_SHIFT },
608 { false, 0x62000, "LSHIFT_NAND", FMA_SHIFT },
609 { false, 0x63000, "LSHIFT_AND", FMA_SHIFT }, // (src0 << src2) & src1
610 { false, 0x64000, "RSHIFT_XOR", FMA_SHIFT },
611 { false, 0x65200, "LSHIFT_ADD.i32", FMA_THREE_SRC },
612 { false, 0x65600, "LSHIFT_SUB.i32", FMA_THREE_SRC }, // (src0 << src2) - src1
613 { false, 0x65a00, "LSHIFT_RSUB.i32", FMA_THREE_SRC }, // src1 - (src0 << src2)
614 { false, 0x65e00, "RSHIFT_ADD.i32", FMA_THREE_SRC },
615 { false, 0x66200, "RSHIFT_SUB.i32", FMA_THREE_SRC },
616 { false, 0x66600, "RSHIFT_RSUB.i32", FMA_THREE_SRC },
617 { false, 0x66a00, "ARSHIFT_ADD.i32", FMA_THREE_SRC },
618 { false, 0x66e00, "ARSHIFT_SUB.i32", FMA_THREE_SRC },
619 { false, 0x67200, "ARSHIFT_RSUB.i32", FMA_THREE_SRC },
620 { false, 0x80000, "FMA.v2f16", FMA_FMA16 },
621 { false, 0xc0000, "MAX.v2f16", FMA_FMINMAX16 },
622 { false, 0xc4000, "MIN.v2f16", FMA_FMINMAX16 },
623 { false, 0xc8000, "FCMP.GL", FMA_FCMP16 },
624 { false, 0xcc000, "FCMP.D3D", FMA_FCMP16 },
625 { false, 0xcf900, "ADD.v2i16", FMA_TWO_SRC },
626 { false, 0xcfc10, "ADDC.i32", FMA_TWO_SRC },
627 { false, 0xcfd80, "ADD.i32.i16.X", FMA_TWO_SRC },
628 { false, 0xcfd90, "ADD.i32.u16.X", FMA_TWO_SRC },
629 { false, 0xcfdc0, "ADD.i32.i16.Y", FMA_TWO_SRC },
630 { false, 0xcfdd0, "ADD.i32.u16.Y", FMA_TWO_SRC },
631 { false, 0xd8000, "ADD.v2f16", FMA_FADD16 },
632 { false, 0xdc000, "CSEL4.v16", FMA_CSEL4 },
633 { false, 0xdd000, "F32_TO_F16", FMA_TWO_SRC },
634 { true, 0x00046, "F16_TO_I16.XX", FMA_ONE_SRC },
635 { true, 0x00047, "F16_TO_U16.XX", FMA_ONE_SRC },
636 { true, 0x0004e, "F16_TO_I16.YX", FMA_ONE_SRC },
637 { true, 0x0004f, "F16_TO_U16.YX", FMA_ONE_SRC },
638 { true, 0x00056, "F16_TO_I16.XY", FMA_ONE_SRC },
639 { true, 0x00057, "F16_TO_U16.XY", FMA_ONE_SRC },
640 { true, 0x0005e, "F16_TO_I16.YY", FMA_ONE_SRC },
641 { true, 0x0005f, "F16_TO_U16.YY", FMA_ONE_SRC },
642 { true, 0x000c0, "I16_TO_F16.XX", FMA_ONE_SRC },
643 { true, 0x000c1, "U16_TO_F16.XX", FMA_ONE_SRC },
644 { true, 0x000c8, "I16_TO_F16.YX", FMA_ONE_SRC },
645 { true, 0x000c9, "U16_TO_F16.YX", FMA_ONE_SRC },
646 { true, 0x000d0, "I16_TO_F16.XY", FMA_ONE_SRC },
647 { true, 0x000d1, "U16_TO_F16.XY", FMA_ONE_SRC },
648 { true, 0x000d8, "I16_TO_F16.YY", FMA_ONE_SRC },
649 { true, 0x000d9, "U16_TO_F16.YY", FMA_ONE_SRC },
650 { true, 0x00136, "F32_TO_I32", FMA_ONE_SRC },
651 { true, 0x00137, "F32_TO_U32", FMA_ONE_SRC },
652 { true, 0x00178, "I32_TO_F32", FMA_ONE_SRC },
653 { true, 0x00179, "U32_TO_F32", FMA_ONE_SRC },
654 { true, 0x00198, "I16_TO_I32.X", FMA_ONE_SRC },
655 { true, 0x00199, "U16_TO_U32.X", FMA_ONE_SRC },
656 { true, 0x0019a, "I16_TO_I32.Y", FMA_ONE_SRC },
657 { true, 0x0019b, "U16_TO_U32.Y", FMA_ONE_SRC },
658 { true, 0x0019c, "I16_TO_F32.X", FMA_ONE_SRC },
659 { true, 0x0019d, "U16_TO_F32.X", FMA_ONE_SRC },
660 { true, 0x0019e, "I16_TO_F32.Y", FMA_ONE_SRC },
661 { true, 0x0019f, "U16_TO_F32.Y", FMA_ONE_SRC },
662 { true, 0x001a2, "F16_TO_F32.X", FMA_ONE_SRC },
663 { true, 0x001a3, "F16_TO_F32.Y", FMA_ONE_SRC },
664 { true, 0x0032c, "NOP", FMA_ONE_SRC },
665 { true, 0x0032d, "MOV", FMA_ONE_SRC },
666 { true, 0x0032f, "SWZ.YY.v2i16", FMA_ONE_SRC },
667 { true, 0x00345, "LOG_FREXPM", FMA_ONE_SRC },
668 { true, 0x00365, "FRCP_FREXPM", FMA_ONE_SRC },
669 { true, 0x00375, "FSQRT_FREXPM", FMA_ONE_SRC },
670 { true, 0x0038d, "FRCP_FREXPE", FMA_ONE_SRC },
671 { true, 0x003a5, "FSQRT_FREXPE", FMA_ONE_SRC },
672 { true, 0x003ad, "FRSQ_FREXPE", FMA_ONE_SRC },
673 { true, 0x003c5, "LOG_FREXPE", FMA_ONE_SRC },
674 { true, 0x003fa, "CLZ", FMA_ONE_SRC },
675 { true, 0x00b80, "IMAX3", FMA_THREE_SRC },
676 { true, 0x00bc0, "UMAX3", FMA_THREE_SRC },
677 { true, 0x00c00, "IMIN3", FMA_THREE_SRC },
678 { true, 0x00c40, "UMIN3", FMA_THREE_SRC },
679 { true, 0x00ec5, "ROUND", FMA_ONE_SRC },
680 { true, 0x00f40, "CSEL", FMA_THREE_SRC }, // src2 != 0 ? src1 : src0
681 { true, 0x00fc0, "MUX.i32", FMA_THREE_SRC }, // see ADD comment
682 { true, 0x01805, "ROUNDEVEN", FMA_ONE_SRC },
683 { true, 0x01845, "CEIL", FMA_ONE_SRC },
684 { true, 0x01885, "FLOOR", FMA_ONE_SRC },
685 { true, 0x018c5, "TRUNC", FMA_ONE_SRC },
686 { true, 0x019b0, "ATAN_LDEXP.Y.f32", FMA_TWO_SRC },
687 { true, 0x019b8, "ATAN_LDEXP.X.f32", FMA_TWO_SRC },
688 { true, 0x01c80, "LSHIFT_ADD_LOW32.u32", FMA_SHIFT_ADD64 },
689 { true, 0x01cc0, "LSHIFT_ADD_LOW32.i64", FMA_SHIFT_ADD64 },
690 { true, 0x01d80, "LSHIFT_ADD_LOW32.i32", FMA_SHIFT_ADD64 },
691 { true, 0x01e00, "SEL.XX.i16", FMA_TWO_SRC },
692 { true, 0x01e08, "SEL.YX.i16", FMA_TWO_SRC },
693 { true, 0x01e10, "SEL.XY.i16", FMA_TWO_SRC },
694 { true, 0x01e18, "SEL.YY.i16", FMA_TWO_SRC },
695 { true, 0x00800, "IMAD", FMA_THREE_SRC },
696 { true, 0x078db, "POPCNT", FMA_ONE_SRC },
697 };
698
699 static struct fma_op_info find_fma_op_info(unsigned op, bool extended)
700 {
701 for (unsigned i = 0; i < ARRAY_SIZE(FMAOpInfos); i++) {
702 unsigned opCmp = ~0;
703
704 if (FMAOpInfos[i].extended != extended)
705 continue;
706
707 if (extended)
708 op &= ~0xe0000;
709
710 switch (FMAOpInfos[i].src_type) {
711 case FMA_ONE_SRC:
712 opCmp = op;
713 break;
714 case FMA_TWO_SRC:
715 opCmp = op & ~0x7;
716 break;
717 case FMA_FCMP:
718 case FMA_FCMP16:
719 opCmp = op & ~0x1fff;
720 break;
721 case FMA_THREE_SRC:
722 case FMA_SHIFT_ADD64:
723 opCmp = op & ~0x3f;
724 break;
725 case FMA_FADD:
726 case FMA_FMINMAX:
727 case FMA_FADD16:
728 case FMA_FMINMAX16:
729 opCmp = op & ~0x3fff;
730 break;
731 case FMA_FMA:
732 case FMA_FMA16:
733 opCmp = op & ~0x3ffff;
734 break;
735 case FMA_CSEL4:
736 case FMA_SHIFT:
737 opCmp = op & ~0xfff;
738 break;
739 case FMA_FMA_MSCALE:
740 opCmp = op & ~0x7fff;
741 break;
742 default:
743 opCmp = ~0;
744 break;
745 }
746 if (FMAOpInfos[i].op == opCmp)
747 return FMAOpInfos[i];
748 }
749
750 struct fma_op_info info;
751 snprintf(info.name, sizeof(info.name), "op%04x", op);
752 info.op = op;
753 info.src_type = FMA_THREE_SRC;
754 return info;
755 }
756
/* Print the suffix naming a float comparison. Codes 0..5 have names; any
 * other code is printed as ".unk" followed by its number.
 */
static void dump_fcmp(FILE *fp, unsigned op)
{
        static const char *const names[] = {
                ".OEQ", ".OGT", ".OGE", ".UNE", ".OLT", ".OLE",
        };

        if (op < sizeof(names) / sizeof(names[0]))
                fputs(names[op], fp);
        else
                fprintf(fp, ".unk%d", op);
}
783
/* Print a two-component 16-bit swizzle. Bit 0 of swiz selects the first
 * component and bit 1 the second (0 = x, 1 = y). The value 2 would print
 * ".xy", the identity, so it is omitted.
 */
static void dump_16swizzle(FILE *fp, unsigned swiz)
{
        if (swiz != 2) {
                const char first = (swiz & 0x1) ? 'y' : 'x';
                const char second = (swiz & 0x2) ? 'y' : 'x';

                fprintf(fp, ".%c%c", first, second);
        }
}
790
/* Print the component suffix that a 3-bit expand control gives to the
 * first source: nothing for 0..2, ".x" for 3, 4 and 6, ".y" for 5 and 7.
 * Values outside 0..7 print ".unk".
 */
static void dump_fma_expand_src0(FILE *fp, unsigned ctrl)
{
        static const char *const suffixes[8] = {
                "", "", "", ".x", ".x", ".y", ".x", ".y",
        };

        fputs(ctrl < 8 ? suffixes[ctrl] : ".unk", fp);
}
812
/* Print the component suffix that a 3-bit expand control gives to the
 * second source: ".x" for 1 and 3, ".y" for 2, 4 and 5, nothing for 0, 6
 * and 7. Values outside 0..7 print ".unk".
 */
static void dump_fma_expand_src1(FILE *fp, unsigned ctrl)
{
        static const char *const suffixes[8] = {
                "", ".x", ".y", ".x", ".y", ".y", "", "",
        };

        fputs(ctrl < 8 ? suffixes[ctrl] : ".unk", fp);
}
834
835 static const char *
836 bi_ldst_type_name(enum bifrost_ldst_type type)
837 {
838 switch (type) {
839 case BIFROST_LDST_F16: return "f16";
840 case BIFROST_LDST_F32: return "f32";
841 case BIFROST_LDST_I32: return "i32";
842 case BIFROST_LDST_U32: return "u32";
843 default: return "invalid";
844 }
845 }
846
847 static void dump_fma(FILE *fp, uint64_t word, struct bifrost_regs regs, struct bifrost_regs next_regs, uint64_t *consts, bool verbose)
848 {
849 if (verbose) {
850 fprintf(fp, "# FMA: %016" PRIx64 "\n", word);
851 }
852 struct bifrost_fma_inst FMA;
853 memcpy((char *) &FMA, (char *) &word, sizeof(struct bifrost_fma_inst));
854 struct fma_op_info info = find_fma_op_info(FMA.op, (FMA.op & 0xe0000) == 0xe0000);
855
856 fprintf(fp, "%s", info.name);
857 if (info.src_type == FMA_FADD ||
858 info.src_type == FMA_FMINMAX ||
859 info.src_type == FMA_FMA ||
860 info.src_type == FMA_FADD16 ||
861 info.src_type == FMA_FMINMAX16 ||
862 info.src_type == FMA_FMA16) {
863 dump_output_mod(fp, bits(FMA.op, 12, 14));
864 switch (info.src_type) {
865 case FMA_FADD:
866 case FMA_FMA:
867 case FMA_FADD16:
868 case FMA_FMA16:
869 dump_round_mode(fp, bits(FMA.op, 10, 12));
870 break;
871 case FMA_FMINMAX:
872 case FMA_FMINMAX16:
873 dump_minmax_mode(fp, bits(FMA.op, 10, 12));
874 break;
875 default:
876 assert(0);
877 }
878 } else if (info.src_type == FMA_FCMP || info.src_type == FMA_FCMP16) {
879 dump_fcmp(fp, bits(FMA.op, 10, 13));
880 if (info.src_type == FMA_FCMP)
881 fprintf(fp, ".f32");
882 else
883 fprintf(fp, ".v2f16");
884 } else if (info.src_type == FMA_FMA_MSCALE) {
885 if (FMA.op & (1 << 11)) {
886 switch ((FMA.op >> 9) & 0x3) {
887 case 0:
888 /* This mode seems to do a few things:
889 * - Makes 0 * infinity (and incidentally 0 * nan) return 0,
890 * since generating a nan would poison the result of
891 * 1/infinity and 1/0.
892 * - Fiddles with which nan is returned in nan * nan,
893 * presumably to make sure that the same exact nan is
894 * returned for 1/nan.
895 */
896 fprintf(fp, ".rcp_mode");
897 break;
898 case 3:
899 /* Similar to the above, but src0 always wins when multiplying
900 * 0 by infinity.
901 */
902 fprintf(fp, ".sqrt_mode");
903 break;
904 default:
905 fprintf(fp, ".unk%d_mode", (int) (FMA.op >> 9) & 0x3);
906 }
907 } else {
908 dump_output_mod(fp, bits(FMA.op, 9, 11));
909 }
910 } else if (info.src_type == FMA_SHIFT) {
911 struct bifrost_shift_fma shift;
912 memcpy(&shift, &FMA, sizeof(shift));
913
914 if (shift.half == 0x7)
915 fprintf(fp, ".v2i16");
916 else if (shift.half == 0)
917 fprintf(fp, ".i32");
918 else if (shift.half == 0x4)
919 fprintf(fp, ".v4i8");
920 else
921 fprintf(fp, ".unk%u", shift.half);
922
923 if (!shift.unk)
924 fprintf(fp, ".no_unk");
925
926 if (shift.invert_1)
927 fprintf(fp, ".invert_1");
928
929 if (shift.invert_2)
930 fprintf(fp, ".invert_2");
931 }
932
933 fprintf(fp, " ");
934
935 struct bifrost_reg_ctrl next_ctrl = DecodeRegCtrl(fp, next_regs);
936 if (next_ctrl.fma_write_unit != REG_WRITE_NONE) {
937 fprintf(fp, "{R%d, T0}, ", GetRegToWrite(next_ctrl.fma_write_unit, next_regs));
938 } else {
939 fprintf(fp, "T0, ");
940 }
941
942 switch (info.src_type) {
943 case FMA_ONE_SRC:
944 dump_src(fp, FMA.src0, regs, consts, true);
945 break;
946 case FMA_TWO_SRC:
947 dump_src(fp, FMA.src0, regs, consts, true);
948 fprintf(fp, ", ");
949 dump_src(fp, FMA.op & 0x7, regs, consts, true);
950 break;
951 case FMA_FADD:
952 case FMA_FMINMAX:
953 if (FMA.op & 0x10)
954 fprintf(fp, "-");
955 if (FMA.op & 0x200)
956 fprintf(fp, "abs(");
957 dump_src(fp, FMA.src0, regs, consts, true);
958 dump_fma_expand_src0(fp, (FMA.op >> 6) & 0x7);
959 if (FMA.op & 0x200)
960 fprintf(fp, ")");
961 fprintf(fp, ", ");
962 if (FMA.op & 0x20)
963 fprintf(fp, "-");
964 if (FMA.op & 0x8)
965 fprintf(fp, "abs(");
966 dump_src(fp, FMA.op & 0x7, regs, consts, true);
967 dump_fma_expand_src1(fp, (FMA.op >> 6) & 0x7);
968 if (FMA.op & 0x8)
969 fprintf(fp, ")");
970 break;
971 case FMA_FADD16:
972 case FMA_FMINMAX16: {
973 bool abs1 = FMA.op & 0x8;
974 bool abs2 = (FMA.op & 0x7) < FMA.src0;
975 if (FMA.op & 0x10)
976 fprintf(fp, "-");
977 if (abs1 || abs2)
978 fprintf(fp, "abs(");
979 dump_src(fp, FMA.src0, regs, consts, true);
980 dump_16swizzle(fp, (FMA.op >> 6) & 0x3);
981 if (abs1 || abs2)
982 fprintf(fp, ")");
983 fprintf(fp, ", ");
984 if (FMA.op & 0x20)
985 fprintf(fp, "-");
986 if (abs1 && abs2)
987 fprintf(fp, "abs(");
988 dump_src(fp, FMA.op & 0x7, regs, consts, true);
989 dump_16swizzle(fp, (FMA.op >> 8) & 0x3);
990 if (abs1 && abs2)
991 fprintf(fp, ")");
992 break;
993 }
994 case FMA_FCMP:
995 if (FMA.op & 0x200)
996 fprintf(fp, "abs(");
997 dump_src(fp, FMA.src0, regs, consts, true);
998 dump_fma_expand_src0(fp, (FMA.op >> 6) & 0x7);
999 if (FMA.op & 0x200)
1000 fprintf(fp, ")");
1001 fprintf(fp, ", ");
1002 if (FMA.op & 0x20)
1003 fprintf(fp, "-");
1004 if (FMA.op & 0x8)
1005 fprintf(fp, "abs(");
1006 dump_src(fp, FMA.op & 0x7, regs, consts, true);
1007 dump_fma_expand_src1(fp, (FMA.op >> 6) & 0x7);
1008 if (FMA.op & 0x8)
1009 fprintf(fp, ")");
1010 break;
1011 case FMA_FCMP16:
1012 dump_src(fp, FMA.src0, regs, consts, true);
1013 // Note: this is kinda a guess, I haven't seen the blob set this to
1014 // anything other than the identity, but it matches FMA_TWO_SRCFmod16
1015 dump_16swizzle(fp, (FMA.op >> 6) & 0x3);
1016 fprintf(fp, ", ");
1017 dump_src(fp, FMA.op & 0x7, regs, consts, true);
1018 dump_16swizzle(fp, (FMA.op >> 8) & 0x3);
1019 break;
1020 case FMA_SHIFT_ADD64:
1021 dump_src(fp, FMA.src0, regs, consts, true);
1022 fprintf(fp, ", ");
1023 dump_src(fp, FMA.op & 0x7, regs, consts, true);
1024 fprintf(fp, ", ");
1025 fprintf(fp, "shift:%u", (FMA.op >> 3) & 0x7);
1026 break;
1027 case FMA_THREE_SRC:
1028 dump_src(fp, FMA.src0, regs, consts, true);
1029 fprintf(fp, ", ");
1030 dump_src(fp, FMA.op & 0x7, regs, consts, true);
1031 fprintf(fp, ", ");
1032 dump_src(fp, (FMA.op >> 3) & 0x7, regs, consts, true);
1033 break;
1034 case FMA_SHIFT: {
1035 struct bifrost_shift_fma shift;
1036 memcpy(&shift, &FMA, sizeof(shift));
1037
1038 dump_src(fp, shift.src0, regs, consts, true);
1039 fprintf(fp, ", ");
1040 dump_src(fp, shift.src1, regs, consts, true);
1041 fprintf(fp, ", ");
1042 dump_src(fp, shift.src2, regs, consts, true);
1043 break;
1044 }
1045 case FMA_FMA:
1046 if (FMA.op & (1 << 14))
1047 fprintf(fp, "-");
1048 if (FMA.op & (1 << 9))
1049 fprintf(fp, "abs(");
1050 dump_src(fp, FMA.src0, regs, consts, true);
1051 dump_fma_expand_src0(fp, (FMA.op >> 6) & 0x7);
1052 if (FMA.op & (1 << 9))
1053 fprintf(fp, ")");
1054 fprintf(fp, ", ");
1055 if (FMA.op & (1 << 16))
1056 fprintf(fp, "abs(");
1057 dump_src(fp, FMA.op & 0x7, regs, consts, true);
1058 dump_fma_expand_src1(fp, (FMA.op >> 6) & 0x7);
1059 if (FMA.op & (1 << 16))
1060 fprintf(fp, ")");
1061 fprintf(fp, ", ");
1062 if (FMA.op & (1 << 15))
1063 fprintf(fp, "-");
1064 if (FMA.op & (1 << 17))
1065 fprintf(fp, "abs(");
1066 dump_src(fp, (FMA.op >> 3) & 0x7, regs, consts, true);
1067 if (FMA.op & (1 << 17))
1068 fprintf(fp, ")");
1069 break;
1070 case FMA_FMA16:
1071 if (FMA.op & (1 << 14))
1072 fprintf(fp, "-");
1073 dump_src(fp, FMA.src0, regs, consts, true);
1074 dump_16swizzle(fp, (FMA.op >> 6) & 0x3);
1075 fprintf(fp, ", ");
1076 dump_src(fp, FMA.op & 0x7, regs, consts, true);
1077 dump_16swizzle(fp, (FMA.op >> 8) & 0x3);
1078 fprintf(fp, ", ");
1079 if (FMA.op & (1 << 15))
1080 fprintf(fp, "-");
1081 dump_src(fp, (FMA.op >> 3) & 0x7, regs, consts, true);
1082 dump_16swizzle(fp, (FMA.op >> 16) & 0x3);
1083 break;
1084 case FMA_CSEL4: {
1085 struct bifrost_csel4 csel;
1086 memcpy(&csel, &FMA, sizeof(csel));
1087 fprintf(fp, ".%s ", csel_cond_name(csel.cond));
1088
1089 dump_src(fp, csel.src0, regs, consts, true);
1090 fprintf(fp, ", ");
1091 dump_src(fp, csel.src1, regs, consts, true);
1092 fprintf(fp, ", ");
1093 dump_src(fp, csel.src2, regs, consts, true);
1094 fprintf(fp, ", ");
1095 dump_src(fp, csel.src3, regs, consts, true);
1096 break;
1097 }
1098 case FMA_FMA_MSCALE:
1099 if (FMA.op & (1 << 12))
1100 fprintf(fp, "abs(");
1101 dump_src(fp, FMA.src0, regs, consts, true);
1102 if (FMA.op & (1 << 12))
1103 fprintf(fp, ")");
1104 fprintf(fp, ", ");
1105 if (FMA.op & (1 << 13))
1106 fprintf(fp, "-");
1107 dump_src(fp, FMA.op & 0x7, regs, consts, true);
1108 fprintf(fp, ", ");
1109 if (FMA.op & (1 << 14))
1110 fprintf(fp, "-");
1111 dump_src(fp, (FMA.op >> 3) & 0x7, regs, consts, true);
1112 fprintf(fp, ", ");
1113 dump_src(fp, (FMA.op >> 6) & 0x7, regs, consts, true);
1114 break;
1115 }
1116 fprintf(fp, "\n");
1117 }
1118
1119 static const struct add_op_info add_op_infos[] = {
1120 { 0x00000, "MAX.f32", ADD_FMINMAX },
1121 { 0x02000, "MIN.f32", ADD_FMINMAX },
1122 { 0x04000, "ADD.f32", ADD_FADD },
1123 { 0x06000, "FCMP.GL", ADD_FCMP },
1124 { 0x07000, "FCMP.D3D", ADD_FCMP },
1125 { 0x07856, "F16_TO_I16", ADD_ONE_SRC },
1126 { 0x07857, "F16_TO_U16", ADD_ONE_SRC },
1127 { 0x078c0, "I16_TO_F16.XX", ADD_ONE_SRC },
1128 { 0x078c1, "U16_TO_F16.XX", ADD_ONE_SRC },
1129 { 0x078c8, "I16_TO_F16.YX", ADD_ONE_SRC },
1130 { 0x078c9, "U16_TO_F16.YX", ADD_ONE_SRC },
1131 { 0x078d0, "I16_TO_F16.XY", ADD_ONE_SRC },
1132 { 0x078d1, "U16_TO_F16.XY", ADD_ONE_SRC },
1133 { 0x078d8, "I16_TO_F16.YY", ADD_ONE_SRC },
1134 { 0x078d9, "U16_TO_F16.YY", ADD_ONE_SRC },
1135 { 0x07936, "F32_TO_I32", ADD_ONE_SRC },
1136 { 0x07937, "F32_TO_U32", ADD_ONE_SRC },
1137 { 0x07978, "I32_TO_F32", ADD_ONE_SRC },
1138 { 0x07979, "U32_TO_F32", ADD_ONE_SRC },
1139 { 0x07998, "I16_TO_I32.X", ADD_ONE_SRC },
1140 { 0x07999, "U16_TO_U32.X", ADD_ONE_SRC },
1141 { 0x0799a, "I16_TO_I32.Y", ADD_ONE_SRC },
1142 { 0x0799b, "U16_TO_U32.Y", ADD_ONE_SRC },
1143 { 0x0799c, "I16_TO_F32.X", ADD_ONE_SRC },
1144 { 0x0799d, "U16_TO_F32.X", ADD_ONE_SRC },
1145 { 0x0799e, "I16_TO_F32.Y", ADD_ONE_SRC },
1146 { 0x0799f, "U16_TO_F32.Y", ADD_ONE_SRC },
1147 { 0x079a2, "F16_TO_F32.X", ADD_ONE_SRC },
1148 { 0x079a3, "F16_TO_F32.Y", ADD_ONE_SRC },
1149 { 0x07b2b, "SWZ.YX.v2i16", ADD_ONE_SRC },
1150 { 0x07b2c, "NOP", ADD_ONE_SRC },
1151 { 0x07b29, "SWZ.XX.v2i16", ADD_ONE_SRC },
1152 { 0x07b2d, "MOV", ADD_ONE_SRC },
1153 { 0x07b2f, "SWZ.YY.v2i16", ADD_ONE_SRC },
1154 { 0x07b65, "FRCP_FREXPM", ADD_ONE_SRC },
1155 { 0x07b75, "FSQRT_FREXPM", ADD_ONE_SRC },
1156 { 0x07b8d, "FRCP_FREXPE", ADD_ONE_SRC },
1157 { 0x07ba5, "FSQRT_FREXPE", ADD_ONE_SRC },
1158 { 0x07bad, "FRSQ_FREXPE", ADD_ONE_SRC },
1159 { 0x07bc5, "FLOG_FREXPE", ADD_ONE_SRC },
1160 { 0x07d45, "CEIL", ADD_ONE_SRC },
1161 { 0x07d85, "FLOOR", ADD_ONE_SRC },
1162 { 0x07dc5, "TRUNC", ADD_ONE_SRC },
1163 { 0x07f18, "LSHIFT_ADD_HIGH32.i32", ADD_TWO_SRC },
1164 { 0x08000, "LD_ATTR", ADD_LOAD_ATTR, true },
1165 { 0x0a000, "LD_VAR.32", ADD_VARYING_INTERP, true },
1166 { 0x0b000, "TEX", ADD_TEX_COMPACT, true },
1167 { 0x0c188, "LOAD.i32", ADD_TWO_SRC, true },
1168 { 0x0c1a0, "LD_UBO.i32", ADD_TWO_SRC, true },
1169 { 0x0c1b8, "LD_SCRATCH.v2i32", ADD_TWO_SRC, true },
1170 { 0x0c1c8, "LOAD.v2i32", ADD_TWO_SRC, true },
1171 { 0x0c1e0, "LD_UBO.v2i32", ADD_TWO_SRC, true },
1172 { 0x0c1f8, "LD_SCRATCH.v2i32", ADD_TWO_SRC, true },
1173 { 0x0c208, "LOAD.v4i32", ADD_TWO_SRC, true },
1174 { 0x0c220, "LD_UBO.v4i32", ADD_TWO_SRC, true },
1175 { 0x0c238, "LD_SCRATCH.v4i32", ADD_TWO_SRC, true },
1176 { 0x0c248, "STORE.v4i32", ADD_TWO_SRC, true },
1177 { 0x0c278, "ST_SCRATCH.v4i32", ADD_TWO_SRC, true },
1178 { 0x0c588, "STORE.i32", ADD_TWO_SRC, true },
1179 { 0x0c5b8, "ST_SCRATCH.i32", ADD_TWO_SRC, true },
1180 { 0x0c5c8, "STORE.v2i32", ADD_TWO_SRC, true },
1181 { 0x0c5f8, "ST_SCRATCH.v2i32", ADD_TWO_SRC, true },
1182 { 0x0c648, "LOAD.u16", ADD_TWO_SRC, true }, // zero-extends
1183 { 0x0ca88, "LOAD.v3i32", ADD_TWO_SRC, true },
1184 { 0x0caa0, "LD_UBO.v3i32", ADD_TWO_SRC, true },
1185 { 0x0cab8, "LD_SCRATCH.v3i32", ADD_TWO_SRC, true },
1186 { 0x0cb88, "STORE.v3i32", ADD_TWO_SRC, true },
1187 { 0x0cbb8, "ST_SCRATCH.v3i32", ADD_TWO_SRC, true },
1188 { 0x0cc00, "FRCP_FAST.f32", ADD_ONE_SRC },
1189 { 0x0cc20, "FRSQ_FAST.f32", ADD_ONE_SRC },
1190 { 0x0ce00, "FRCP_TABLE", ADD_ONE_SRC },
1191 { 0x0ce10, "FRCP_FAST.f16.X", ADD_ONE_SRC },
1192 { 0x0ce20, "FRSQ_TABLE", ADD_ONE_SRC },
1193 { 0x0ce30, "FRCP_FAST.f16.Y", ADD_ONE_SRC },
1194 { 0x0ce50, "FRSQ_FAST.f16.X", ADD_ONE_SRC },
1195 { 0x0ce60, "FRCP_APPROX", ADD_ONE_SRC },
1196 { 0x0ce70, "FRSQ_FAST.f16.Y", ADD_ONE_SRC },
1197 { 0x0cf40, "ATAN_ASSIST", ADD_TWO_SRC },
1198 { 0x0cf48, "ATAN_TABLE", ADD_TWO_SRC },
1199 { 0x0cf50, "SIN_TABLE", ADD_ONE_SRC },
1200 { 0x0cf51, "COS_TABLE", ADD_ONE_SRC },
1201 { 0x0cf58, "EXP_TABLE", ADD_ONE_SRC },
1202 { 0x0cf60, "FLOG2_TABLE", ADD_ONE_SRC },
1203 { 0x0cf64, "FLOGE_TABLE", ADD_ONE_SRC },
1204 { 0x0d000, "BRANCH", ADD_BRANCH },
1205 { 0x0e8c0, "MUX", ADD_THREE_SRC },
1206 { 0x0e9b0, "ATAN_LDEXP.Y.f32", ADD_TWO_SRC },
1207 { 0x0e9b8, "ATAN_LDEXP.X.f32", ADD_TWO_SRC },
1208 { 0x0ea60, "SEL.XX.i16", ADD_TWO_SRC },
1209 { 0x0ea70, "SEL.XY.i16", ADD_TWO_SRC },
1210 { 0x0ea68, "SEL.YX.i16", ADD_TWO_SRC },
1211 { 0x0ea78, "SEL.YY.i16", ADD_TWO_SRC },
1212 { 0x0ec00, "F32_TO_F16", ADD_TWO_SRC },
1213 { 0x0f640, "ICMP.GL.GT", ADD_TWO_SRC }, // src0 > src1 ? 1 : 0
1214 { 0x0f648, "ICMP.GL.GE", ADD_TWO_SRC },
1215 { 0x0f650, "UCMP.GL.GT", ADD_TWO_SRC },
1216 { 0x0f658, "UCMP.GL.GE", ADD_TWO_SRC },
1217 { 0x0f660, "ICMP.GL.EQ", ADD_TWO_SRC },
1218 { 0x0f669, "ICMP.GL.NEQ", ADD_TWO_SRC },
1219 { 0x0f6c0, "ICMP.D3D.GT", ADD_TWO_SRC }, // src0 > src1 ? ~0 : 0
1220 { 0x0f6c8, "ICMP.D3D.GE", ADD_TWO_SRC },
1221 { 0x0f6d0, "UCMP.D3D.GT", ADD_TWO_SRC },
1222 { 0x0f6d8, "UCMP.D3D.GE", ADD_TWO_SRC },
1223 { 0x0f6e0, "ICMP.D3D.EQ", ADD_TWO_SRC },
1224 { 0x10000, "MAX.v2f16", ADD_FMINMAX16 },
1225 { 0x11000, "ADD_MSCALE.f32", ADD_FADDMscale },
1226 { 0x12000, "MIN.v2f16", ADD_FMINMAX16 },
1227 { 0x14000, "ADD.v2f16", ADD_FADD16 },
1228 { 0x17000, "FCMP.D3D", ADD_FCMP16 },
1229 { 0x178c0, "ADD.i32", ADD_TWO_SRC },
1230 { 0x17900, "ADD.v2i16", ADD_TWO_SRC },
1231 { 0x17ac0, "SUB.i32", ADD_TWO_SRC },
1232 { 0x17c10, "ADDC.i32", ADD_TWO_SRC }, // adds src0 to the bottom bit of src1
1233 { 0x17d80, "ADD.i32.i16.X", ADD_TWO_SRC },
1234 { 0x17d90, "ADD.i32.u16.X", ADD_TWO_SRC },
1235 { 0x17dc0, "ADD.i32.i16.Y", ADD_TWO_SRC },
1236 { 0x17dd0, "ADD.i32.u16.Y", ADD_TWO_SRC },
1237 { 0x18000, "LD_VAR_ADDR", ADD_VARYING_ADDRESS, true },
1238 { 0x19181, "DISCARD.FEQ.f32", ADD_TWO_SRC, true },
1239 { 0x19189, "DISCARD.FNE.f32", ADD_TWO_SRC, true },
1240 { 0x1918C, "DISCARD.GL.f32", ADD_TWO_SRC, true }, /* Consumes ICMP.GL/etc with fixed 0 argument */
1241 { 0x19190, "DISCARD.FLE.f32", ADD_TWO_SRC, true },
1242 { 0x19198, "DISCARD.FLT.f32", ADD_TWO_SRC, true },
1243 { 0x191e8, "ATEST.f32", ADD_TWO_SRC, true },
1244 { 0x191f0, "ATEST.X.f16", ADD_TWO_SRC, true },
1245 { 0x191f8, "ATEST.Y.f16", ADD_TWO_SRC, true },
1246 { 0x19300, "ST_VAR.v1", ADD_THREE_SRC, true },
1247 { 0x19340, "ST_VAR.v2", ADD_THREE_SRC, true },
1248 { 0x19380, "ST_VAR.v3", ADD_THREE_SRC, true },
1249 { 0x193c0, "ST_VAR.v4", ADD_THREE_SRC, true },
1250 { 0x1952c, "BLEND", ADD_BLENDING, true },
1251 { 0x1a000, "LD_VAR.16", ADD_VARYING_INTERP, true },
1252 { 0x1ae60, "TEX", ADD_TEX, true },
1253 { 0x1c000, "RSHIFT_NAND.i32", ADD_SHIFT },
1254 { 0x1c400, "RSHIFT_AND.i32", ADD_SHIFT },
1255 { 0x1c800, "LSHIFT_NAND.i32", ADD_SHIFT },
1256 { 0x1cc00, "LSHIFT_AND.i32", ADD_SHIFT },
1257 { 0x1d000, "RSHIFT_XOR.i32", ADD_SHIFT },
1258 { 0x1d400, "LSHIFT_ADD.i32", ADD_SHIFT },
1259 { 0x1d800, "RSHIFT_SUB.i32", ADD_SHIFT },
1260 { 0x1dd18, "OR.i32", ADD_TWO_SRC },
1261 { 0x1dd20, "AND.i32", ADD_TWO_SRC },
1262 { 0x1dd60, "LSHIFT.i32", ADD_TWO_SRC },
1263 { 0x1dd50, "XOR.i32", ADD_TWO_SRC },
1264 { 0x1dd80, "RSHIFT.i32", ADD_TWO_SRC },
1265 { 0x1dda0, "ARSHIFT.i32", ADD_TWO_SRC },
1266 };
1267
1268 static struct add_op_info find_add_op_info(unsigned op)
1269 {
1270 for (unsigned i = 0; i < ARRAY_SIZE(add_op_infos); i++) {
1271 unsigned opCmp = ~0;
1272 switch (add_op_infos[i].src_type) {
1273 case ADD_ONE_SRC:
1274 case ADD_BLENDING:
1275 opCmp = op;
1276 break;
1277 case ADD_TWO_SRC:
1278 opCmp = op & ~0x7;
1279 break;
1280 case ADD_THREE_SRC:
1281 opCmp = op & ~0x3f;
1282 break;
1283 case ADD_SHIFT:
1284 opCmp = op & ~0x3ff;
1285 break;
1286 case ADD_TEX:
1287 opCmp = op & ~0xf;
1288 break;
1289 case ADD_FADD:
1290 case ADD_FMINMAX:
1291 case ADD_FADD16:
1292 opCmp = op & ~0x1fff;
1293 break;
1294 case ADD_FMINMAX16:
1295 case ADD_FADDMscale:
1296 opCmp = op & ~0xfff;
1297 break;
1298 case ADD_FCMP:
1299 case ADD_FCMP16:
1300 opCmp = op & ~0x7ff;
1301 break;
1302 case ADD_TEX_COMPACT:
1303 opCmp = op & ~0x3ff;
1304 break;
1305 case ADD_VARYING_INTERP:
1306 opCmp = op & ~0x7ff;
1307 break;
1308 case ADD_VARYING_ADDRESS:
1309 opCmp = op & ~0xfff;
1310 break;
1311 case ADD_LOAD_ATTR:
1312 case ADD_BRANCH:
1313 opCmp = op & ~0xfff;
1314 break;
1315 default:
1316 opCmp = ~0;
1317 break;
1318 }
1319 if (add_op_infos[i].op == opCmp)
1320 return add_op_infos[i];
1321 }
1322
1323 struct add_op_info info;
1324 snprintf(info.name, sizeof(info.name), "op%04x", op);
1325 info.op = op;
1326 info.src_type = ADD_TWO_SRC;
1327 info.has_data_reg = true;
1328 return info;
1329 }
1330
1331 static void dump_add(FILE *fp, uint64_t word, struct bifrost_regs regs,
1332 struct bifrost_regs next_regs, uint64_t *consts,
1333 unsigned data_reg, unsigned offset, bool verbose)
1334 {
1335 if (verbose) {
1336 fprintf(fp, "# ADD: %016" PRIx64 "\n", word);
1337 }
1338 struct bifrost_add_inst ADD;
1339 memcpy((char *) &ADD, (char *) &word, sizeof(ADD));
1340 struct add_op_info info = find_add_op_info(ADD.op);
1341
1342 fprintf(fp, "%s", info.name);
1343
1344 // float16 seems like it doesn't support output modifiers
1345 if (info.src_type == ADD_FADD || info.src_type == ADD_FMINMAX) {
1346 // output modifiers
1347 dump_output_mod(fp, bits(ADD.op, 8, 10));
1348 if (info.src_type == ADD_FADD)
1349 dump_round_mode(fp, bits(ADD.op, 10, 12));
1350 else
1351 dump_minmax_mode(fp, bits(ADD.op, 10, 12));
1352 } else if (info.src_type == ADD_FCMP || info.src_type == ADD_FCMP16) {
1353 dump_fcmp(fp, bits(ADD.op, 3, 6));
1354 if (info.src_type == ADD_FCMP)
1355 fprintf(fp, ".f32");
1356 else
1357 fprintf(fp, ".v2f16");
1358 } else if (info.src_type == ADD_FADDMscale) {
1359 switch ((ADD.op >> 6) & 0x7) {
1360 case 0:
1361 break;
1362 // causes GPU hangs on G71
1363 case 1:
1364 fprintf(fp, ".invalid");
1365 break;
1366 // Same as usual outmod value.
1367 case 2:
1368 fprintf(fp, ".clamp_0_1");
1369 break;
1370 // If src0 is infinite or NaN, flush it to zero so that the other
1371 // source is passed through unmodified.
1372 case 3:
1373 fprintf(fp, ".flush_src0_inf_nan");
1374 break;
1375 // Vice versa.
1376 case 4:
1377 fprintf(fp, ".flush_src1_inf_nan");
1378 break;
1379 // Every other case seems to behave the same as the above?
1380 default:
1381 fprintf(fp, ".unk%d", (ADD.op >> 6) & 0x7);
1382 break;
1383 }
1384 } else if (info.src_type == ADD_VARYING_INTERP) {
1385 if (ADD.op & 0x200)
1386 fprintf(fp, ".reuse");
1387 if (ADD.op & 0x400)
1388 fprintf(fp, ".flat");
1389 switch ((ADD.op >> 7) & 0x3) {
1390 case 0:
1391 fprintf(fp, ".per_frag");
1392 break;
1393 case 1:
1394 fprintf(fp, ".centroid");
1395 break;
1396 case 2:
1397 break;
1398 case 3:
1399 fprintf(fp, ".explicit");
1400 break;
1401 }
1402 fprintf(fp, ".v%d", ((ADD.op >> 5) & 0x3) + 1);
1403 } else if (info.src_type == ADD_BRANCH) {
1404 enum branch_code branchCode = (enum branch_code) ((ADD.op >> 6) & 0x3f);
1405 if (branchCode == BR_ALWAYS) {
1406 // unconditional branch
1407 } else {
1408 enum branch_cond cond = (enum branch_cond) ((ADD.op >> 6) & 0x7);
1409 enum branch_bit_size size = (enum branch_bit_size) ((ADD.op >> 9) & 0x7);
1410 bool portSwapped = (ADD.op & 0x7) < ADD.src0;
1411 // See the comment in branch_bit_size
1412 if (size == BR_SIZE_16YX0)
1413 portSwapped = true;
1414 if (size == BR_SIZE_16YX1)
1415 portSwapped = false;
1416 // These sizes are only for floating point comparisons, so the
1417 // non-floating-point comparisons are reused to encode the flipped
1418 // versions.
1419 if (size == BR_SIZE_32_AND_16X || size == BR_SIZE_32_AND_16Y)
1420 portSwapped = false;
1421 // There's only one argument, so we reuse the extra argument to
1422 // encode this.
1423 if (size == BR_SIZE_ZERO)
1424 portSwapped = !(ADD.op & 1);
1425
1426 switch (cond) {
1427 case BR_COND_LT:
1428 if (portSwapped)
1429 fprintf(fp, ".LT.u");
1430 else
1431 fprintf(fp, ".LT.i");
1432 break;
1433 case BR_COND_LE:
1434 if (size == BR_SIZE_32_AND_16X || size == BR_SIZE_32_AND_16Y) {
1435 fprintf(fp, ".UNE.f");
1436 } else {
1437 if (portSwapped)
1438 fprintf(fp, ".LE.u");
1439 else
1440 fprintf(fp, ".LE.i");
1441 }
1442 break;
1443 case BR_COND_GT:
1444 if (portSwapped)
1445 fprintf(fp, ".GT.u");
1446 else
1447 fprintf(fp, ".GT.i");
1448 break;
1449 case BR_COND_GE:
1450 if (portSwapped)
1451 fprintf(fp, ".GE.u");
1452 else
1453 fprintf(fp, ".GE.i");
1454 break;
1455 case BR_COND_EQ:
1456 if (portSwapped)
1457 fprintf(fp, ".NE.i");
1458 else
1459 fprintf(fp, ".EQ.i");
1460 break;
1461 case BR_COND_OEQ:
1462 if (portSwapped)
1463 fprintf(fp, ".UNE.f");
1464 else
1465 fprintf(fp, ".OEQ.f");
1466 break;
1467 case BR_COND_OGT:
1468 if (portSwapped)
1469 fprintf(fp, ".OGT.unk.f");
1470 else
1471 fprintf(fp, ".OGT.f");
1472 break;
1473 case BR_COND_OLT:
1474 if (portSwapped)
1475 fprintf(fp, ".OLT.unk.f");
1476 else
1477 fprintf(fp, ".OLT.f");
1478 break;
1479 }
1480 switch (size) {
1481 case BR_SIZE_32:
1482 case BR_SIZE_32_AND_16X:
1483 case BR_SIZE_32_AND_16Y:
1484 fprintf(fp, "32");
1485 break;
1486 case BR_SIZE_16XX:
1487 case BR_SIZE_16YY:
1488 case BR_SIZE_16YX0:
1489 case BR_SIZE_16YX1:
1490 fprintf(fp, "16");
1491 break;
1492 case BR_SIZE_ZERO: {
1493 unsigned ctrl = (ADD.op >> 1) & 0x3;
1494 if (ctrl == 0)
1495 fprintf(fp, "32.Z");
1496 else
1497 fprintf(fp, "16.Z");
1498 break;
1499 }
1500 }
1501 }
1502 } else if (info.src_type == ADD_SHIFT) {
1503 struct bifrost_shift_add shift;
1504 memcpy(&shift, &ADD, sizeof(ADD));
1505
1506 if (shift.invert_1)
1507 fprintf(fp, ".invert_1");
1508
1509 if (shift.invert_2)
1510 fprintf(fp, ".invert_2");
1511
1512 if (shift.zero)
1513 fprintf(fp, ".unk%u", shift.zero);
1514 } else if (info.src_type == ADD_VARYING_ADDRESS) {
1515 struct bifrost_ld_var_addr ld;
1516 memcpy(&ld, &ADD, sizeof(ADD));
1517 fprintf(fp, ".%s", bi_ldst_type_name(ld.type));
1518 } else if (info.src_type == ADD_LOAD_ATTR) {
1519 struct bifrost_ld_attr ld;
1520 memcpy(&ld, &ADD, sizeof(ADD));
1521
1522 if (ld.channels)
1523 fprintf(fp, ".v%d%s", ld.channels + 1, bi_ldst_type_name(ld.type));
1524 else
1525 fprintf(fp, ".%s", bi_ldst_type_name(ld.type));
1526 }
1527
1528 fprintf(fp, " ");
1529
1530 struct bifrost_reg_ctrl next_ctrl = DecodeRegCtrl(fp, next_regs);
1531 if (next_ctrl.add_write_unit != REG_WRITE_NONE) {
1532 fprintf(fp, "{R%d, T1}, ", GetRegToWrite(next_ctrl.add_write_unit, next_regs));
1533 } else {
1534 fprintf(fp, "T1, ");
1535 }
1536
1537 switch (info.src_type) {
1538 case ADD_BLENDING:
1539 // Note: in this case, regs.uniform_const == location | 0x8
1540 // This probably means we can't load uniforms or immediates in the
1541 // same instruction. This re-uses the encoding that normally means
1542 // "disabled", where the low 4 bits are ignored. Perhaps the extra
1543 // 0x8 or'd in indicates this is happening.
1544 fprintf(fp, "location:%d, ", regs.uniform_const & 0x7);
1545 // fallthrough
1546 case ADD_ONE_SRC:
1547 dump_src(fp, ADD.src0, regs, consts, false);
1548 break;
1549 case ADD_TEX:
1550 case ADD_TEX_COMPACT: {
1551 int tex_index;
1552 int sampler_index;
1553 bool dualTex = false;
1554 if (info.src_type == ADD_TEX_COMPACT) {
1555 tex_index = (ADD.op >> 3) & 0x7;
1556 sampler_index = (ADD.op >> 7) & 0x7;
1557 bool unknown = (ADD.op & 0x40);
1558 // TODO: figure out if the unknown bit is ever 0
1559 if (!unknown)
1560 fprintf(fp, "unknown ");
1561 } else {
1562 uint64_t constVal = get_const(consts, regs);
1563 uint32_t controlBits = (ADD.op & 0x8) ? (constVal >> 32) : constVal;
1564 struct bifrost_tex_ctrl ctrl;
1565 memcpy((char *) &ctrl, (char *) &controlBits, sizeof(ctrl));
1566
1567 // TODO: figure out what actually triggers dual-tex
1568 if (ctrl.result_type == 9) {
1569 struct bifrost_dual_tex_ctrl dualCtrl;
1570 memcpy((char *) &dualCtrl, (char *) &controlBits, sizeof(ctrl));
1571 fprintf(fp, "(dualtex) tex0:%d samp0:%d tex1:%d samp1:%d ",
1572 dualCtrl.tex_index0, dualCtrl.sampler_index0,
1573 dualCtrl.tex_index1, dualCtrl.sampler_index1);
1574 if (dualCtrl.unk0 != 3)
1575 fprintf(fp, "unk:%d ", dualCtrl.unk0);
1576 dualTex = true;
1577 } else {
1578 if (ctrl.no_merge_index) {
1579 tex_index = ctrl.tex_index;
1580 sampler_index = ctrl.sampler_index;
1581 } else {
1582 tex_index = sampler_index = ctrl.tex_index;
1583 unsigned unk = ctrl.sampler_index >> 2;
1584 if (unk != 3)
1585 fprintf(fp, "unk:%d ", unk);
1586 if (ctrl.sampler_index & 1)
1587 tex_index = -1;
1588 if (ctrl.sampler_index & 2)
1589 sampler_index = -1;
1590 }
1591
1592 if (ctrl.unk0 != 3)
1593 fprintf(fp, "unk0:%d ", ctrl.unk0);
1594 if (ctrl.unk1)
1595 fprintf(fp, "unk1 ");
1596 if (ctrl.unk2 != 0xf)
1597 fprintf(fp, "unk2:%x ", ctrl.unk2);
1598
1599 switch (ctrl.result_type) {
1600 case 0x4:
1601 fprintf(fp, "f32 ");
1602 break;
1603 case 0xe:
1604 fprintf(fp, "i32 ");
1605 break;
1606 case 0xf:
1607 fprintf(fp, "u32 ");
1608 break;
1609 default:
1610 fprintf(fp, "unktype(%x) ", ctrl.result_type);
1611 }
1612
1613 switch (ctrl.tex_type) {
1614 case 0:
1615 fprintf(fp, "cube ");
1616 break;
1617 case 1:
1618 fprintf(fp, "buffer ");
1619 break;
1620 case 2:
1621 fprintf(fp, "2D ");
1622 break;
1623 case 3:
1624 fprintf(fp, "3D ");
1625 break;
1626 }
1627
1628 if (ctrl.is_shadow)
1629 fprintf(fp, "shadow ");
1630 if (ctrl.is_array)
1631 fprintf(fp, "array ");
1632
1633 if (!ctrl.filter) {
1634 if (ctrl.calc_gradients) {
1635 int comp = (controlBits >> 20) & 0x3;
1636 fprintf(fp, "txg comp:%d ", comp);
1637 } else {
1638 fprintf(fp, "txf ");
1639 }
1640 } else {
1641 if (!ctrl.not_supply_lod) {
1642 if (ctrl.compute_lod)
1643 fprintf(fp, "lod_bias ");
1644 else
1645 fprintf(fp, "lod ");
1646 }
1647
1648 if (!ctrl.calc_gradients)
1649 fprintf(fp, "grad ");
1650 }
1651
1652 if (ctrl.texel_offset)
1653 fprintf(fp, "offset ");
1654 }
1655 }
1656
1657 if (!dualTex) {
1658 if (tex_index == -1)
1659 fprintf(fp, "tex:indirect ");
1660 else
1661 fprintf(fp, "tex:%d ", tex_index);
1662
1663 if (sampler_index == -1)
1664 fprintf(fp, "samp:indirect ");
1665 else
1666 fprintf(fp, "samp:%d ", sampler_index);
1667 }
1668 break;
1669 }
1670 case ADD_VARYING_INTERP: {
1671 unsigned addr = ADD.op & 0x1f;
1672 if (addr < 0b10100) {
1673 // direct addr
1674 fprintf(fp, "%d", addr);
1675 } else if (addr < 0b11000) {
1676 if (addr == 22)
1677 fprintf(fp, "fragw");
1678 else if (addr == 23)
1679 fprintf(fp, "fragz");
1680 else
1681 fprintf(fp, "unk%d", addr);
1682 } else {
1683 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1684 }
1685 fprintf(fp, ", ");
1686 dump_src(fp, ADD.src0, regs, consts, false);
1687 break;
1688 }
1689 case ADD_VARYING_ADDRESS: {
1690 dump_src(fp, ADD.src0, regs, consts, false);
1691 fprintf(fp, ", ");
1692 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1693 fprintf(fp, ", ");
1694 unsigned location = (ADD.op >> 3) & 0x1f;
1695 if (location < 16) {
1696 fprintf(fp, "location:%d", location);
1697 } else if (location == 20) {
1698 fprintf(fp, "location:%u", (uint32_t) get_const(consts, regs));
1699 } else if (location == 21) {
1700 fprintf(fp, "location:%u", (uint32_t) (get_const(consts, regs) >> 32));
1701 } else {
1702 fprintf(fp, "location:%d(unk)", location);
1703 }
1704 break;
1705 }
1706 case ADD_LOAD_ATTR:
1707 fprintf(fp, "location:%d, ", (ADD.op >> 3) & 0x1f);
1708 case ADD_TWO_SRC:
1709 dump_src(fp, ADD.src0, regs, consts, false);
1710 fprintf(fp, ", ");
1711 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1712 break;
1713 case ADD_THREE_SRC:
1714 dump_src(fp, ADD.src0, regs, consts, false);
1715 fprintf(fp, ", ");
1716 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1717 fprintf(fp, ", ");
1718 dump_src(fp, (ADD.op >> 3) & 0x7, regs, consts, false);
1719 break;
1720 case ADD_SHIFT: {
1721 struct bifrost_shift_add shift;
1722 memcpy(&shift, &ADD, sizeof(ADD));
1723 dump_src(fp, shift.src0, regs, consts, false);
1724 fprintf(fp, ", ");
1725 dump_src(fp, shift.src1, regs, consts, false);
1726 fprintf(fp, ", ");
1727 dump_src(fp, shift.src2, regs, consts, false);
1728 break;
1729 }
1730 case ADD_FADD:
1731 case ADD_FMINMAX:
1732 if (ADD.op & 0x10)
1733 fprintf(fp, "-");
1734 if (ADD.op & 0x1000)
1735 fprintf(fp, "abs(");
1736 dump_src(fp, ADD.src0, regs, consts, false);
1737 switch ((ADD.op >> 6) & 0x3) {
1738 case 3:
1739 fprintf(fp, ".x");
1740 break;
1741 default:
1742 break;
1743 }
1744 if (ADD.op & 0x1000)
1745 fprintf(fp, ")");
1746 fprintf(fp, ", ");
1747 if (ADD.op & 0x20)
1748 fprintf(fp, "-");
1749 if (ADD.op & 0x8)
1750 fprintf(fp, "abs(");
1751 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1752 switch ((ADD.op >> 6) & 0x3) {
1753 case 1:
1754 case 3:
1755 fprintf(fp, ".x");
1756 break;
1757 case 2:
1758 fprintf(fp, ".y");
1759 break;
1760 case 0:
1761 break;
1762 default:
1763 fprintf(fp, ".unk");
1764 break;
1765 }
1766 if (ADD.op & 0x8)
1767 fprintf(fp, ")");
1768 break;
1769 case ADD_FADD16:
1770 if (ADD.op & 0x10)
1771 fprintf(fp, "-");
1772 if (ADD.op & 0x1000)
1773 fprintf(fp, "abs(");
1774 dump_src(fp, ADD.src0, regs, consts, false);
1775 if (ADD.op & 0x1000)
1776 fprintf(fp, ")");
1777 dump_16swizzle(fp, (ADD.op >> 6) & 0x3);
1778 fprintf(fp, ", ");
1779 if (ADD.op & 0x20)
1780 fprintf(fp, "-");
1781 if (ADD.op & 0x8)
1782 fprintf(fp, "abs(");
1783 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1784 dump_16swizzle(fp, (ADD.op >> 8) & 0x3);
1785 if (ADD.op & 0x8)
1786 fprintf(fp, ")");
1787 break;
1788 case ADD_FMINMAX16: {
1789 bool abs1 = ADD.op & 0x8;
1790 bool abs2 = (ADD.op & 0x7) < ADD.src0;
1791 if (ADD.op & 0x10)
1792 fprintf(fp, "-");
1793 if (abs1 || abs2)
1794 fprintf(fp, "abs(");
1795 dump_src(fp, ADD.src0, regs, consts, false);
1796 dump_16swizzle(fp, (ADD.op >> 6) & 0x3);
1797 if (abs1 || abs2)
1798 fprintf(fp, ")");
1799 fprintf(fp, ", ");
1800 if (ADD.op & 0x20)
1801 fprintf(fp, "-");
1802 if (abs1 && abs2)
1803 fprintf(fp, "abs(");
1804 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1805 dump_16swizzle(fp, (ADD.op >> 8) & 0x3);
1806 if (abs1 && abs2)
1807 fprintf(fp, ")");
1808 break;
1809 }
1810 case ADD_FADDMscale: {
1811 if (ADD.op & 0x400)
1812 fprintf(fp, "-");
1813 if (ADD.op & 0x200)
1814 fprintf(fp, "abs(");
1815 dump_src(fp, ADD.src0, regs, consts, false);
1816 if (ADD.op & 0x200)
1817 fprintf(fp, ")");
1818
1819 fprintf(fp, ", ");
1820
1821 if (ADD.op & 0x800)
1822 fprintf(fp, "-");
1823 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1824
1825 fprintf(fp, ", ");
1826
1827 dump_src(fp, (ADD.op >> 3) & 0x7, regs, consts, false);
1828 break;
1829 }
1830 case ADD_FCMP:
1831 if (ADD.op & 0x400) {
1832 fprintf(fp, "-");
1833 }
1834 if (ADD.op & 0x100) {
1835 fprintf(fp, "abs(");
1836 }
1837 dump_src(fp, ADD.src0, regs, consts, false);
1838 switch ((ADD.op >> 6) & 0x3) {
1839 case 3:
1840 fprintf(fp, ".x");
1841 break;
1842 default:
1843 break;
1844 }
1845 if (ADD.op & 0x100) {
1846 fprintf(fp, ")");
1847 }
1848 fprintf(fp, ", ");
1849 if (ADD.op & 0x200) {
1850 fprintf(fp, "abs(");
1851 }
1852 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1853 switch ((ADD.op >> 6) & 0x3) {
1854 case 1:
1855 case 3:
1856 fprintf(fp, ".x");
1857 break;
1858 case 2:
1859 fprintf(fp, ".y");
1860 break;
1861 case 0:
1862 break;
1863 default:
1864 fprintf(fp, ".unk");
1865 break;
1866 }
1867 if (ADD.op & 0x200) {
1868 fprintf(fp, ")");
1869 }
1870 break;
1871 case ADD_FCMP16:
1872 dump_src(fp, ADD.src0, regs, consts, false);
1873 dump_16swizzle(fp, (ADD.op >> 6) & 0x3);
1874 fprintf(fp, ", ");
1875 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1876 dump_16swizzle(fp, (ADD.op >> 8) & 0x3);
1877 break;
1878 case ADD_BRANCH: {
1879 enum branch_code code = (enum branch_code) ((ADD.op >> 6) & 0x3f);
1880 enum branch_bit_size size = (enum branch_bit_size) ((ADD.op >> 9) & 0x7);
1881 if (code != BR_ALWAYS) {
1882 dump_src(fp, ADD.src0, regs, consts, false);
1883 switch (size) {
1884 case BR_SIZE_16XX:
1885 fprintf(fp, ".x");
1886 break;
1887 case BR_SIZE_16YY:
1888 case BR_SIZE_16YX0:
1889 case BR_SIZE_16YX1:
1890 fprintf(fp, ".y");
1891 break;
1892 case BR_SIZE_ZERO: {
1893 unsigned ctrl = (ADD.op >> 1) & 0x3;
1894 switch (ctrl) {
1895 case 1:
1896 fprintf(fp, ".y");
1897 break;
1898 case 2:
1899 fprintf(fp, ".x");
1900 break;
1901 default:
1902 break;
1903 }
1904 }
1905 default:
1906 break;
1907 }
1908 fprintf(fp, ", ");
1909 }
1910 if (code != BR_ALWAYS && size != BR_SIZE_ZERO) {
1911 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1912 switch (size) {
1913 case BR_SIZE_16XX:
1914 case BR_SIZE_16YX0:
1915 case BR_SIZE_16YX1:
1916 case BR_SIZE_32_AND_16X:
1917 fprintf(fp, ".x");
1918 break;
1919 case BR_SIZE_16YY:
1920 case BR_SIZE_32_AND_16Y:
1921 fprintf(fp, ".y");
1922 break;
1923 default:
1924 break;
1925 }
1926 fprintf(fp, ", ");
1927 }
1928 // I haven't had the chance to test if this actually specifies the
1929 // branch offset, since I couldn't get it to produce values other
1930 // than 5 (uniform/const high), but these three bits are always
1931 // consistent across branch instructions, so it makes sense...
1932 int offsetSrc = (ADD.op >> 3) & 0x7;
1933 if (offsetSrc == 4 || offsetSrc == 5) {
1934 // If the offset is known/constant, we can decode it
1935 uint32_t raw_offset;
1936 if (offsetSrc == 4)
1937 raw_offset = get_const(consts, regs);
1938 else
1939 raw_offset = get_const(consts, regs) >> 32;
1940 // The high 4 bits are flags, while the rest is the
1941 // twos-complement offset in bytes (here we convert to
1942 // clauses).
1943 int32_t branch_offset = ((int32_t) raw_offset << 4) >> 8;
1944
1945 // If high4 is the high 4 bits of the last 64-bit constant,
1946 // this is calculated as (high4 + 4) & 0xf, or 0 if the branch
1947 // offset itself is the last constant. Not sure if this is
1948 // actually used, or just garbage in unused bits, but in any
1949 // case, we can just ignore it here since it's redundant. Note
1950 // that if there is any padding, this will be 4 since the
1951 // padding counts as the last constant.
1952 unsigned flags = raw_offset >> 28;
1953 (void) flags;
1954
1955 // Note: the offset is in bytes, relative to the beginning of the
1956 // current clause, so a zero offset would be a loop back to the
1957 // same clause (annoyingly different from Midgard).
1958 fprintf(fp, "clause_%d", offset + branch_offset);
1959 } else {
1960 dump_src(fp, offsetSrc, regs, consts, false);
1961 }
1962 }
1963 }
1964 if (info.has_data_reg) {
1965 fprintf(fp, ", R%d", data_reg);
1966 }
1967 fprintf(fp, "\n");
1968 }
1969
1970 void dump_instr(FILE *fp, const struct bifrost_alu_inst *instr,
1971 struct bifrost_regs next_regs, uint64_t *consts,
1972 unsigned data_reg, unsigned offset, bool verbose)
1973 {
1974 struct bifrost_regs regs;
1975 memcpy((char *) &regs, (char *) &instr->reg_bits, sizeof(regs));
1976
1977 if (verbose) {
1978 fprintf(fp, "# regs: %016" PRIx64 "\n", instr->reg_bits);
1979 dump_regs(fp, regs);
1980 }
1981 dump_fma(fp, instr->fma_bits, regs, next_regs, consts, verbose);
1982 dump_add(fp, instr->add_bits, regs, next_regs, consts, data_reg, offset, verbose);
1983 }
1984
1985 bool dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offset, bool verbose)
1986 {
1987 // State for a decoded clause
1988 struct bifrost_alu_inst instrs[8] = {};
1989 uint64_t consts[6] = {};
1990 unsigned num_instrs = 0;
1991 unsigned num_consts = 0;
1992 uint64_t header_bits = 0;
1993 bool stopbit = false;
1994
1995 unsigned i;
1996 for (i = 0; ; i++, words += 4) {
1997 if (verbose) {
1998 fprintf(fp, "# ");
1999 for (int j = 0; j < 4; j++)
2000 fprintf(fp, "%08x ", words[3 - j]); // low bit on the right
2001 fprintf(fp, "\n");
2002 }
2003 unsigned tag = bits(words[0], 0, 8);
2004
2005 // speculatively decode some things that are common between many formats, so we can share some code
2006 struct bifrost_alu_inst main_instr = {};
2007 // 20 bits
2008 main_instr.add_bits = bits(words[2], 2, 32 - 13);
2009 // 23 bits
2010 main_instr.fma_bits = bits(words[1], 11, 32) | bits(words[2], 0, 2) << (32 - 11);
2011 // 35 bits
2012 main_instr.reg_bits = ((uint64_t) bits(words[1], 0, 11)) << 24 | (uint64_t) bits(words[0], 8, 32);
2013
2014 uint64_t const0 = bits(words[0], 8, 32) << 4 | (uint64_t) words[1] << 28 | bits(words[2], 0, 4) << 60;
2015 uint64_t const1 = bits(words[2], 4, 32) << 4 | (uint64_t) words[3] << 32;
2016
2017 bool stop = tag & 0x40;
2018
2019 if (verbose) {
2020 fprintf(fp, "# tag: 0x%02x\n", tag);
2021 }
2022 if (tag & 0x80) {
2023 unsigned idx = stop ? 5 : 2;
2024 main_instr.add_bits |= ((tag >> 3) & 0x7) << 17;
2025 instrs[idx + 1] = main_instr;
2026 instrs[idx].add_bits = bits(words[3], 0, 17) | ((tag & 0x7) << 17);
2027 instrs[idx].fma_bits |= bits(words[2], 19, 32) << 10;
2028 consts[0] = bits(words[3], 17, 32) << 4;
2029 } else {
2030 bool done = false;
2031 switch ((tag >> 3) & 0x7) {
2032 case 0x0:
2033 switch (tag & 0x7) {
2034 case 0x3:
2035 main_instr.add_bits |= bits(words[3], 29, 32) << 17;
2036 instrs[1] = main_instr;
2037 num_instrs = 2;
2038 done = stop;
2039 break;
2040 case 0x4:
2041 instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
2042 instrs[2].fma_bits |= bits(words[2], 19, 32) << 10;
2043 consts[0] = const0;
2044 num_instrs = 3;
2045 num_consts = 1;
2046 done = stop;
2047 break;
2048 case 0x1:
2049 case 0x5:
2050 instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
2051 instrs[2].fma_bits |= bits(words[2], 19, 32) << 10;
2052 main_instr.add_bits |= bits(words[3], 26, 29) << 17;
2053 instrs[3] = main_instr;
2054 if ((tag & 0x7) == 0x5) {
2055 num_instrs = 4;
2056 done = stop;
2057 }
2058 break;
2059 case 0x6:
2060 instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
2061 instrs[5].fma_bits |= bits(words[2], 19, 32) << 10;
2062 consts[0] = const0;
2063 num_instrs = 6;
2064 num_consts = 1;
2065 done = stop;
2066 break;
2067 case 0x7:
2068 instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
2069 instrs[5].fma_bits |= bits(words[2], 19, 32) << 10;
2070 main_instr.add_bits |= bits(words[3], 26, 29) << 17;
2071 instrs[6] = main_instr;
2072 num_instrs = 7;
2073 done = stop;
2074 break;
2075 default:
2076 fprintf(fp, "unknown tag bits 0x%02x\n", tag);
2077 }
2078 break;
2079 case 0x2:
2080 case 0x3: {
2081 unsigned idx = ((tag >> 3) & 0x7) == 2 ? 4 : 7;
2082 main_instr.add_bits |= (tag & 0x7) << 17;
2083 instrs[idx] = main_instr;
2084 consts[0] |= (bits(words[2], 19, 32) | ((uint64_t) words[3] << 13)) << 19;
2085 num_consts = 1;
2086 num_instrs = idx + 1;
2087 done = stop;
2088 break;
2089 }
2090 case 0x4: {
2091 unsigned idx = stop ? 4 : 1;
2092 main_instr.add_bits |= (tag & 0x7) << 17;
2093 instrs[idx] = main_instr;
2094 instrs[idx + 1].fma_bits |= bits(words[3], 22, 32);
2095 instrs[idx + 1].reg_bits = bits(words[2], 19, 32) | (bits(words[3], 0, 22) << (32 - 19));
2096 break;
2097 }
2098 case 0x1:
2099 // only constants can come after this
2100 num_instrs = 1;
2101 done = stop;
2102 case 0x5:
2103 header_bits = bits(words[2], 19, 32) | ((uint64_t) words[3] << (32 - 19));
2104 main_instr.add_bits |= (tag & 0x7) << 17;
2105 instrs[0] = main_instr;
2106 break;
2107 case 0x6:
2108 case 0x7: {
2109 unsigned pos = tag & 0xf;
2110 // note that `pos' encodes both the total number of
2111 // instructions and the position in the constant stream,
2112 // presumably because decoded constants and instructions
2113 // share a buffer in the decoder, but we only care about
2114 // the position in the constant stream; the total number of
2115 // instructions is redundant.
2116 unsigned const_idx = 0;
2117 switch (pos) {
2118 case 0:
2119 case 1:
2120 case 2:
2121 case 6:
2122 const_idx = 0;
2123 break;
2124 case 3:
2125 case 4:
2126 case 7:
2127 case 9:
2128 const_idx = 1;
2129 break;
2130 case 5:
2131 case 0xa:
2132 const_idx = 2;
2133 break;
2134 case 8:
2135 case 0xb:
2136 case 0xc:
2137 const_idx = 3;
2138 break;
2139 case 0xd:
2140 const_idx = 4;
2141 break;
2142 default:
2143 fprintf(fp, "# unknown pos 0x%x\n", pos);
2144 break;
2145 }
2146
2147 if (num_consts < const_idx + 2)
2148 num_consts = const_idx + 2;
2149
2150 consts[const_idx] = const0;
2151 consts[const_idx + 1] = const1;
2152 done = stop;
2153 break;
2154 }
2155 default:
2156 break;
2157 }
2158
2159 if (done)
2160 break;
2161 }
2162 }
2163
2164 *size = i + 1;
2165
2166 if (verbose) {
2167 fprintf(fp, "# header: %012" PRIx64 "\n", header_bits);
2168 }
2169
2170 struct bifrost_header header;
2171 memcpy((char *) &header, (char *) &header_bits, sizeof(struct bifrost_header));
2172 dump_header(fp, header, verbose);
2173 if (!header.no_end_of_shader)
2174 stopbit = true;
2175
2176 fprintf(fp, "{\n");
2177 for (i = 0; i < num_instrs; i++) {
2178 struct bifrost_regs next_regs;
2179 if (i + 1 == num_instrs) {
2180 memcpy((char *) &next_regs, (char *) &instrs[0].reg_bits,
2181 sizeof(next_regs));
2182 } else {
2183 memcpy((char *) &next_regs, (char *) &instrs[i + 1].reg_bits,
2184 sizeof(next_regs));
2185 }
2186
2187 dump_instr(fp, &instrs[i], next_regs, consts, header.datareg, offset, verbose);
2188 }
2189 fprintf(fp, "}\n");
2190
2191 if (verbose) {
2192 for (unsigned i = 0; i < num_consts; i++) {
2193 fprintf(fp, "# const%d: %08" PRIx64 "\n", 2 * i, consts[i] & 0xffffffff);
2194 fprintf(fp, "# const%d: %08" PRIx64 "\n", 2 * i + 1, consts[i] >> 32);
2195 }
2196 }
2197 return stopbit;
2198 }
2199
/*
 * Disassemble a Bifrost shader binary, writing the textual form to `fp`.
 *
 * fp:      stream that receives the disassembly.
 * code:    start of the shader binary; it is read as a sequence of 32-bit
 *          words grouped into 4-word (128-bit) quadwords.
 *          NOTE(review): the pointer is reinterpreted as uint32_t *, so it is
 *          assumed to be suitably aligned -- confirm against callers.
 * size:    length of `code` in bytes. Trailing bytes that do not make up a
 *          whole quadword are ignored.
 * verbose: forwarded unchanged to dump_clause().
 *
 * Disassembly stops at the first all-zero quadword, when dump_clause()
 * reports that the clause was the last one, or when the buffer is exhausted.
 */
void disassemble_bifrost(FILE *fp, uint8_t *code, size_t size, bool verbose)
{
        uint32_t *words = (uint32_t *) code;
        // number of 32-bit words that have not been consumed yet
        size_t words_left = size / sizeof(uint32_t);
        // used for displaying branch targets; counted in quadwords
        unsigned offset = 0;

        // Require a full quadword before touching the buffer. The previous
        // `words != words_end` test never terminated once the cursor stepped
        // past the end (size not a multiple of 16, or an oversized clause).
        while (words_left >= 4) {
                // we don't know what the program-end bit is quite yet, so for
                // now just assume that an all-0 quadword is padding
                static const uint32_t zero[4] = {0};
                if (memcmp(words, zero, sizeof(zero)) == 0)
                        break;

                fprintf(fp, "clause_%d:\n", offset);

                // clause length in quadwords, as reported by dump_clause()
                unsigned clause_size;
                if (dump_clause(fp, words, &clause_size, offset, verbose))
                        break;

                // Never advance the cursor beyond the end of the buffer: a
                // clause that claims more quadwords than remain is malformed.
                size_t consumed = (size_t) clause_size * 4;
                if (consumed > words_left)
                        break;

                words += consumed;
                words_left -= consumed;
                offset += clause_size;
        }
}
2221