pan/bi: Add v4i8 mode to FMA_SHIFT
[mesa.git] / src / panfrost / bifrost / disassemble.c
1 /*
2 * Copyright (C) 2019 Connor Abbott <cwabbott0@gmail.com>
3 * Copyright (C) 2019 Lyude Paul <thatslyude@gmail.com>
4 * Copyright (C) 2019 Ryan Houdek <Sonicadvance1@gmail.com>
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 * SOFTWARE.
24 */
25
26 #include <stdbool.h>
27 #include <stdio.h>
28 #include <stdint.h>
29 #include <assert.h>
30 #include <inttypes.h>
31 #include <string.h>
32
33 #include "bifrost.h"
34 #include "bifrost_ops.h"
35 #include "disassemble.h"
36 #include "util/macros.h"
37
// Return the bit field [lo, high) of word, i.e. bits lo .. high-1 inclusive,
// shifted down so that bit `lo` lands in bit 0. `high` may be at most 32.
static uint64_t bits(uint32_t word, unsigned lo, unsigned high)
{
        // Build the mask as a 64-bit unsigned value: the previous int-typed
        // `1 << high` overflowed (undefined behaviour) for high == 31 and
        // needed a special case for high == 32.
        const uint64_t mask = (UINT64_C(1) << high) - 1;

        return (word & mask) >> lo;
}
45
// each of these structs represents an instruction that's dispatched in one
// cycle. Note that these instructions are packed in funny ways within the
// clause, hence the need for a separate struct.
//
// NOTE(review): the field names suggest the raw FMA-unit bits, the raw
// ADD-unit bits and the register-port bits (cf. struct bifrost_regs) of that
// cycle -- confirm against the clause unpacking code, which is not shown here.
struct bifrost_alu_inst {
        uint32_t fma_bits;
        uint32_t add_bits;
        uint64_t reg_bits;
};
54
// Register-port block of one instruction (field widths sum to 35 bits).
// Do not reorder the fields: the struct is a bit-level layout.
struct bifrost_regs {
        // Uniform/constant selector: bit 7 set selects a uniform pair, values
        // >= 0x20 select an embedded constant (see get_const()), smaller
        // values are special sources (see dump_uniform_const_src()).
        unsigned uniform_const : 8;
        // Register written through port 2 (see GetRegToWrite()).
        unsigned reg2 : 6;
        // Register read or written through port 3.
        unsigned reg3 : 6;
        // Port 0 register, decoded by get_reg0(); only 5 bits wide.
        unsigned reg0 : 5;
        // Port 1 register, decoded by get_reg1(). When ctrl == 0 this field is
        // reused: bit 0 extends reg0, bit 1 disables the port 0 read and the
        // upper four bits carry the control value (see DecodeRegCtrl()).
        unsigned reg1 : 6;
        // Port control mode, decoded by DecodeRegCtrl().
        unsigned ctrl : 4;
};
63
64 static unsigned get_reg0(struct bifrost_regs regs)
65 {
66 if (regs.ctrl == 0)
67 return regs.reg0 | ((regs.reg1 & 0x1) << 5);
68
69 return regs.reg0 <= regs.reg1 ? regs.reg0 : 63 - regs.reg0;
70 }
71
72 static unsigned get_reg1(struct bifrost_regs regs)
73 {
74 return regs.reg0 <= regs.reg1 ? regs.reg1 : 63 - regs.reg1;
75 }
76
// Which register field, if any, names the destination of a unit's result.
// GetRegToWrite() maps REG_WRITE_TWO/REG_WRITE_THREE to regs.reg2/regs.reg3.
enum bifrost_reg_write_unit {
        REG_WRITE_NONE = 0, // don't write
        REG_WRITE_TWO, // write using reg2
        REG_WRITE_THREE, // write using reg3
};
82
// this represents the decoded version of the ctrl register field.
// Produced by DecodeRegCtrl().
struct bifrost_reg_ctrl {
        bool read_reg0; // port 0 is read (register from get_reg0())
        bool read_reg1; // port 1 is read (register from get_reg1())
        bool read_reg3; // port 3 is used as a read port (regs.reg3)
        enum bifrost_reg_write_unit fma_write_unit; // where the FMA result is written
        enum bifrost_reg_write_unit add_write_unit; // where the ADD result is written
        bool clause_start; // set by control values 8, 9, 12 and 13
};
92
// Operand-layout class of an FMA opcode. It selects both the opcode mask used
// when matching in find_fma_op_info() and the modifier/operand printing path
// in dump_fma().
enum fma_src_type {
        FMA_ONE_SRC,
        FMA_TWO_SRC,
        FMA_FADD,
        FMA_FMINMAX,
        FMA_FADD16,
        FMA_FMINMAX16,
        FMA_FCMP,
        FMA_FCMP16,
        FMA_THREE_SRC,
        FMA_SHIFT,
        FMA_FMA,
        FMA_FMA16,
        FMA_CSEL4,
        FMA_FMA_MSCALE,
        FMA_SHIFT_ADD64,
};
110
// One entry of the FMA opcode table (FMAOpInfos); also the return type of
// find_fma_op_info().
struct fma_op_info {
        unsigned op; // fixed opcode bits, compared after masking by src_type
        char name[30]; // mnemonic printed by dump_fma(), NUL-terminated
        enum fma_src_type src_type; // operand-layout class
};
116
// Operand-layout class attached to each entry of the ADD opcode table
// (add_op_infos).
enum add_src_type {
        ADD_ONE_SRC,
        ADD_TWO_SRC,
        ADD_FADD,
        ADD_FMINMAX,
        ADD_FADD16,
        ADD_FMINMAX16,
        ADD_THREE_SRC,
        ADD_FADDMscale,
        ADD_FCMP,
        ADD_FCMP16,
        ADD_TEX_COMPACT, // texture instruction with embedded sampler
        ADD_TEX, // texture instruction with sampler/etc. in uniform port
        ADD_VARYING_INTERP,
        ADD_BLENDING,
        ADD_LOAD_ATTR,
        ADD_VARYING_ADDRESS,
        ADD_BRANCH,
};
136
// One entry of the ADD opcode table (add_op_infos).
struct add_op_info {
        unsigned op; // opcode value of the entry
        char name[30]; // mnemonic, NUL-terminated
        enum add_src_type src_type; // operand-layout class
        // Set for the LD_ATTR entries in the table; omitted (false) elsewhere.
        // NOTE(review): presumably marks instructions that use the clause's
        // data register -- confirm against the ADD decoder.
        bool has_data_reg;
};
143
// Control word of a texture instruction; the field widths sum to 32 bits.
// Do not reorder the fields: the struct is a bit-level layout.
struct bifrost_tex_ctrl {
        unsigned sampler_index : 4; // also used to signal indirects
        unsigned tex_index : 7;
        bool no_merge_index : 1; // whether to merge (direct) sampler & texture indices
        bool filter : 1; // use the usual filtering pipeline (0 for texelFetch & textureGather)
        unsigned unk0 : 2;
        bool texel_offset : 1; // *Offset()
        bool is_shadow : 1;
        bool is_array : 1;
        unsigned tex_type : 2; // 2D, 3D, Cube, Buffer
        bool compute_lod : 1; // 0 for *Lod()
        bool not_supply_lod : 1; // 0 for *Lod() or when a bias is applied
        bool calc_gradients : 1; // 0 for *Grad()
        unsigned unk1 : 1;
        unsigned result_type : 4; // integer, unsigned, float TODO: why is this 4 bits?
        unsigned unk2 : 4;
};
161
// Control word carrying two sampler/texture index pairs; the field widths sum
// to 32 bits. Do not reorder the fields: the struct is a bit-level layout.
struct bifrost_dual_tex_ctrl {
        unsigned sampler_index0 : 2;
        unsigned unk0 : 2;
        unsigned tex_index0 : 2;
        unsigned sampler_index1 : 2;
        unsigned tex_index1 : 2;
        unsigned unk1 : 22;
};
170
// Operand size/component selection of a branch comparison. The numeric values
// are explicit and must not be changed.
enum branch_bit_size {
        BR_SIZE_32 = 0,
        BR_SIZE_16XX = 1,
        BR_SIZE_16YY = 2,
        // For the above combinations of bitsize and location, an extra bit is
        // encoded via comparing the sources. The only possible source of ambiguity
        // would be if the sources were the same, but then the branch condition
        // would be always true or always false anyways, so we can ignore it. But
        // this no longer works when comparing the y component to the x component,
        // since it's valid to compare the y component of a source against its own
        // x component. Instead, the extra bit is encoded via an extra bitsize.
        BR_SIZE_16YX0 = 3,
        BR_SIZE_16YX1 = 4,
        BR_SIZE_32_AND_16X = 5,
        BR_SIZE_32_AND_16Y = 6,
        // Used for comparisons with zero and always-true, see below. I think this
        // only works for integer comparisons.
        BR_SIZE_ZERO = 7,
};
190
// Prototypes for the non-static dump functions, so that each has a prior
// declaration before its definition. All of them write their text to fp;
// `verbose` additionally enables "# ..." annotation output.
void dump_header(FILE *fp, struct bifrost_header header, bool verbose);
void dump_instr(FILE *fp, const struct bifrost_alu_inst *instr,
                struct bifrost_regs next_regs, uint64_t *consts,
                unsigned data_reg, unsigned offset, bool verbose);
bool dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offset, bool verbose);
196
197 void dump_header(FILE *fp, struct bifrost_header header, bool verbose)
198 {
199 if (header.clause_type != 0) {
200 fprintf(fp, "id(%du) ", header.scoreboard_index);
201 }
202
203 if (header.scoreboard_deps != 0) {
204 fprintf(fp, "next-wait(");
205 bool first = true;
206 for (unsigned i = 0; i < 8; i++) {
207 if (header.scoreboard_deps & (1 << i)) {
208 if (!first) {
209 fprintf(fp, ", ");
210 }
211 fprintf(fp, "%d", i);
212 first = false;
213 }
214 }
215 fprintf(fp, ") ");
216 }
217
218 if (header.datareg_writebarrier)
219 fprintf(fp, "data-reg-barrier ");
220
221 if (!header.no_end_of_shader)
222 fprintf(fp, "eos ");
223
224 if (!header.back_to_back) {
225 fprintf(fp, "nbb ");
226 if (header.branch_cond)
227 fprintf(fp, "branch-cond ");
228 else
229 fprintf(fp, "branch-uncond ");
230 }
231
232 if (header.elide_writes)
233 fprintf(fp, "we ");
234
235 if (header.suppress_inf)
236 fprintf(fp, "suppress-inf ");
237 if (header.suppress_nan)
238 fprintf(fp, "suppress-nan ");
239
240 if (header.unk0)
241 fprintf(fp, "unk0 ");
242 if (header.unk1)
243 fprintf(fp, "unk1 ");
244 if (header.unk2)
245 fprintf(fp, "unk2 ");
246 if (header.unk3)
247 fprintf(fp, "unk3 ");
248 if (header.unk4)
249 fprintf(fp, "unk4 ");
250
251 fprintf(fp, "\n");
252
253 if (verbose) {
254 fprintf(fp, "# clause type %d, next clause type %d\n",
255 header.clause_type, header.next_clause_type);
256 }
257 }
258
259 static struct bifrost_reg_ctrl DecodeRegCtrl(FILE *fp, struct bifrost_regs regs)
260 {
261 struct bifrost_reg_ctrl decoded = {};
262 unsigned ctrl;
263 if (regs.ctrl == 0) {
264 ctrl = regs.reg1 >> 2;
265 decoded.read_reg0 = !(regs.reg1 & 0x2);
266 decoded.read_reg1 = false;
267 } else {
268 ctrl = regs.ctrl;
269 decoded.read_reg0 = decoded.read_reg1 = true;
270 }
271 switch (ctrl) {
272 case 1:
273 decoded.fma_write_unit = REG_WRITE_TWO;
274 break;
275 case 2:
276 case 3:
277 decoded.fma_write_unit = REG_WRITE_TWO;
278 decoded.read_reg3 = true;
279 break;
280 case 4:
281 decoded.read_reg3 = true;
282 break;
283 case 5:
284 decoded.add_write_unit = REG_WRITE_TWO;
285 break;
286 case 6:
287 decoded.add_write_unit = REG_WRITE_TWO;
288 decoded.read_reg3 = true;
289 break;
290 case 8:
291 decoded.clause_start = true;
292 break;
293 case 9:
294 decoded.fma_write_unit = REG_WRITE_TWO;
295 decoded.clause_start = true;
296 break;
297 case 11:
298 break;
299 case 12:
300 decoded.read_reg3 = true;
301 decoded.clause_start = true;
302 break;
303 case 13:
304 decoded.add_write_unit = REG_WRITE_TWO;
305 decoded.clause_start = true;
306 break;
307
308 case 7:
309 case 15:
310 decoded.fma_write_unit = REG_WRITE_THREE;
311 decoded.add_write_unit = REG_WRITE_TWO;
312 break;
313 default:
314 fprintf(fp, "# unknown reg ctrl %d\n", ctrl);
315 }
316
317 return decoded;
318 }
319
320 // Pass in the add_write_unit or fma_write_unit, and this returns which register
321 // the ADD/FMA units are writing to
322 static unsigned GetRegToWrite(enum bifrost_reg_write_unit unit, struct bifrost_regs regs)
323 {
324 switch (unit) {
325 case REG_WRITE_TWO:
326 return regs.reg2;
327 case REG_WRITE_THREE:
328 return regs.reg3;
329 default: /* REG_WRITE_NONE */
330 assert(0);
331 return 0;
332 }
333 }
334
335 static void dump_regs(FILE *fp, struct bifrost_regs srcs)
336 {
337 struct bifrost_reg_ctrl ctrl = DecodeRegCtrl(fp, srcs);
338 fprintf(fp, "# ");
339 if (ctrl.read_reg0)
340 fprintf(fp, "port 0: R%d ", get_reg0(srcs));
341 if (ctrl.read_reg1)
342 fprintf(fp, "port 1: R%d ", get_reg1(srcs));
343
344 if (ctrl.fma_write_unit == REG_WRITE_TWO)
345 fprintf(fp, "port 2: R%d (write FMA) ", srcs.reg2);
346 else if (ctrl.add_write_unit == REG_WRITE_TWO)
347 fprintf(fp, "port 2: R%d (write ADD) ", srcs.reg2);
348
349 if (ctrl.fma_write_unit == REG_WRITE_THREE)
350 fprintf(fp, "port 3: R%d (write FMA) ", srcs.reg3);
351 else if (ctrl.add_write_unit == REG_WRITE_THREE)
352 fprintf(fp, "port 3: R%d (write ADD) ", srcs.reg3);
353 else if (ctrl.read_reg3)
354 fprintf(fp, "port 3: R%d (read) ", srcs.reg3);
355
356 if (srcs.uniform_const) {
357 if (srcs.uniform_const & 0x80) {
358 fprintf(fp, "uniform: U%d", (srcs.uniform_const & 0x7f) * 2);
359 }
360 }
361
362 fprintf(fp, "\n");
363 }
// Print a 32-bit immediate as eight hex digits, followed by a C-style comment
// showing the same bit pattern interpreted as a float.
static void dump_const_imm(FILE *fp, uint32_t imm)
{
        float as_float;

        // Reinterpret the bits without converting the value.
        memcpy(&as_float, &imm, sizeof(as_float));

        fprintf(fp, "0x%08x /* %f */", imm, as_float);
}
373
374 static uint64_t get_const(uint64_t *consts, struct bifrost_regs srcs)
375 {
376 unsigned low_bits = srcs.uniform_const & 0xf;
377 uint64_t imm;
378 switch (srcs.uniform_const >> 4) {
379 case 4:
380 imm = consts[0];
381 break;
382 case 5:
383 imm = consts[1];
384 break;
385 case 6:
386 imm = consts[2];
387 break;
388 case 7:
389 imm = consts[3];
390 break;
391 case 2:
392 imm = consts[4];
393 break;
394 case 3:
395 imm = consts[5];
396 break;
397 default:
398 assert(0);
399 break;
400 }
401 return imm | low_bits;
402 }
403
404 static void dump_uniform_const_src(FILE *fp, struct bifrost_regs srcs, uint64_t *consts, bool high32)
405 {
406 if (srcs.uniform_const & 0x80) {
407 unsigned uniform = (srcs.uniform_const & 0x7f) * 2;
408 fprintf(fp, "U%d", uniform + (high32 ? 1 : 0));
409 } else if (srcs.uniform_const >= 0x20) {
410 uint64_t imm = get_const(consts, srcs);
411 if (high32)
412 dump_const_imm(fp, imm >> 32);
413 else
414 dump_const_imm(fp, imm);
415 } else {
416 switch (srcs.uniform_const) {
417 case 0:
418 fprintf(fp, "0");
419 break;
420 case 5:
421 fprintf(fp, "atest-data");
422 break;
423 case 6:
424 fprintf(fp, "sample-ptr");
425 break;
426 case 8:
427 case 9:
428 case 10:
429 case 11:
430 case 12:
431 case 13:
432 case 14:
433 case 15:
434 fprintf(fp, "blend-descriptor%u", (unsigned) srcs.uniform_const - 8);
435 break;
436 default:
437 fprintf(fp, "unkConst%u", (unsigned) srcs.uniform_const);
438 break;
439 }
440
441 if (high32)
442 fprintf(fp, ".y");
443 else
444 fprintf(fp, ".x");
445 }
446 }
447
448 static void dump_src(FILE *fp, unsigned src, struct bifrost_regs srcs, uint64_t *consts, bool isFMA)
449 {
450 switch (src) {
451 case 0:
452 fprintf(fp, "R%d", get_reg0(srcs));
453 break;
454 case 1:
455 fprintf(fp, "R%d", get_reg1(srcs));
456 break;
457 case 2:
458 fprintf(fp, "R%d", srcs.reg3);
459 break;
460 case 3:
461 if (isFMA)
462 fprintf(fp, "0");
463 else
464 fprintf(fp, "T"); // i.e. the output of FMA this cycle
465 break;
466 case 4:
467 dump_uniform_const_src(fp, srcs, consts, false);
468 break;
469 case 5:
470 dump_uniform_const_src(fp, srcs, consts, true);
471 break;
472 case 6:
473 fprintf(fp, "T0");
474 break;
475 case 7:
476 fprintf(fp, "T1");
477 break;
478 }
479 }
480
// Print the output-modifier suffix for mod. Value 0 (no modifier) and any
// value above 3 print nothing.
static void dump_output_mod(FILE *fp, unsigned mod)
{
        static const char *const suffixes[] = {
                NULL,           // 0: result unmodified
                ".clamp_0_inf", // 1: max(out, 0)
                ".clamp_m1_1",  // 2: clamp(out, -1, 1)
                ".clamp_0_1",   // 3: clamp(out, 0, 1)
        };

        if (mod >= sizeof(suffixes) / sizeof(suffixes[0]))
                return;

        if (suffixes[mod] != NULL)
                fputs(suffixes[mod], fp);
}
499
// Print the min/max NaN-handling suffix for mod. Mode 0 and any value above 3
// print nothing.
static void dump_minmax_mode(FILE *fp, unsigned mod)
{
        static const char *const suffixes[] = {
                /* 0: Same as fmax() and fmin() -- return the other number if
                 * any number is NaN. Also always return +0 if one argument is
                 * +0 and the other is -0.
                 */
                NULL,

                /* 1: Instead of never returning a NaN, always return one. The
                 * "greater"/"lesser" NaN is always returned, first by checking
                 * the sign and then the mantissa bits.
                 */
                ".nan_wins",

                /* 2: For max, implement src0 > src1 ? src0 : src1
                 *    For min, implement src0 < src1 ? src0 : src1
                 *
                 * This includes handling NaN's and signedness of 0 differently
                 * from above, since +0 and -0 compare equal and comparisons
                 * always return false for NaN's. As a result, this mode is
                 * *not* commutative.
                 */
                ".src1_wins",

                /* 3: For max, implement src0 < src1 ? src1 : src0
                 *    For min, implement src0 > src1 ? src1 : src0
                 */
                ".src0_wins",
        };

        if (mod >= sizeof(suffixes) / sizeof(suffixes[0]))
                return;

        if (suffixes[mod] != NULL)
                fputs(suffixes[mod], fp);
}
537
// Print the rounding-mode suffix for mod. Mode 0 and any value above 3 print
// nothing.
static void dump_round_mode(FILE *fp, unsigned mod)
{
        static const char *const suffixes[] = {
                NULL,          /* 0: roundTiesToEven, the IEEE default. */
                ".round_pos",  /* 1: roundTowardPositive in the IEEE spec. */
                ".round_neg",  /* 2: roundTowardNegative in the IEEE spec. */
                ".round_zero", /* 3: roundTowardZero in the IEEE spec. */
        };

        if (mod >= sizeof(suffixes) / sizeof(suffixes[0]))
                return;

        if (suffixes[mod] != NULL)
                fputs(suffixes[mod], fp);
}
560
561 static const char *
562 csel_cond_name(enum bifrost_csel_cond cond)
563 {
564 switch (cond) {
565 case BIFROST_FEQ_F: return "feq.f";
566 case BIFROST_FGT_F: return "fgt.f";
567 case BIFROST_FGE_F: return "fge.f";
568 case BIFROST_IEQ_F: return "ieq.f";
569 case BIFROST_IGT_I: return "igt.i";
570 case BIFROST_IGE_I: return "uge.i";
571 case BIFROST_UGT_I: return "ugt.i";
572 case BIFROST_UGE_I: return "uge.i";
573 default: return "invalid";
574 }
575 }
576
// Table of the known FMA opcodes.
//
// find_fma_op_info() scans this table in order and returns the first entry
// whose `op` equals the instruction opcode with the operand/modifier bits of
// that entry's src_type masked off. `op` therefore holds only the fixed
// opcode bits, and the order of the entries is significant.
static const struct fma_op_info FMAOpInfos[] = {
        { 0x00000, "FMA.f32", FMA_FMA },
        { 0x40000, "MAX.f32", FMA_FMINMAX },
        { 0x44000, "MIN.f32", FMA_FMINMAX },
        { 0x48000, "FCMP.GL", FMA_FCMP },
        { 0x4c000, "FCMP.D3D", FMA_FCMP },
        { 0x4ff98, "ADD.i32", FMA_TWO_SRC },
        { 0x4ffd8, "SUB.i32", FMA_TWO_SRC },
        { 0x4fff0, "SUBB.i32", FMA_TWO_SRC },
        { 0x50000, "FMA_MSCALE", FMA_FMA_MSCALE },
        { 0x58000, "ADD.f32", FMA_FADD },
        { 0x5c000, "CSEL4", FMA_CSEL4 },
        { 0x5d8d0, "ICMP.D3D.GT.v2i16", FMA_TWO_SRC },
        { 0x5d9d0, "UCMP.D3D.GT.v2i16", FMA_TWO_SRC },
        { 0x5dad0, "ICMP.D3D.GE.v2i16", FMA_TWO_SRC },
        { 0x5dbd0, "UCMP.D3D.GE.v2i16", FMA_TWO_SRC },
        { 0x5dcd0, "ICMP.D3D.EQ.v2i16", FMA_TWO_SRC },
        { 0x5de40, "ICMP.GL.GT.i32", FMA_TWO_SRC }, // src0 > src1 ? 1 : 0
        { 0x5de48, "ICMP.GL.GE.i32", FMA_TWO_SRC },
        { 0x5de50, "UCMP.GL.GT.i32", FMA_TWO_SRC },
        { 0x5de58, "UCMP.GL.GE.i32", FMA_TWO_SRC },
        { 0x5de60, "ICMP.GL.EQ.i32", FMA_TWO_SRC },
        { 0x5dec0, "ICMP.D3D.GT.i32", FMA_TWO_SRC }, // src0 > src1 ? ~0 : 0
        { 0x5dec8, "ICMP.D3D.GE.i32", FMA_TWO_SRC },
        { 0x5ded0, "UCMP.D3D.GT.i32", FMA_TWO_SRC },
        { 0x5ded8, "UCMP.D3D.GE.i32", FMA_TWO_SRC },
        { 0x5dee0, "ICMP.D3D.EQ.i32", FMA_TWO_SRC },
        { 0x60000, "RSHIFT_NAND", FMA_SHIFT },
        { 0x61000, "RSHIFT_AND", FMA_SHIFT },
        { 0x62000, "LSHIFT_NAND", FMA_SHIFT },
        { 0x63000, "LSHIFT_AND", FMA_SHIFT }, // (src0 << src2) & src1
        { 0x64000, "RSHIFT_XOR", FMA_SHIFT },
        { 0x65200, "LSHIFT_ADD.i32", FMA_THREE_SRC },
        { 0x65600, "LSHIFT_SUB.i32", FMA_THREE_SRC }, // (src0 << src2) - src1
        { 0x65a00, "LSHIFT_RSUB.i32", FMA_THREE_SRC }, // src1 - (src0 << src2)
        { 0x65e00, "RSHIFT_ADD.i32", FMA_THREE_SRC },
        { 0x66200, "RSHIFT_SUB.i32", FMA_THREE_SRC },
        { 0x66600, "RSHIFT_RSUB.i32", FMA_THREE_SRC },
        { 0x66a00, "ARSHIFT_ADD.i32", FMA_THREE_SRC },
        { 0x66e00, "ARSHIFT_SUB.i32", FMA_THREE_SRC },
        { 0x67200, "ARSHIFT_RSUB.i32", FMA_THREE_SRC },
        { 0x80000, "FMA.v2f16", FMA_FMA16 },
        { 0xc0000, "MAX.v2f16", FMA_FMINMAX16 },
        { 0xc4000, "MIN.v2f16", FMA_FMINMAX16 },
        { 0xc8000, "FCMP.GL", FMA_FCMP16 },
        { 0xcc000, "FCMP.D3D", FMA_FCMP16 },
        { 0xcf900, "ADD.v2i16", FMA_TWO_SRC },
        { 0xcfc10, "ADDC.i32", FMA_TWO_SRC },
        { 0xcfd80, "ADD.i32.i16.X", FMA_TWO_SRC },
        { 0xcfd90, "ADD.i32.u16.X", FMA_TWO_SRC },
        { 0xcfdc0, "ADD.i32.i16.Y", FMA_TWO_SRC },
        { 0xcfdd0, "ADD.i32.u16.Y", FMA_TWO_SRC },
        { 0xd8000, "ADD.v2f16", FMA_FADD16 },
        { 0xdc000, "CSEL4.v16", FMA_CSEL4 },
        { 0xdd000, "F32_TO_F16", FMA_TWO_SRC },
        { 0xe0046, "F16_TO_I16.XX", FMA_ONE_SRC },
        { 0xe0047, "F16_TO_U16.XX", FMA_ONE_SRC },
        { 0xe004e, "F16_TO_I16.YX", FMA_ONE_SRC },
        { 0xe004f, "F16_TO_U16.YX", FMA_ONE_SRC },
        { 0xe0056, "F16_TO_I16.XY", FMA_ONE_SRC },
        { 0xe0057, "F16_TO_U16.XY", FMA_ONE_SRC },
        { 0xe005e, "F16_TO_I16.YY", FMA_ONE_SRC },
        { 0xe005f, "F16_TO_U16.YY", FMA_ONE_SRC },
        { 0xe00c0, "I16_TO_F16.XX", FMA_ONE_SRC },
        { 0xe00c1, "U16_TO_F16.XX", FMA_ONE_SRC },
        { 0xe00c8, "I16_TO_F16.YX", FMA_ONE_SRC },
        { 0xe00c9, "U16_TO_F16.YX", FMA_ONE_SRC },
        { 0xe00d0, "I16_TO_F16.XY", FMA_ONE_SRC },
        { 0xe00d1, "U16_TO_F16.XY", FMA_ONE_SRC },
        { 0xe00d8, "I16_TO_F16.YY", FMA_ONE_SRC },
        { 0xe00d9, "U16_TO_F16.YY", FMA_ONE_SRC },
        { 0xe0136, "F32_TO_I32", FMA_ONE_SRC },
        { 0xe0137, "F32_TO_U32", FMA_ONE_SRC },
        { 0xe0178, "I32_TO_F32", FMA_ONE_SRC },
        { 0xe0179, "U32_TO_F32", FMA_ONE_SRC },
        { 0xe0198, "I16_TO_I32.X", FMA_ONE_SRC },
        { 0xe0199, "U16_TO_U32.X", FMA_ONE_SRC },
        { 0xe019a, "I16_TO_I32.Y", FMA_ONE_SRC },
        { 0xe019b, "U16_TO_U32.Y", FMA_ONE_SRC },
        { 0xe019c, "I16_TO_F32.X", FMA_ONE_SRC },
        { 0xe019d, "U16_TO_F32.X", FMA_ONE_SRC },
        { 0xe019e, "I16_TO_F32.Y", FMA_ONE_SRC },
        { 0xe019f, "U16_TO_F32.Y", FMA_ONE_SRC },
        { 0xe01a2, "F16_TO_F32.X", FMA_ONE_SRC },
        { 0xe01a3, "F16_TO_F32.Y", FMA_ONE_SRC },
        { 0xe032c, "NOP", FMA_ONE_SRC },
        { 0xe032d, "MOV", FMA_ONE_SRC },
        { 0xe032f, "SWZ.YY.v2i16", FMA_ONE_SRC },
        { 0xe0345, "LOG_FREXPM", FMA_ONE_SRC },
        { 0xe0365, "FRCP_FREXPM", FMA_ONE_SRC },
        { 0xe0375, "FSQRT_FREXPM", FMA_ONE_SRC },
        { 0xe038d, "FRCP_FREXPE", FMA_ONE_SRC },
        { 0xe03a5, "FSQRT_FREXPE", FMA_ONE_SRC },
        { 0xe03ad, "FRSQ_FREXPE", FMA_ONE_SRC },
        { 0xe03c5, "LOG_FREXPE", FMA_ONE_SRC },
        { 0xe03fa, "CLZ", FMA_ONE_SRC },
        { 0xe0b80, "IMAX3", FMA_THREE_SRC },
        { 0xe0bc0, "UMAX3", FMA_THREE_SRC },
        { 0xe0c00, "IMIN3", FMA_THREE_SRC },
        { 0xe0c40, "UMIN3", FMA_THREE_SRC },
        { 0xe0ec5, "ROUND", FMA_ONE_SRC },
        { 0xe0f40, "CSEL", FMA_THREE_SRC }, // src2 != 0 ? src1 : src0
        { 0xe0fc0, "MUX.i32", FMA_THREE_SRC }, // see ADD comment
        { 0xe1805, "ROUNDEVEN", FMA_ONE_SRC },
        { 0xe1845, "CEIL", FMA_ONE_SRC },
        { 0xe1885, "FLOOR", FMA_ONE_SRC },
        { 0xe18c5, "TRUNC", FMA_ONE_SRC },
        { 0xe19b0, "ATAN_LDEXP.Y.f32", FMA_TWO_SRC },
        { 0xe19b8, "ATAN_LDEXP.X.f32", FMA_TWO_SRC },
        { 0xe1c80, "LSHIFT_ADD_LOW32.u32", FMA_SHIFT_ADD64 },
        { 0xe1cc0, "LSHIFT_ADD_LOW32.i64", FMA_SHIFT_ADD64 },
        { 0xe1d80, "LSHIFT_ADD_LOW32.i32", FMA_SHIFT_ADD64 },
        { 0xe1e00, "SEL.XX.i16", FMA_TWO_SRC },
        { 0xe1e08, "SEL.YX.i16", FMA_TWO_SRC },
        { 0xe1e10, "SEL.XY.i16", FMA_TWO_SRC },
        { 0xe1e18, "SEL.YY.i16", FMA_TWO_SRC },
        { 0xe7800, "IMAD", FMA_THREE_SRC },
        { 0xe78db, "POPCNT", FMA_ONE_SRC },
};
696
697 static struct fma_op_info find_fma_op_info(unsigned op)
698 {
699 for (unsigned i = 0; i < ARRAY_SIZE(FMAOpInfos); i++) {
700 unsigned opCmp = ~0;
701 switch (FMAOpInfos[i].src_type) {
702 case FMA_ONE_SRC:
703 opCmp = op;
704 break;
705 case FMA_TWO_SRC:
706 opCmp = op & ~0x7;
707 break;
708 case FMA_FCMP:
709 case FMA_FCMP16:
710 opCmp = op & ~0x1fff;
711 break;
712 case FMA_THREE_SRC:
713 case FMA_SHIFT_ADD64:
714 opCmp = op & ~0x3f;
715 break;
716 case FMA_FADD:
717 case FMA_FMINMAX:
718 case FMA_FADD16:
719 case FMA_FMINMAX16:
720 opCmp = op & ~0x3fff;
721 break;
722 case FMA_FMA:
723 case FMA_FMA16:
724 opCmp = op & ~0x3ffff;
725 break;
726 case FMA_CSEL4:
727 case FMA_SHIFT:
728 opCmp = op & ~0xfff;
729 break;
730 case FMA_FMA_MSCALE:
731 opCmp = op & ~0x7fff;
732 break;
733 default:
734 opCmp = ~0;
735 break;
736 }
737 if (FMAOpInfos[i].op == opCmp)
738 return FMAOpInfos[i];
739 }
740
741 struct fma_op_info info;
742 snprintf(info.name, sizeof(info.name), "op%04x", op);
743 info.op = op;
744 info.src_type = FMA_THREE_SRC;
745 return info;
746 }
747
// Print the float comparison suffix for condition code op. Codes 0..5 have
// names; anything else is printed as ".unk" followed by the code.
static void dump_fcmp(FILE *fp, unsigned op)
{
        static const char *const names[] = {
                ".OEQ", ".OGT", ".OGE", ".UNE", ".OLT", ".OLE",
        };

        if (op < sizeof(names) / sizeof(names[0])) {
                fputs(names[op], fp);
                return;
        }

        fprintf(fp, ".unk%d", op);
}
774
// Print a two-component 16-bit swizzle suffix. Bit 0 of swiz selects the
// first printed component and bit 1 the second (0 = x, 1 = y). The value 2
// would print ".xy", the identity, and is therefore omitted.
static void dump_16swizzle(FILE *fp, unsigned swiz)
{
        static const char components[] = "xy";

        if (swiz != 2) {
                const char first = components[swiz & 1];
                const char second = components[(swiz >> 1) & 1];

                fprintf(fp, ".%c%c", first, second);
        }
}
781
// Print the component suffix that the 3-bit expand control selects for the
// first source: ".x" for 3, 4 and 6, ".y" for 5 and 7, nothing for 0..2, and
// ".unk" for values outside 0..7.
static void dump_fma_expand_src0(FILE *fp, unsigned ctrl)
{
        static const char *const suffixes[8] = {
                NULL, NULL, NULL, ".x", ".x", ".y", ".x", ".y",
        };

        if (ctrl >= sizeof(suffixes) / sizeof(suffixes[0])) {
                fputs(".unk", fp);
                return;
        }

        if (suffixes[ctrl] != NULL)
                fputs(suffixes[ctrl], fp);
}
803
// Print the component suffix that the 3-bit expand control selects for the
// second source: ".x" for 1 and 3, ".y" for 2, 4 and 5, nothing for 0, 6 and
// 7, and ".unk" for values outside 0..7.
static void dump_fma_expand_src1(FILE *fp, unsigned ctrl)
{
        static const char *const suffixes[8] = {
                NULL, ".x", ".y", ".x", ".y", ".y", NULL, NULL,
        };

        if (ctrl >= sizeof(suffixes) / sizeof(suffixes[0])) {
                fputs(".unk", fp);
                return;
        }

        if (suffixes[ctrl] != NULL)
                fputs(suffixes[ctrl], fp);
}
825
// Disassemble one FMA-unit instruction and print it on a single line:
// mnemonic, modifier suffixes, destination, then the source operands.
//
// fp:        stream to write to
// word:      raw instruction bits; reinterpreted as struct bifrost_fma_inst
// regs:      register block used to resolve the source operands
// next_regs: register block whose control field decides whether and where
//            the FMA result is written to a register
// consts:    constant array of the clause, used for embedded constants
// verbose:   also print the raw instruction word first
static void dump_fma(FILE *fp, uint64_t word, struct bifrost_regs regs, struct bifrost_regs next_regs, uint64_t *consts, bool verbose)
{
        if (verbose) {
                fprintf(fp, "# FMA: %016" PRIx64 "\n", word);
        }
        struct bifrost_fma_inst FMA;
        memcpy((char *) &FMA, (char *) &word, sizeof(struct bifrost_fma_inst));
        struct fma_op_info info = find_fma_op_info(FMA.op);

        // Mnemonic, followed by the modifier suffixes of this opcode class.
        fprintf(fp, "%s", info.name);
        if (info.src_type == FMA_FADD ||
            info.src_type == FMA_FMINMAX ||
            info.src_type == FMA_FMA ||
            info.src_type == FMA_FADD16 ||
            info.src_type == FMA_FMINMAX16 ||
            info.src_type == FMA_FMA16) {
                // Float arithmetic: output modifier in op bits 12..13, then a
                // rounding mode (add/fma) or min/max mode in op bits 10..11.
                dump_output_mod(fp, bits(FMA.op, 12, 14));
                switch (info.src_type) {
                case FMA_FADD:
                case FMA_FMA:
                case FMA_FADD16:
                case FMA_FMA16:
                        dump_round_mode(fp, bits(FMA.op, 10, 12));
                        break;
                case FMA_FMINMAX:
                case FMA_FMINMAX16:
                        dump_minmax_mode(fp, bits(FMA.op, 10, 12));
                        break;
                default:
                        assert(0);
                }
        } else if (info.src_type == FMA_FCMP || info.src_type == FMA_FCMP16) {
                // Comparison: condition code in op bits 10..12, then the type.
                dump_fcmp(fp, bits(FMA.op, 10, 13));
                if (info.src_type == FMA_FCMP)
                        fprintf(fp, ".f32");
                else
                        fprintf(fp, ".v2f16");
        } else if (info.src_type == FMA_FMA_MSCALE) {
                // Op bit 11 switches bits 9..10 from an output modifier to a
                // special multiplication mode.
                if (FMA.op & (1 << 11)) {
                        switch ((FMA.op >> 9) & 0x3) {
                        case 0:
                                /* This mode seems to do a few things:
                                 * - Makes 0 * infinity (and incidentally 0 * nan) return 0,
                                 *   since generating a nan would poison the result of
                                 *   1/infinity and 1/0.
                                 * - Fiddles with which nan is returned in nan * nan,
                                 *   presumably to make sure that the same exact nan is
                                 *   returned for 1/nan.
                                 */
                                fprintf(fp, ".rcp_mode");
                                break;
                        case 3:
                                /* Similar to the above, but src0 always wins when multiplying
                                 * 0 by infinity.
                                 */
                                fprintf(fp, ".sqrt_mode");
                                break;
                        default:
                                fprintf(fp, ".unk%d_mode", (int) (FMA.op >> 9) & 0x3);
                        }
                } else {
                        dump_output_mod(fp, bits(FMA.op, 9, 11));
                }
        } else if (info.src_type == FMA_SHIFT) {
                // Shift ops: reinterpret the instruction to read the lane
                // width selector and the flag bits.
                struct bifrost_shift_fma shift;
                memcpy(&shift, &FMA, sizeof(shift));

                if (shift.half == 0x7)
                        fprintf(fp, ".v2i16");
                else if (shift.half == 0)
                        fprintf(fp, ".i32");
                else if (shift.half == 0x4)
                        fprintf(fp, ".v4i8");
                else
                        fprintf(fp, ".unk%u", shift.half);

                if (!shift.unk)
                        fprintf(fp, ".no_unk");

                if (shift.invert_1)
                        fprintf(fp, ".invert_1");

                if (shift.invert_2)
                        fprintf(fp, ".invert_2");
        }

        fprintf(fp, " ");

        // Destination: the result is always shown as T0, plus a register when
        // the control field of next_regs routes the FMA result to one.
        struct bifrost_reg_ctrl next_ctrl = DecodeRegCtrl(fp, next_regs);
        if (next_ctrl.fma_write_unit != REG_WRITE_NONE) {
                fprintf(fp, "{R%d, T0}, ", GetRegToWrite(next_ctrl.fma_write_unit, next_regs));
        } else {
                fprintf(fp, "T0, ");
        }

        // Sources. src0 has its own field; the remaining source selectors and
        // the negate/abs/swizzle flags are packed into the low bits of op,
        // with a layout that depends on the opcode class.
        switch (info.src_type) {
        case FMA_ONE_SRC:
                dump_src(fp, FMA.src0, regs, consts, true);
                break;
        case FMA_TWO_SRC:
                dump_src(fp, FMA.src0, regs, consts, true);
                fprintf(fp, ", ");
                dump_src(fp, FMA.op & 0x7, regs, consts, true);
                break;
        case FMA_FADD:
        case FMA_FMINMAX:
                if (FMA.op & 0x10)
                        fprintf(fp, "-");
                if (FMA.op & 0x200)
                        fprintf(fp, "abs(");
                dump_src(fp, FMA.src0, regs, consts, true);
                dump_fma_expand_src0(fp, (FMA.op >> 6) & 0x7);
                if (FMA.op & 0x200)
                        fprintf(fp, ")");
                fprintf(fp, ", ");
                if (FMA.op & 0x20)
                        fprintf(fp, "-");
                if (FMA.op & 0x8)
                        fprintf(fp, "abs(");
                dump_src(fp, FMA.op & 0x7, regs, consts, true);
                dump_fma_expand_src1(fp, (FMA.op >> 6) & 0x7);
                if (FMA.op & 0x8)
                        fprintf(fp, ")");
                break;
        case FMA_FADD16:
        case FMA_FMINMAX16: {
                // Only one explicit abs bit exists here; the second flag is
                // derived from the relative order of the two source selectors.
                // abs is printed on src0 if either is set, on src1 if both are.
                bool abs1 = FMA.op & 0x8;
                bool abs2 = (FMA.op & 0x7) < FMA.src0;
                if (FMA.op & 0x10)
                        fprintf(fp, "-");
                if (abs1 || abs2)
                        fprintf(fp, "abs(");
                dump_src(fp, FMA.src0, regs, consts, true);
                dump_16swizzle(fp, (FMA.op >> 6) & 0x3);
                if (abs1 || abs2)
                        fprintf(fp, ")");
                fprintf(fp, ", ");
                if (FMA.op & 0x20)
                        fprintf(fp, "-");
                if (abs1 && abs2)
                        fprintf(fp, "abs(");
                dump_src(fp, FMA.op & 0x7, regs, consts, true);
                dump_16swizzle(fp, (FMA.op >> 8) & 0x3);
                if (abs1 && abs2)
                        fprintf(fp, ")");
                break;
        }
        case FMA_FCMP:
                if (FMA.op & 0x200)
                        fprintf(fp, "abs(");
                dump_src(fp, FMA.src0, regs, consts, true);
                dump_fma_expand_src0(fp, (FMA.op >> 6) & 0x7);
                if (FMA.op & 0x200)
                        fprintf(fp, ")");
                fprintf(fp, ", ");
                if (FMA.op & 0x20)
                        fprintf(fp, "-");
                if (FMA.op & 0x8)
                        fprintf(fp, "abs(");
                dump_src(fp, FMA.op & 0x7, regs, consts, true);
                dump_fma_expand_src1(fp, (FMA.op >> 6) & 0x7);
                if (FMA.op & 0x8)
                        fprintf(fp, ")");
                break;
        case FMA_FCMP16:
                dump_src(fp, FMA.src0, regs, consts, true);
                // Note: this is kinda a guess, I haven't seen the blob set this to
                // anything other than the identity, but it matches FMA_TWO_SRCFmod16
                dump_16swizzle(fp, (FMA.op >> 6) & 0x3);
                fprintf(fp, ", ");
                dump_src(fp, FMA.op & 0x7, regs, consts, true);
                dump_16swizzle(fp, (FMA.op >> 8) & 0x3);
                break;
        case FMA_SHIFT_ADD64:
                // Two sources plus an immediate shift amount from op bits 3..5.
                dump_src(fp, FMA.src0, regs, consts, true);
                fprintf(fp, ", ");
                dump_src(fp, FMA.op & 0x7, regs, consts, true);
                fprintf(fp, ", ");
                fprintf(fp, "shift:%u", (FMA.op >> 3) & 0x7);
                break;
        case FMA_THREE_SRC:
                dump_src(fp, FMA.src0, regs, consts, true);
                fprintf(fp, ", ");
                dump_src(fp, FMA.op & 0x7, regs, consts, true);
                fprintf(fp, ", ");
                dump_src(fp, (FMA.op >> 3) & 0x7, regs, consts, true);
                break;
        case FMA_SHIFT: {
                // Source selectors come from the shift-specific layout.
                struct bifrost_shift_fma shift;
                memcpy(&shift, &FMA, sizeof(shift));

                dump_src(fp, shift.src0, regs, consts, true);
                fprintf(fp, ", ");
                dump_src(fp, shift.src1, regs, consts, true);
                fprintf(fp, ", ");
                dump_src(fp, shift.src2, regs, consts, true);
                break;
        }
        case FMA_FMA:
                if (FMA.op & (1 << 14))
                        fprintf(fp, "-");
                if (FMA.op & (1 << 9))
                        fprintf(fp, "abs(");
                dump_src(fp, FMA.src0, regs, consts, true);
                dump_fma_expand_src0(fp, (FMA.op >> 6) & 0x7);
                if (FMA.op & (1 << 9))
                        fprintf(fp, ")");
                fprintf(fp, ", ");
                if (FMA.op & (1 << 16))
                        fprintf(fp, "abs(");
                dump_src(fp, FMA.op & 0x7, regs, consts, true);
                dump_fma_expand_src1(fp, (FMA.op >> 6) & 0x7);
                if (FMA.op & (1 << 16))
                        fprintf(fp, ")");
                fprintf(fp, ", ");
                if (FMA.op & (1 << 15))
                        fprintf(fp, "-");
                if (FMA.op & (1 << 17))
                        fprintf(fp, "abs(");
                dump_src(fp, (FMA.op >> 3) & 0x7, regs, consts, true);
                if (FMA.op & (1 << 17))
                        fprintf(fp, ")");
                break;
        case FMA_FMA16:
                if (FMA.op & (1 << 14))
                        fprintf(fp, "-");
                dump_src(fp, FMA.src0, regs, consts, true);
                dump_16swizzle(fp, (FMA.op >> 6) & 0x3);
                fprintf(fp, ", ");
                dump_src(fp, FMA.op & 0x7, regs, consts, true);
                dump_16swizzle(fp, (FMA.op >> 8) & 0x3);
                fprintf(fp, ", ");
                if (FMA.op & (1 << 15))
                        fprintf(fp, "-");
                dump_src(fp, (FMA.op >> 3) & 0x7, regs, consts, true);
                dump_16swizzle(fp, (FMA.op >> 16) & 0x3);
                break;
        case FMA_CSEL4: {
                // Note that the condition suffix is printed here, i.e. after
                // the destination rather than directly after the mnemonic.
                struct bifrost_csel4 csel;
                memcpy(&csel, &FMA, sizeof(csel));
                fprintf(fp, ".%s ", csel_cond_name(csel.cond));

                dump_src(fp, csel.src0, regs, consts, true);
                fprintf(fp, ", ");
                dump_src(fp, csel.src1, regs, consts, true);
                fprintf(fp, ", ");
                dump_src(fp, csel.src2, regs, consts, true);
                fprintf(fp, ", ");
                dump_src(fp, csel.src3, regs, consts, true);
                break;
        }
        case FMA_FMA_MSCALE:
                // Four sources; the selectors sit in op bits 0..2, 3..5, 6..8.
                if (FMA.op & (1 << 12))
                        fprintf(fp, "abs(");
                dump_src(fp, FMA.src0, regs, consts, true);
                if (FMA.op & (1 << 12))
                        fprintf(fp, ")");
                fprintf(fp, ", ");
                if (FMA.op & (1 << 13))
                        fprintf(fp, "-");
                dump_src(fp, FMA.op & 0x7, regs, consts, true);
                fprintf(fp, ", ");
                if (FMA.op & (1 << 14))
                        fprintf(fp, "-");
                dump_src(fp, (FMA.op >> 3) & 0x7, regs, consts, true);
                fprintf(fp, ", ");
                dump_src(fp, (FMA.op >> 6) & 0x7, regs, consts, true);
                break;
        }
        fprintf(fp, "\n");
}
1097
1098 static const struct add_op_info add_op_infos[] = {
1099 { 0x00000, "MAX.f32", ADD_FMINMAX },
1100 { 0x02000, "MIN.f32", ADD_FMINMAX },
1101 { 0x04000, "ADD.f32", ADD_FADD },
1102 { 0x06000, "FCMP.GL", ADD_FCMP },
1103 { 0x07000, "FCMP.D3D", ADD_FCMP },
1104 { 0x07856, "F16_TO_I16", ADD_ONE_SRC },
1105 { 0x07857, "F16_TO_U16", ADD_ONE_SRC },
1106 { 0x078c0, "I16_TO_F16.XX", ADD_ONE_SRC },
1107 { 0x078c1, "U16_TO_F16.XX", ADD_ONE_SRC },
1108 { 0x078c8, "I16_TO_F16.YX", ADD_ONE_SRC },
1109 { 0x078c9, "U16_TO_F16.YX", ADD_ONE_SRC },
1110 { 0x078d0, "I16_TO_F16.XY", ADD_ONE_SRC },
1111 { 0x078d1, "U16_TO_F16.XY", ADD_ONE_SRC },
1112 { 0x078d8, "I16_TO_F16.YY", ADD_ONE_SRC },
1113 { 0x078d9, "U16_TO_F16.YY", ADD_ONE_SRC },
1114 { 0x07936, "F32_TO_I32", ADD_ONE_SRC },
1115 { 0x07937, "F32_TO_U32", ADD_ONE_SRC },
1116 { 0x07978, "I32_TO_F32", ADD_ONE_SRC },
1117 { 0x07979, "U32_TO_F32", ADD_ONE_SRC },
1118 { 0x07998, "I16_TO_I32.X", ADD_ONE_SRC },
1119 { 0x07999, "U16_TO_U32.X", ADD_ONE_SRC },
1120 { 0x0799a, "I16_TO_I32.Y", ADD_ONE_SRC },
1121 { 0x0799b, "U16_TO_U32.Y", ADD_ONE_SRC },
1122 { 0x0799c, "I16_TO_F32.X", ADD_ONE_SRC },
1123 { 0x0799d, "U16_TO_F32.X", ADD_ONE_SRC },
1124 { 0x0799e, "I16_TO_F32.Y", ADD_ONE_SRC },
1125 { 0x0799f, "U16_TO_F32.Y", ADD_ONE_SRC },
1126 { 0x079a2, "F16_TO_F32.X", ADD_ONE_SRC },
1127 { 0x079a3, "F16_TO_F32.Y", ADD_ONE_SRC },
1128 { 0x07b2b, "SWZ.YX.v2i16", ADD_ONE_SRC },
1129 { 0x07b2c, "NOP", ADD_ONE_SRC },
1130 { 0x07b29, "SWZ.XX.v2i16", ADD_ONE_SRC },
1131 { 0x07b2d, "MOV", ADD_ONE_SRC },
1132 { 0x07b2f, "SWZ.YY.v2i16", ADD_ONE_SRC },
1133 { 0x07b65, "FRCP_FREXPM", ADD_ONE_SRC },
1134 { 0x07b75, "FSQRT_FREXPM", ADD_ONE_SRC },
1135 { 0x07b8d, "FRCP_FREXPE", ADD_ONE_SRC },
1136 { 0x07ba5, "FSQRT_FREXPE", ADD_ONE_SRC },
1137 { 0x07bad, "FRSQ_FREXPE", ADD_ONE_SRC },
1138 { 0x07bc5, "FLOG_FREXPE", ADD_ONE_SRC },
1139 { 0x07d45, "CEIL", ADD_ONE_SRC },
1140 { 0x07d85, "FLOOR", ADD_ONE_SRC },
1141 { 0x07dc5, "TRUNC", ADD_ONE_SRC },
1142 { 0x07f18, "LSHIFT_ADD_HIGH32.i32", ADD_TWO_SRC },
1143 { 0x08000, "LD_ATTR.f16", ADD_LOAD_ATTR, true },
1144 { 0x08100, "LD_ATTR.v2f16", ADD_LOAD_ATTR, true },
1145 { 0x08200, "LD_ATTR.v3f16", ADD_LOAD_ATTR, true },
1146 { 0x08300, "LD_ATTR.v4f16", ADD_LOAD_ATTR, true },
1147 { 0x08400, "LD_ATTR.f32", ADD_LOAD_ATTR, true },
1148 { 0x08500, "LD_ATTR.v3f32", ADD_LOAD_ATTR, true },
1149 { 0x08600, "LD_ATTR.v3f32", ADD_LOAD_ATTR, true },
1150 { 0x08700, "LD_ATTR.v4f32", ADD_LOAD_ATTR, true },
1151 { 0x08800, "LD_ATTR.i32", ADD_LOAD_ATTR, true },
1152 { 0x08900, "LD_ATTR.v3i32", ADD_LOAD_ATTR, true },
1153 { 0x08a00, "LD_ATTR.v3i32", ADD_LOAD_ATTR, true },
1154 { 0x08b00, "LD_ATTR.v4i32", ADD_LOAD_ATTR, true },
1155 { 0x08c00, "LD_ATTR.u32", ADD_LOAD_ATTR, true },
1156 { 0x08d00, "LD_ATTR.v3u32", ADD_LOAD_ATTR, true },
1157 { 0x08e00, "LD_ATTR.v3u32", ADD_LOAD_ATTR, true },
1158 { 0x08f00, "LD_ATTR.v4u32", ADD_LOAD_ATTR, true },
1159 { 0x0a000, "LD_VAR.32", ADD_VARYING_INTERP, true },
1160 { 0x0b000, "TEX", ADD_TEX_COMPACT, true },
1161 { 0x0c188, "LOAD.i32", ADD_TWO_SRC, true },
1162 { 0x0c1a0, "LD_UBO.i32", ADD_TWO_SRC, true },
1163 { 0x0c1b8, "LD_SCRATCH.v2i32", ADD_TWO_SRC, true },
1164 { 0x0c1c8, "LOAD.v2i32", ADD_TWO_SRC, true },
1165 { 0x0c1e0, "LD_UBO.v2i32", ADD_TWO_SRC, true },
1166 { 0x0c1f8, "LD_SCRATCH.v2i32", ADD_TWO_SRC, true },
1167 { 0x0c208, "LOAD.v4i32", ADD_TWO_SRC, true },
1168 { 0x0c220, "LD_UBO.v4i32", ADD_TWO_SRC, true },
1169 { 0x0c238, "LD_SCRATCH.v4i32", ADD_TWO_SRC, true },
1170 { 0x0c248, "STORE.v4i32", ADD_TWO_SRC, true },
1171 { 0x0c278, "ST_SCRATCH.v4i32", ADD_TWO_SRC, true },
1172 { 0x0c588, "STORE.i32", ADD_TWO_SRC, true },
1173 { 0x0c5b8, "ST_SCRATCH.i32", ADD_TWO_SRC, true },
1174 { 0x0c5c8, "STORE.v2i32", ADD_TWO_SRC, true },
1175 { 0x0c5f8, "ST_SCRATCH.v2i32", ADD_TWO_SRC, true },
1176 { 0x0c648, "LOAD.u16", ADD_TWO_SRC, true }, // zero-extends
1177 { 0x0ca88, "LOAD.v3i32", ADD_TWO_SRC, true },
1178 { 0x0caa0, "LD_UBO.v3i32", ADD_TWO_SRC, true },
1179 { 0x0cab8, "LD_SCRATCH.v3i32", ADD_TWO_SRC, true },
1180 { 0x0cb88, "STORE.v3i32", ADD_TWO_SRC, true },
1181 { 0x0cbb8, "ST_SCRATCH.v3i32", ADD_TWO_SRC, true },
1182 { 0x0cc00, "FRCP_FAST.f32", ADD_ONE_SRC },
1183 { 0x0cc20, "FRSQ_FAST.f32", ADD_ONE_SRC },
1184 { 0x0ce00, "FRCP_TABLE", ADD_ONE_SRC },
1185 { 0x0ce10, "FRCP_FAST.f16.X", ADD_ONE_SRC },
1186 { 0x0ce20, "FRSQ_TABLE", ADD_ONE_SRC },
1187 { 0x0ce30, "FRCP_FAST.f16.Y", ADD_ONE_SRC },
1188 { 0x0ce50, "FRSQ_FAST.f16.X", ADD_ONE_SRC },
1189 { 0x0ce60, "FRCP_APPROX", ADD_ONE_SRC },
1190 { 0x0ce70, "FRSQ_FAST.f16.Y", ADD_ONE_SRC },
1191 { 0x0cf40, "ATAN_ASSIST", ADD_TWO_SRC },
1192 { 0x0cf48, "ATAN_TABLE", ADD_TWO_SRC },
1193 { 0x0cf50, "SIN_TABLE", ADD_ONE_SRC },
1194 { 0x0cf51, "COS_TABLE", ADD_ONE_SRC },
1195 { 0x0cf58, "EXP_TABLE", ADD_ONE_SRC },
1196 { 0x0cf60, "FLOG2_TABLE", ADD_ONE_SRC },
1197 { 0x0cf64, "FLOGE_TABLE", ADD_ONE_SRC },
1198 { 0x0d000, "BRANCH", ADD_BRANCH },
1199 { 0x0e8c0, "MUX", ADD_THREE_SRC },
1200 { 0x0e9b0, "ATAN_LDEXP.Y.f32", ADD_TWO_SRC },
1201 { 0x0e9b8, "ATAN_LDEXP.X.f32", ADD_TWO_SRC },
1202 { 0x0ea60, "SEL.XX.i16", ADD_TWO_SRC },
1203 { 0x0ea70, "SEL.XY.i16", ADD_TWO_SRC },
1204 { 0x0ea68, "SEL.YX.i16", ADD_TWO_SRC },
1205 { 0x0ea78, "SEL.YY.i16", ADD_TWO_SRC },
1206 { 0x0ec00, "F32_TO_F16", ADD_TWO_SRC },
1207 { 0x0f640, "ICMP.GL.GT", ADD_TWO_SRC }, // src0 > src1 ? 1 : 0
1208 { 0x0f648, "ICMP.GL.GE", ADD_TWO_SRC },
1209 { 0x0f650, "UCMP.GL.GT", ADD_TWO_SRC },
1210 { 0x0f658, "UCMP.GL.GE", ADD_TWO_SRC },
1211 { 0x0f660, "ICMP.GL.EQ", ADD_TWO_SRC },
1212 { 0x0f669, "ICMP.GL.NEQ", ADD_TWO_SRC },
1213 { 0x0f6c0, "ICMP.D3D.GT", ADD_TWO_SRC }, // src0 > src1 ? ~0 : 0
1214 { 0x0f6c8, "ICMP.D3D.GE", ADD_TWO_SRC },
1215 { 0x0f6d0, "UCMP.D3D.GT", ADD_TWO_SRC },
1216 { 0x0f6d8, "UCMP.D3D.GE", ADD_TWO_SRC },
1217 { 0x0f6e0, "ICMP.D3D.EQ", ADD_TWO_SRC },
1218 { 0x10000, "MAX.v2f16", ADD_FMINMAX16 },
1219 { 0x11000, "ADD_MSCALE.f32", ADD_FADDMscale },
1220 { 0x12000, "MIN.v2f16", ADD_FMINMAX16 },
1221 { 0x14000, "ADD.v2f16", ADD_FADD16 },
1222 { 0x17000, "FCMP.D3D", ADD_FCMP16 },
1223 { 0x178c0, "ADD.i32", ADD_TWO_SRC },
1224 { 0x17900, "ADD.v2i16", ADD_TWO_SRC },
1225 { 0x17ac0, "SUB.i32", ADD_TWO_SRC },
1226 { 0x17c10, "ADDC.i32", ADD_TWO_SRC }, // adds src0 to the bottom bit of src1
1227 { 0x17d80, "ADD.i32.i16.X", ADD_TWO_SRC },
1228 { 0x17d90, "ADD.i32.u16.X", ADD_TWO_SRC },
1229 { 0x17dc0, "ADD.i32.i16.Y", ADD_TWO_SRC },
1230 { 0x17dd0, "ADD.i32.u16.Y", ADD_TWO_SRC },
1231 { 0x18000, "LD_VAR_ADDR.f16", ADD_VARYING_ADDRESS, true },
1232 { 0x18100, "LD_VAR_ADDR.f32", ADD_VARYING_ADDRESS, true },
1233 { 0x18200, "LD_VAR_ADDR.i32", ADD_VARYING_ADDRESS, true },
1234 { 0x18300, "LD_VAR_ADDR.u32", ADD_VARYING_ADDRESS, true },
1235 { 0x19181, "DISCARD.FEQ.f32", ADD_TWO_SRC, true },
1236 { 0x19189, "DISCARD.FNE.f32", ADD_TWO_SRC, true },
1237 { 0x1918C, "DISCARD.GL.f32", ADD_TWO_SRC, true }, /* Consumes ICMP.GL/etc with fixed 0 argument */
1238 { 0x19190, "DISCARD.FLE.f32", ADD_TWO_SRC, true },
1239 { 0x19198, "DISCARD.FLT.f32", ADD_TWO_SRC, true },
1240 { 0x191e8, "ATEST.f32", ADD_TWO_SRC, true },
1241 { 0x191f0, "ATEST.X.f16", ADD_TWO_SRC, true },
1242 { 0x191f8, "ATEST.Y.f16", ADD_TWO_SRC, true },
1243 { 0x19300, "ST_VAR.v1", ADD_THREE_SRC, true },
1244 { 0x19340, "ST_VAR.v2", ADD_THREE_SRC, true },
1245 { 0x19380, "ST_VAR.v3", ADD_THREE_SRC, true },
1246 { 0x193c0, "ST_VAR.v4", ADD_THREE_SRC, true },
1247 { 0x1952c, "BLEND", ADD_BLENDING, true },
1248 { 0x1a000, "LD_VAR.16", ADD_VARYING_INTERP, true },
1249 { 0x1ae60, "TEX", ADD_TEX, true },
1250 { 0x1c000, "RSHIFT_NAND.i32", ADD_THREE_SRC },
1251 { 0x1c300, "RSHIFT_OR.i32", ADD_THREE_SRC },
1252 { 0x1c400, "RSHIFT_AND.i32", ADD_THREE_SRC },
1253 { 0x1c700, "RSHIFT_NOR.i32", ADD_THREE_SRC },
1254 { 0x1c800, "LSHIFT_NAND.i32", ADD_THREE_SRC },
1255 { 0x1cb00, "LSHIFT_OR.i32", ADD_THREE_SRC },
1256 { 0x1cc00, "LSHIFT_AND.i32", ADD_THREE_SRC },
1257 { 0x1cf00, "LSHIFT_NOR.i32", ADD_THREE_SRC },
1258 { 0x1d000, "RSHIFT_XOR.i32", ADD_THREE_SRC },
1259 { 0x1d100, "RSHIFT_XNOR.i32", ADD_THREE_SRC },
1260 { 0x1d200, "LSHIFT_XOR.i32", ADD_THREE_SRC },
1261 { 0x1d300, "LSHIFT_XNOR.i32", ADD_THREE_SRC },
1262 { 0x1d400, "LSHIFT_ADD.i32", ADD_THREE_SRC },
1263 { 0x1d500, "LSHIFT_SUB.i32", ADD_THREE_SRC },
1264 { 0x1d500, "LSHIFT_RSUB.i32", ADD_THREE_SRC },
1265 { 0x1d700, "RSHIFT_ADD.i32", ADD_THREE_SRC },
1266 { 0x1d800, "RSHIFT_SUB.i32", ADD_THREE_SRC },
1267 { 0x1d900, "RSHIFT_RSUB.i32", ADD_THREE_SRC },
1268 { 0x1da00, "ARSHIFT_ADD.i32", ADD_THREE_SRC },
1269 { 0x1db00, "ARSHIFT_SUB.i32", ADD_THREE_SRC },
1270 { 0x1dc00, "ARSHIFT_RSUB.i32", ADD_THREE_SRC },
1271 { 0x1dd18, "OR.i32", ADD_TWO_SRC },
1272 { 0x1dd20, "AND.i32", ADD_TWO_SRC },
1273 { 0x1dd60, "LSHIFT.i32", ADD_TWO_SRC },
1274 { 0x1dd50, "XOR.i32", ADD_TWO_SRC },
1275 { 0x1dd80, "RSHIFT.i32", ADD_TWO_SRC },
1276 { 0x1dda0, "ARSHIFT.i32", ADD_TWO_SRC },
1277 };
1278
1279 static struct add_op_info find_add_op_info(unsigned op)
1280 {
1281 for (unsigned i = 0; i < ARRAY_SIZE(add_op_infos); i++) {
1282 unsigned opCmp = ~0;
1283 switch (add_op_infos[i].src_type) {
1284 case ADD_ONE_SRC:
1285 case ADD_BLENDING:
1286 opCmp = op;
1287 break;
1288 case ADD_TWO_SRC:
1289 opCmp = op & ~0x7;
1290 break;
1291 case ADD_THREE_SRC:
1292 opCmp = op & ~0x3f;
1293 break;
1294 case ADD_TEX:
1295 opCmp = op & ~0xf;
1296 break;
1297 case ADD_FADD:
1298 case ADD_FMINMAX:
1299 case ADD_FADD16:
1300 opCmp = op & ~0x1fff;
1301 break;
1302 case ADD_FMINMAX16:
1303 case ADD_FADDMscale:
1304 opCmp = op & ~0xfff;
1305 break;
1306 case ADD_FCMP:
1307 case ADD_FCMP16:
1308 opCmp = op & ~0x7ff;
1309 break;
1310 case ADD_TEX_COMPACT:
1311 opCmp = op & ~0x3ff;
1312 break;
1313 case ADD_VARYING_INTERP:
1314 opCmp = op & ~0x7ff;
1315 break;
1316 case ADD_VARYING_ADDRESS:
1317 opCmp = op & ~0xff;
1318 break;
1319 case ADD_LOAD_ATTR:
1320 opCmp = op & ~0x7f;
1321 break;
1322 case ADD_BRANCH:
1323 opCmp = op & ~0xfff;
1324 break;
1325 default:
1326 opCmp = ~0;
1327 break;
1328 }
1329 if (add_op_infos[i].op == opCmp)
1330 return add_op_infos[i];
1331 }
1332
1333 struct add_op_info info;
1334 snprintf(info.name, sizeof(info.name), "op%04x", op);
1335 info.op = op;
1336 info.src_type = ADD_TWO_SRC;
1337 info.has_data_reg = true;
1338 return info;
1339 }
1340
1341 static void dump_add(FILE *fp, uint64_t word, struct bifrost_regs regs,
1342 struct bifrost_regs next_regs, uint64_t *consts,
1343 unsigned data_reg, unsigned offset, bool verbose)
1344 {
1345 if (verbose) {
1346 fprintf(fp, "# ADD: %016" PRIx64 "\n", word);
1347 }
1348 struct bifrost_add_inst ADD;
1349 memcpy((char *) &ADD, (char *) &word, sizeof(ADD));
1350 struct add_op_info info = find_add_op_info(ADD.op);
1351
1352 fprintf(fp, "%s", info.name);
1353
1354 // float16 seems like it doesn't support output modifiers
1355 if (info.src_type == ADD_FADD || info.src_type == ADD_FMINMAX) {
1356 // output modifiers
1357 dump_output_mod(fp, bits(ADD.op, 8, 10));
1358 if (info.src_type == ADD_FADD)
1359 dump_round_mode(fp, bits(ADD.op, 10, 12));
1360 else
1361 dump_minmax_mode(fp, bits(ADD.op, 10, 12));
1362 } else if (info.src_type == ADD_FCMP || info.src_type == ADD_FCMP16) {
1363 dump_fcmp(fp, bits(ADD.op, 3, 6));
1364 if (info.src_type == ADD_FCMP)
1365 fprintf(fp, ".f32");
1366 else
1367 fprintf(fp, ".v2f16");
1368 } else if (info.src_type == ADD_FADDMscale) {
1369 switch ((ADD.op >> 6) & 0x7) {
1370 case 0:
1371 break;
1372 // causes GPU hangs on G71
1373 case 1:
1374 fprintf(fp, ".invalid");
1375 break;
1376 // Same as usual outmod value.
1377 case 2:
1378 fprintf(fp, ".clamp_0_1");
1379 break;
1380 // If src0 is infinite or NaN, flush it to zero so that the other
1381 // source is passed through unmodified.
1382 case 3:
1383 fprintf(fp, ".flush_src0_inf_nan");
1384 break;
1385 // Vice versa.
1386 case 4:
1387 fprintf(fp, ".flush_src1_inf_nan");
1388 break;
1389 // Every other case seems to behave the same as the above?
1390 default:
1391 fprintf(fp, ".unk%d", (ADD.op >> 6) & 0x7);
1392 break;
1393 }
1394 } else if (info.src_type == ADD_VARYING_INTERP) {
1395 if (ADD.op & 0x200)
1396 fprintf(fp, ".reuse");
1397 if (ADD.op & 0x400)
1398 fprintf(fp, ".flat");
1399 switch ((ADD.op >> 7) & 0x3) {
1400 case 0:
1401 fprintf(fp, ".per_frag");
1402 break;
1403 case 1:
1404 fprintf(fp, ".centroid");
1405 break;
1406 case 2:
1407 break;
1408 case 3:
1409 fprintf(fp, ".explicit");
1410 break;
1411 }
1412 fprintf(fp, ".v%d", ((ADD.op >> 5) & 0x3) + 1);
1413 } else if (info.src_type == ADD_BRANCH) {
1414 enum branch_code branchCode = (enum branch_code) ((ADD.op >> 6) & 0x3f);
1415 if (branchCode == BR_ALWAYS) {
1416 // unconditional branch
1417 } else {
1418 enum branch_cond cond = (enum branch_cond) ((ADD.op >> 6) & 0x7);
1419 enum branch_bit_size size = (enum branch_bit_size) ((ADD.op >> 9) & 0x7);
1420 bool portSwapped = (ADD.op & 0x7) < ADD.src0;
1421 // See the comment in branch_bit_size
1422 if (size == BR_SIZE_16YX0)
1423 portSwapped = true;
1424 if (size == BR_SIZE_16YX1)
1425 portSwapped = false;
1426 // These sizes are only for floating point comparisons, so the
1427 // non-floating-point comparisons are reused to encode the flipped
1428 // versions.
1429 if (size == BR_SIZE_32_AND_16X || size == BR_SIZE_32_AND_16Y)
1430 portSwapped = false;
1431 // There's only one argument, so we reuse the extra argument to
1432 // encode this.
1433 if (size == BR_SIZE_ZERO)
1434 portSwapped = !(ADD.op & 1);
1435
1436 switch (cond) {
1437 case BR_COND_LT:
1438 if (portSwapped)
1439 fprintf(fp, ".LT.u");
1440 else
1441 fprintf(fp, ".LT.i");
1442 break;
1443 case BR_COND_LE:
1444 if (size == BR_SIZE_32_AND_16X || size == BR_SIZE_32_AND_16Y) {
1445 fprintf(fp, ".UNE.f");
1446 } else {
1447 if (portSwapped)
1448 fprintf(fp, ".LE.u");
1449 else
1450 fprintf(fp, ".LE.i");
1451 }
1452 break;
1453 case BR_COND_GT:
1454 if (portSwapped)
1455 fprintf(fp, ".GT.u");
1456 else
1457 fprintf(fp, ".GT.i");
1458 break;
1459 case BR_COND_GE:
1460 if (portSwapped)
1461 fprintf(fp, ".GE.u");
1462 else
1463 fprintf(fp, ".GE.i");
1464 break;
1465 case BR_COND_EQ:
1466 if (portSwapped)
1467 fprintf(fp, ".NE.i");
1468 else
1469 fprintf(fp, ".EQ.i");
1470 break;
1471 case BR_COND_OEQ:
1472 if (portSwapped)
1473 fprintf(fp, ".UNE.f");
1474 else
1475 fprintf(fp, ".OEQ.f");
1476 break;
1477 case BR_COND_OGT:
1478 if (portSwapped)
1479 fprintf(fp, ".OGT.unk.f");
1480 else
1481 fprintf(fp, ".OGT.f");
1482 break;
1483 case BR_COND_OLT:
1484 if (portSwapped)
1485 fprintf(fp, ".OLT.unk.f");
1486 else
1487 fprintf(fp, ".OLT.f");
1488 break;
1489 }
1490 switch (size) {
1491 case BR_SIZE_32:
1492 case BR_SIZE_32_AND_16X:
1493 case BR_SIZE_32_AND_16Y:
1494 fprintf(fp, "32");
1495 break;
1496 case BR_SIZE_16XX:
1497 case BR_SIZE_16YY:
1498 case BR_SIZE_16YX0:
1499 case BR_SIZE_16YX1:
1500 fprintf(fp, "16");
1501 break;
1502 case BR_SIZE_ZERO: {
1503 unsigned ctrl = (ADD.op >> 1) & 0x3;
1504 if (ctrl == 0)
1505 fprintf(fp, "32.Z");
1506 else
1507 fprintf(fp, "16.Z");
1508 break;
1509 }
1510 }
1511 }
1512 }
1513 fprintf(fp, " ");
1514
1515 struct bifrost_reg_ctrl next_ctrl = DecodeRegCtrl(fp, next_regs);
1516 if (next_ctrl.add_write_unit != REG_WRITE_NONE) {
1517 fprintf(fp, "{R%d, T1}, ", GetRegToWrite(next_ctrl.add_write_unit, next_regs));
1518 } else {
1519 fprintf(fp, "T1, ");
1520 }
1521
1522 switch (info.src_type) {
1523 case ADD_BLENDING:
1524 // Note: in this case, regs.uniform_const == location | 0x8
1525 // This probably means we can't load uniforms or immediates in the
1526 // same instruction. This re-uses the encoding that normally means
1527 // "disabled", where the low 4 bits are ignored. Perhaps the extra
1528 // 0x8 or'd in indicates this is happening.
1529 fprintf(fp, "location:%d, ", regs.uniform_const & 0x7);
1530 // fallthrough
1531 case ADD_ONE_SRC:
1532 dump_src(fp, ADD.src0, regs, consts, false);
1533 break;
1534 case ADD_TEX:
1535 case ADD_TEX_COMPACT: {
1536 int tex_index;
1537 int sampler_index;
1538 bool dualTex = false;
1539 if (info.src_type == ADD_TEX_COMPACT) {
1540 tex_index = (ADD.op >> 3) & 0x7;
1541 sampler_index = (ADD.op >> 7) & 0x7;
1542 bool unknown = (ADD.op & 0x40);
1543 // TODO: figure out if the unknown bit is ever 0
1544 if (!unknown)
1545 fprintf(fp, "unknown ");
1546 } else {
1547 uint64_t constVal = get_const(consts, regs);
1548 uint32_t controlBits = (ADD.op & 0x8) ? (constVal >> 32) : constVal;
1549 struct bifrost_tex_ctrl ctrl;
1550 memcpy((char *) &ctrl, (char *) &controlBits, sizeof(ctrl));
1551
1552 // TODO: figure out what actually triggers dual-tex
1553 if (ctrl.result_type == 9) {
1554 struct bifrost_dual_tex_ctrl dualCtrl;
1555 memcpy((char *) &dualCtrl, (char *) &controlBits, sizeof(ctrl));
1556 fprintf(fp, "(dualtex) tex0:%d samp0:%d tex1:%d samp1:%d ",
1557 dualCtrl.tex_index0, dualCtrl.sampler_index0,
1558 dualCtrl.tex_index1, dualCtrl.sampler_index1);
1559 if (dualCtrl.unk0 != 3)
1560 fprintf(fp, "unk:%d ", dualCtrl.unk0);
1561 dualTex = true;
1562 } else {
1563 if (ctrl.no_merge_index) {
1564 tex_index = ctrl.tex_index;
1565 sampler_index = ctrl.sampler_index;
1566 } else {
1567 tex_index = sampler_index = ctrl.tex_index;
1568 unsigned unk = ctrl.sampler_index >> 2;
1569 if (unk != 3)
1570 fprintf(fp, "unk:%d ", unk);
1571 if (ctrl.sampler_index & 1)
1572 tex_index = -1;
1573 if (ctrl.sampler_index & 2)
1574 sampler_index = -1;
1575 }
1576
1577 if (ctrl.unk0 != 3)
1578 fprintf(fp, "unk0:%d ", ctrl.unk0);
1579 if (ctrl.unk1)
1580 fprintf(fp, "unk1 ");
1581 if (ctrl.unk2 != 0xf)
1582 fprintf(fp, "unk2:%x ", ctrl.unk2);
1583
1584 switch (ctrl.result_type) {
1585 case 0x4:
1586 fprintf(fp, "f32 ");
1587 break;
1588 case 0xe:
1589 fprintf(fp, "i32 ");
1590 break;
1591 case 0xf:
1592 fprintf(fp, "u32 ");
1593 break;
1594 default:
1595 fprintf(fp, "unktype(%x) ", ctrl.result_type);
1596 }
1597
1598 switch (ctrl.tex_type) {
1599 case 0:
1600 fprintf(fp, "cube ");
1601 break;
1602 case 1:
1603 fprintf(fp, "buffer ");
1604 break;
1605 case 2:
1606 fprintf(fp, "2D ");
1607 break;
1608 case 3:
1609 fprintf(fp, "3D ");
1610 break;
1611 }
1612
1613 if (ctrl.is_shadow)
1614 fprintf(fp, "shadow ");
1615 if (ctrl.is_array)
1616 fprintf(fp, "array ");
1617
1618 if (!ctrl.filter) {
1619 if (ctrl.calc_gradients) {
1620 int comp = (controlBits >> 20) & 0x3;
1621 fprintf(fp, "txg comp:%d ", comp);
1622 } else {
1623 fprintf(fp, "txf ");
1624 }
1625 } else {
1626 if (!ctrl.not_supply_lod) {
1627 if (ctrl.compute_lod)
1628 fprintf(fp, "lod_bias ");
1629 else
1630 fprintf(fp, "lod ");
1631 }
1632
1633 if (!ctrl.calc_gradients)
1634 fprintf(fp, "grad ");
1635 }
1636
1637 if (ctrl.texel_offset)
1638 fprintf(fp, "offset ");
1639 }
1640 }
1641
1642 if (!dualTex) {
1643 if (tex_index == -1)
1644 fprintf(fp, "tex:indirect ");
1645 else
1646 fprintf(fp, "tex:%d ", tex_index);
1647
1648 if (sampler_index == -1)
1649 fprintf(fp, "samp:indirect ");
1650 else
1651 fprintf(fp, "samp:%d ", sampler_index);
1652 }
1653 break;
1654 }
1655 case ADD_VARYING_INTERP: {
1656 unsigned addr = ADD.op & 0x1f;
1657 if (addr < 0b10100) {
1658 // direct addr
1659 fprintf(fp, "%d", addr);
1660 } else if (addr < 0b11000) {
1661 if (addr == 22)
1662 fprintf(fp, "fragw");
1663 else if (addr == 23)
1664 fprintf(fp, "fragz");
1665 else
1666 fprintf(fp, "unk%d", addr);
1667 } else {
1668 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1669 }
1670 fprintf(fp, ", ");
1671 dump_src(fp, ADD.src0, regs, consts, false);
1672 break;
1673 }
1674 case ADD_VARYING_ADDRESS: {
1675 dump_src(fp, ADD.src0, regs, consts, false);
1676 fprintf(fp, ", ");
1677 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1678 fprintf(fp, ", ");
1679 unsigned location = (ADD.op >> 3) & 0x1f;
1680 if (location < 16) {
1681 fprintf(fp, "location:%d", location);
1682 } else if (location == 20) {
1683 fprintf(fp, "location:%u", (uint32_t) get_const(consts, regs));
1684 } else if (location == 21) {
1685 fprintf(fp, "location:%u", (uint32_t) (get_const(consts, regs) >> 32));
1686 } else {
1687 fprintf(fp, "location:%d(unk)", location);
1688 }
1689 break;
1690 }
1691 case ADD_LOAD_ATTR:
1692 fprintf(fp, "location:%d, ", (ADD.op >> 3) & 0xf);
1693 case ADD_TWO_SRC:
1694 dump_src(fp, ADD.src0, regs, consts, false);
1695 fprintf(fp, ", ");
1696 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1697 break;
1698 case ADD_THREE_SRC:
1699 dump_src(fp, ADD.src0, regs, consts, false);
1700 fprintf(fp, ", ");
1701 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1702 fprintf(fp, ", ");
1703 dump_src(fp, (ADD.op >> 3) & 0x7, regs, consts, false);
1704 break;
1705 case ADD_FADD:
1706 case ADD_FMINMAX:
1707 if (ADD.op & 0x10)
1708 fprintf(fp, "-");
1709 if (ADD.op & 0x1000)
1710 fprintf(fp, "abs(");
1711 dump_src(fp, ADD.src0, regs, consts, false);
1712 switch ((ADD.op >> 6) & 0x3) {
1713 case 3:
1714 fprintf(fp, ".x");
1715 break;
1716 default:
1717 break;
1718 }
1719 if (ADD.op & 0x1000)
1720 fprintf(fp, ")");
1721 fprintf(fp, ", ");
1722 if (ADD.op & 0x20)
1723 fprintf(fp, "-");
1724 if (ADD.op & 0x8)
1725 fprintf(fp, "abs(");
1726 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1727 switch ((ADD.op >> 6) & 0x3) {
1728 case 1:
1729 case 3:
1730 fprintf(fp, ".x");
1731 break;
1732 case 2:
1733 fprintf(fp, ".y");
1734 break;
1735 case 0:
1736 break;
1737 default:
1738 fprintf(fp, ".unk");
1739 break;
1740 }
1741 if (ADD.op & 0x8)
1742 fprintf(fp, ")");
1743 break;
1744 case ADD_FADD16:
1745 if (ADD.op & 0x10)
1746 fprintf(fp, "-");
1747 if (ADD.op & 0x1000)
1748 fprintf(fp, "abs(");
1749 dump_src(fp, ADD.src0, regs, consts, false);
1750 if (ADD.op & 0x1000)
1751 fprintf(fp, ")");
1752 dump_16swizzle(fp, (ADD.op >> 6) & 0x3);
1753 fprintf(fp, ", ");
1754 if (ADD.op & 0x20)
1755 fprintf(fp, "-");
1756 if (ADD.op & 0x8)
1757 fprintf(fp, "abs(");
1758 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1759 dump_16swizzle(fp, (ADD.op >> 8) & 0x3);
1760 if (ADD.op & 0x8)
1761 fprintf(fp, ")");
1762 break;
1763 case ADD_FMINMAX16: {
1764 bool abs1 = ADD.op & 0x8;
1765 bool abs2 = (ADD.op & 0x7) < ADD.src0;
1766 if (ADD.op & 0x10)
1767 fprintf(fp, "-");
1768 if (abs1 || abs2)
1769 fprintf(fp, "abs(");
1770 dump_src(fp, ADD.src0, regs, consts, false);
1771 dump_16swizzle(fp, (ADD.op >> 6) & 0x3);
1772 if (abs1 || abs2)
1773 fprintf(fp, ")");
1774 fprintf(fp, ", ");
1775 if (ADD.op & 0x20)
1776 fprintf(fp, "-");
1777 if (abs1 && abs2)
1778 fprintf(fp, "abs(");
1779 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1780 dump_16swizzle(fp, (ADD.op >> 8) & 0x3);
1781 if (abs1 && abs2)
1782 fprintf(fp, ")");
1783 break;
1784 }
1785 case ADD_FADDMscale: {
1786 if (ADD.op & 0x400)
1787 fprintf(fp, "-");
1788 if (ADD.op & 0x200)
1789 fprintf(fp, "abs(");
1790 dump_src(fp, ADD.src0, regs, consts, false);
1791 if (ADD.op & 0x200)
1792 fprintf(fp, ")");
1793
1794 fprintf(fp, ", ");
1795
1796 if (ADD.op & 0x800)
1797 fprintf(fp, "-");
1798 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1799
1800 fprintf(fp, ", ");
1801
1802 dump_src(fp, (ADD.op >> 3) & 0x7, regs, consts, false);
1803 break;
1804 }
1805 case ADD_FCMP:
1806 if (ADD.op & 0x400) {
1807 fprintf(fp, "-");
1808 }
1809 if (ADD.op & 0x100) {
1810 fprintf(fp, "abs(");
1811 }
1812 dump_src(fp, ADD.src0, regs, consts, false);
1813 switch ((ADD.op >> 6) & 0x3) {
1814 case 3:
1815 fprintf(fp, ".x");
1816 break;
1817 default:
1818 break;
1819 }
1820 if (ADD.op & 0x100) {
1821 fprintf(fp, ")");
1822 }
1823 fprintf(fp, ", ");
1824 if (ADD.op & 0x200) {
1825 fprintf(fp, "abs(");
1826 }
1827 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1828 switch ((ADD.op >> 6) & 0x3) {
1829 case 1:
1830 case 3:
1831 fprintf(fp, ".x");
1832 break;
1833 case 2:
1834 fprintf(fp, ".y");
1835 break;
1836 case 0:
1837 break;
1838 default:
1839 fprintf(fp, ".unk");
1840 break;
1841 }
1842 if (ADD.op & 0x200) {
1843 fprintf(fp, ")");
1844 }
1845 break;
1846 case ADD_FCMP16:
1847 dump_src(fp, ADD.src0, regs, consts, false);
1848 dump_16swizzle(fp, (ADD.op >> 6) & 0x3);
1849 fprintf(fp, ", ");
1850 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1851 dump_16swizzle(fp, (ADD.op >> 8) & 0x3);
1852 break;
1853 case ADD_BRANCH: {
1854 enum branch_code code = (enum branch_code) ((ADD.op >> 6) & 0x3f);
1855 enum branch_bit_size size = (enum branch_bit_size) ((ADD.op >> 9) & 0x7);
1856 if (code != BR_ALWAYS) {
1857 dump_src(fp, ADD.src0, regs, consts, false);
1858 switch (size) {
1859 case BR_SIZE_16XX:
1860 fprintf(fp, ".x");
1861 break;
1862 case BR_SIZE_16YY:
1863 case BR_SIZE_16YX0:
1864 case BR_SIZE_16YX1:
1865 fprintf(fp, ".y");
1866 break;
1867 case BR_SIZE_ZERO: {
1868 unsigned ctrl = (ADD.op >> 1) & 0x3;
1869 switch (ctrl) {
1870 case 1:
1871 fprintf(fp, ".y");
1872 break;
1873 case 2:
1874 fprintf(fp, ".x");
1875 break;
1876 default:
1877 break;
1878 }
1879 }
1880 default:
1881 break;
1882 }
1883 fprintf(fp, ", ");
1884 }
1885 if (code != BR_ALWAYS && size != BR_SIZE_ZERO) {
1886 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1887 switch (size) {
1888 case BR_SIZE_16XX:
1889 case BR_SIZE_16YX0:
1890 case BR_SIZE_16YX1:
1891 case BR_SIZE_32_AND_16X:
1892 fprintf(fp, ".x");
1893 break;
1894 case BR_SIZE_16YY:
1895 case BR_SIZE_32_AND_16Y:
1896 fprintf(fp, ".y");
1897 break;
1898 default:
1899 break;
1900 }
1901 fprintf(fp, ", ");
1902 }
1903 // I haven't had the chance to test if this actually specifies the
1904 // branch offset, since I couldn't get it to produce values other
1905 // than 5 (uniform/const high), but these three bits are always
1906 // consistent across branch instructions, so it makes sense...
1907 int offsetSrc = (ADD.op >> 3) & 0x7;
1908 if (offsetSrc == 4 || offsetSrc == 5) {
1909 // If the offset is known/constant, we can decode it
1910 uint32_t raw_offset;
1911 if (offsetSrc == 4)
1912 raw_offset = get_const(consts, regs);
1913 else
1914 raw_offset = get_const(consts, regs) >> 32;
1915 // The high 4 bits are flags, while the rest is the
1916 // twos-complement offset in bytes (here we convert to
1917 // clauses).
1918 int32_t branch_offset = ((int32_t) raw_offset << 4) >> 8;
1919
1920 // If high4 is the high 4 bits of the last 64-bit constant,
1921 // this is calculated as (high4 + 4) & 0xf, or 0 if the branch
1922 // offset itself is the last constant. Not sure if this is
1923 // actually used, or just garbage in unused bits, but in any
1924 // case, we can just ignore it here since it's redundant. Note
1925 // that if there is any padding, this will be 4 since the
1926 // padding counts as the last constant.
1927 unsigned flags = raw_offset >> 28;
1928 (void) flags;
1929
1930 // Note: the offset is in bytes, relative to the beginning of the
1931 // current clause, so a zero offset would be a loop back to the
1932 // same clause (annoyingly different from Midgard).
1933 fprintf(fp, "clause_%d", offset + branch_offset);
1934 } else {
1935 dump_src(fp, offsetSrc, regs, consts, false);
1936 }
1937 }
1938 }
1939 if (info.has_data_reg) {
1940 fprintf(fp, ", R%d", data_reg);
1941 }
1942 fprintf(fp, "\n");
1943 }
1944
1945 void dump_instr(FILE *fp, const struct bifrost_alu_inst *instr,
1946 struct bifrost_regs next_regs, uint64_t *consts,
1947 unsigned data_reg, unsigned offset, bool verbose)
1948 {
1949 struct bifrost_regs regs;
1950 memcpy((char *) &regs, (char *) &instr->reg_bits, sizeof(regs));
1951
1952 if (verbose) {
1953 fprintf(fp, "# regs: %016" PRIx64 "\n", instr->reg_bits);
1954 dump_regs(fp, regs);
1955 }
1956 dump_fma(fp, instr->fma_bits, regs, next_regs, consts, verbose);
1957 dump_add(fp, instr->add_bits, regs, next_regs, consts, data_reg, offset, verbose);
1958 }
1959
1960 bool dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offset, bool verbose)
1961 {
1962 // State for a decoded clause
1963 struct bifrost_alu_inst instrs[8] = {};
1964 uint64_t consts[6] = {};
1965 unsigned num_instrs = 0;
1966 unsigned num_consts = 0;
1967 uint64_t header_bits = 0;
1968 bool stopbit = false;
1969
1970 unsigned i;
1971 for (i = 0; ; i++, words += 4) {
1972 if (verbose) {
1973 fprintf(fp, "# ");
1974 for (int j = 0; j < 4; j++)
1975 fprintf(fp, "%08x ", words[3 - j]); // low bit on the right
1976 fprintf(fp, "\n");
1977 }
1978 unsigned tag = bits(words[0], 0, 8);
1979
1980 // speculatively decode some things that are common between many formats, so we can share some code
1981 struct bifrost_alu_inst main_instr = {};
1982 // 20 bits
1983 main_instr.add_bits = bits(words[2], 2, 32 - 13);
1984 // 23 bits
1985 main_instr.fma_bits = bits(words[1], 11, 32) | bits(words[2], 0, 2) << (32 - 11);
1986 // 35 bits
1987 main_instr.reg_bits = ((uint64_t) bits(words[1], 0, 11)) << 24 | (uint64_t) bits(words[0], 8, 32);
1988
1989 uint64_t const0 = bits(words[0], 8, 32) << 4 | (uint64_t) words[1] << 28 | bits(words[2], 0, 4) << 60;
1990 uint64_t const1 = bits(words[2], 4, 32) << 4 | (uint64_t) words[3] << 32;
1991
1992 bool stop = tag & 0x40;
1993
1994 if (verbose) {
1995 fprintf(fp, "# tag: 0x%02x\n", tag);
1996 }
1997 if (tag & 0x80) {
1998 unsigned idx = stop ? 5 : 2;
1999 main_instr.add_bits |= ((tag >> 3) & 0x7) << 17;
2000 instrs[idx + 1] = main_instr;
2001 instrs[idx].add_bits = bits(words[3], 0, 17) | ((tag & 0x7) << 17);
2002 instrs[idx].fma_bits |= bits(words[2], 19, 32) << 10;
2003 consts[0] = bits(words[3], 17, 32) << 4;
2004 } else {
2005 bool done = false;
2006 switch ((tag >> 3) & 0x7) {
2007 case 0x0:
2008 switch (tag & 0x7) {
2009 case 0x3:
2010 main_instr.add_bits |= bits(words[3], 29, 32) << 17;
2011 instrs[1] = main_instr;
2012 num_instrs = 2;
2013 done = stop;
2014 break;
2015 case 0x4:
2016 instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
2017 instrs[2].fma_bits |= bits(words[2], 19, 32) << 10;
2018 consts[0] = const0;
2019 num_instrs = 3;
2020 num_consts = 1;
2021 done = stop;
2022 break;
2023 case 0x1:
2024 case 0x5:
2025 instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
2026 instrs[2].fma_bits |= bits(words[2], 19, 32) << 10;
2027 main_instr.add_bits |= bits(words[3], 26, 29) << 17;
2028 instrs[3] = main_instr;
2029 if ((tag & 0x7) == 0x5) {
2030 num_instrs = 4;
2031 done = stop;
2032 }
2033 break;
2034 case 0x6:
2035 instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
2036 instrs[5].fma_bits |= bits(words[2], 19, 32) << 10;
2037 consts[0] = const0;
2038 num_instrs = 6;
2039 num_consts = 1;
2040 done = stop;
2041 break;
2042 case 0x7:
2043 instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
2044 instrs[5].fma_bits |= bits(words[2], 19, 32) << 10;
2045 main_instr.add_bits |= bits(words[3], 26, 29) << 17;
2046 instrs[6] = main_instr;
2047 num_instrs = 7;
2048 done = stop;
2049 break;
2050 default:
2051 fprintf(fp, "unknown tag bits 0x%02x\n", tag);
2052 }
2053 break;
2054 case 0x2:
2055 case 0x3: {
2056 unsigned idx = ((tag >> 3) & 0x7) == 2 ? 4 : 7;
2057 main_instr.add_bits |= (tag & 0x7) << 17;
2058 instrs[idx] = main_instr;
2059 consts[0] |= (bits(words[2], 19, 32) | ((uint64_t) words[3] << 13)) << 19;
2060 num_consts = 1;
2061 num_instrs = idx + 1;
2062 done = stop;
2063 break;
2064 }
2065 case 0x4: {
2066 unsigned idx = stop ? 4 : 1;
2067 main_instr.add_bits |= (tag & 0x7) << 17;
2068 instrs[idx] = main_instr;
2069 instrs[idx + 1].fma_bits |= bits(words[3], 22, 32);
2070 instrs[idx + 1].reg_bits = bits(words[2], 19, 32) | (bits(words[3], 0, 22) << (32 - 19));
2071 break;
2072 }
2073 case 0x1:
2074 // only constants can come after this
2075 num_instrs = 1;
2076 done = stop;
2077 case 0x5:
2078 header_bits = bits(words[2], 19, 32) | ((uint64_t) words[3] << (32 - 19));
2079 main_instr.add_bits |= (tag & 0x7) << 17;
2080 instrs[0] = main_instr;
2081 break;
2082 case 0x6:
2083 case 0x7: {
2084 unsigned pos = tag & 0xf;
2085 // note that `pos' encodes both the total number of
2086 // instructions and the position in the constant stream,
2087 // presumably because decoded constants and instructions
2088 // share a buffer in the decoder, but we only care about
2089 // the position in the constant stream; the total number of
2090 // instructions is redundant.
2091 unsigned const_idx = 0;
2092 switch (pos) {
2093 case 0:
2094 case 1:
2095 case 2:
2096 case 6:
2097 const_idx = 0;
2098 break;
2099 case 3:
2100 case 4:
2101 case 7:
2102 case 9:
2103 const_idx = 1;
2104 break;
2105 case 5:
2106 case 0xa:
2107 const_idx = 2;
2108 break;
2109 case 8:
2110 case 0xb:
2111 case 0xc:
2112 const_idx = 3;
2113 break;
2114 case 0xd:
2115 const_idx = 4;
2116 break;
2117 default:
2118 fprintf(fp, "# unknown pos 0x%x\n", pos);
2119 break;
2120 }
2121
2122 if (num_consts < const_idx + 2)
2123 num_consts = const_idx + 2;
2124
2125 consts[const_idx] = const0;
2126 consts[const_idx + 1] = const1;
2127 done = stop;
2128 break;
2129 }
2130 default:
2131 break;
2132 }
2133
2134 if (done)
2135 break;
2136 }
2137 }
2138
2139 *size = i + 1;
2140
2141 if (verbose) {
2142 fprintf(fp, "# header: %012" PRIx64 "\n", header_bits);
2143 }
2144
2145 struct bifrost_header header;
2146 memcpy((char *) &header, (char *) &header_bits, sizeof(struct bifrost_header));
2147 dump_header(fp, header, verbose);
2148 if (!header.no_end_of_shader)
2149 stopbit = true;
2150
2151 fprintf(fp, "{\n");
2152 for (i = 0; i < num_instrs; i++) {
2153 struct bifrost_regs next_regs;
2154 if (i + 1 == num_instrs) {
2155 memcpy((char *) &next_regs, (char *) &instrs[0].reg_bits,
2156 sizeof(next_regs));
2157 } else {
2158 memcpy((char *) &next_regs, (char *) &instrs[i + 1].reg_bits,
2159 sizeof(next_regs));
2160 }
2161
2162 dump_instr(fp, &instrs[i], next_regs, consts, header.datareg, offset, verbose);
2163 }
2164 fprintf(fp, "}\n");
2165
2166 if (verbose) {
2167 for (unsigned i = 0; i < num_consts; i++) {
2168 fprintf(fp, "# const%d: %08" PRIx64 "\n", 2 * i, consts[i] & 0xffffffff);
2169 fprintf(fp, "# const%d: %08" PRIx64 "\n", 2 * i + 1, consts[i] >> 32);
2170 }
2171 }
2172 return stopbit;
2173 }
2174
/*
 * Disassemble a Bifrost shader binary and print it clause by clause.
 *
 * fp:      stream the text is written to.
 * code:    start of the shader binary.
 * size:    length of the binary in bytes.
 * verbose: forwarded to dump_clause() to enable raw-bit dumps.
 *
 * Disassembly stops at the first of:
 *  - fewer than one full quadword (16 bytes) remaining,
 *  - an all-zero quadword, which is treated as padding,
 *  - a clause for which dump_clause() returns true,
 *  - a clause that reports a size extending past the end of the buffer.
 */
void disassemble_bifrost(FILE *fp, uint8_t *code, size_t size, bool verbose)
{
        uint32_t *words = (uint32_t *) code;
        uint32_t *words_end = words + (size / 4);
        // used for displaying branch targets
        unsigned offset = 0;

        // Only start a clause when a whole quadword is available. Comparing
        // with != would loop forever (and read out of bounds) if a clause
        // overshot the end or `size` were not a multiple of 16.
        while (words_end - words >= 4) {
                // we don't know what the program-end bit is quite yet, so for now just
                // assume that an all-0 quadword is padding
                static const uint32_t zero[4] = { 0 };
                if (memcmp(words, zero, sizeof(zero)) == 0)
                        break;

                fprintf(fp, "clause_%u:\n", offset);

                unsigned clause_quads = 0;
                if (dump_clause(fp, words, &clause_quads, offset, verbose))
                        break;

                // Never advance the cursor beyond the end of the buffer.
                size_t remaining = (size_t) (words_end - words);
                if ((size_t) clause_quads * 4 > remaining)
                        break;

                words += clause_quads * 4;
                offset += clause_quads;
        }
}
2196