src/panfrost/bifrost/disassemble.c

   1 /*
   2  * Copyright (C) 2019 Connor Abbott <cwabbott0@gmail.com>
   3  * Copyright (C) 2019 Lyude Paul <thatslyude@gmail.com>
   4  * Copyright (C) 2019 Ryan Houdek <Sonicadvance1@gmail.com>
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the "Software"),
   8  * to deal in the Software without restriction, including without limitation
   9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10  * and/or sell copies of the Software, and to permit persons to whom the
  11  * Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice (including the next
  14  * paragraph) shall be included in all copies or substantial portions of the
  15  * Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23  * SOFTWARE.
  24  */
  25
  26 #include <stdbool.h>
  27 #include <stdio.h>
  28 #include <stdint.h>
  29 #include <assert.h>
  30 #include <inttypes.h>
  31 #include <string.h>
  32
  33 #include "bifrost.h"
  34 #include "bifrost_ops.h"
  35 #include "disassemble.h"
  36 #include "util/macros.h"
  37
  38 // return bits (high, lo]
  39 static uint64_t bits(uint32_t word, unsigned lo, unsigned high)
  40 {
  41         if (high == 32)
  42                 return word >> lo;
  43         return (word & ((1 << high) - 1)) >> lo;
  44 }
  45
// Each of these structs represents an instruction that's dispatched in one
// cycle. Note that these instructions are packed in funny ways within the
// clause, hence the need for a separate struct.
//
// The three members hold still-encoded words; nothing in this struct is
// decoded yet.
struct bifrost_alu_inst {
        uint32_t fma_bits; // presumably the raw FMA-unit encoding -- TODO confirm against the unpacking code
        uint32_t add_bits; // presumably the raw ADD-unit encoding -- TODO confirm against the unpacking code
        uint64_t reg_bits; // presumably the raw register-block word (cf. struct bifrost_regs) -- TODO confirm
};
  54
// Register/uniform selection fields for one instruction. The bit-fields add
// up to 35 bits (8 + 6 + 6 + 5 + 6 + 4).
struct bifrost_regs {
        // Bit 7 set: uniform selector (printed as U<2 * low 7 bits>).
        // Otherwise values >= 0x20 pick an embedded constant (see get_const)
        // and smaller values name special sources.
        unsigned uniform_const : 8;
        unsigned reg2 : 6; // register printed for port 2
        unsigned reg3 : 6; // register printed for port 3
        // Only 5 bits wide; get_reg0() borrows bit 0 of reg1 as the sixth bit
        // when ctrl == 0.
        unsigned reg0 : 5;
        // When ctrl == 0 this field is repurposed: bit 0 extends reg0, bit 1
        // disables the port 0 read, and the remaining bits carry the control
        // code (see DecodeRegCtrl).
        unsigned reg1 : 6;
        unsigned ctrl : 4; // control code; 0 means "look in reg1 instead"
};
  63
  64 static unsigned get_reg0(struct bifrost_regs regs)
  65 {
  66         if (regs.ctrl == 0)
  67                 return regs.reg0 | ((regs.reg1 & 0x1) << 5);
  68
  69         return regs.reg0 <= regs.reg1 ? regs.reg0 : 63 - regs.reg0;
  70 }
  71
  72 static unsigned get_reg1(struct bifrost_regs regs)
  73 {
  74         return regs.reg0 <= regs.reg1 ? regs.reg1 : 63 - regs.reg1;
  75 }
  76
  77 enum bifrost_reg_write_unit {
  78         REG_WRITE_NONE = 0, // don't write
  79         REG_WRITE_TWO, // write using reg2
  80         REG_WRITE_THREE, // write using reg3
  81 };
  82
// This represents the decoded version of the ctrl register field, as produced
// by DecodeRegCtrl().
struct bifrost_reg_ctrl {
        bool read_reg0; // port 0 is read (register from get_reg0)
        bool read_reg1; // port 1 is read (register from get_reg1)
        bool read_reg3; // reg3 is used as a read port rather than a write port
        enum bifrost_reg_write_unit fma_write_unit; // where the FMA result is written, if anywhere
        enum bifrost_reg_write_unit add_write_unit; // where the ADD result is written, if anywhere
        bool clause_start; // set for control codes 8, 9, 12 and 13; presumably marks the first instruction of a clause -- TODO confirm
};
  92
  93 enum fma_src_type {
  94         FMA_ONE_SRC,
  95         FMA_TWO_SRC,
  96         FMA_FADD,
  97         FMA_FMINMAX,
  98         FMA_FADD16,
  99         FMA_FMINMAX16,
 100         FMA_FCMP,
 101         FMA_FCMP16,
 102         FMA_THREE_SRC,
 103         FMA_SHIFT,
 104         FMA_FMA,
 105         FMA_FMA16,
 106         FMA_CSEL4,
 107         FMA_FMA_MSCALE,
 108         FMA_SHIFT_ADD64,
 109 };
 110
// One row of the FMA opcode table: how to recognise an opcode and what to
// print for it.
struct fma_op_info {
        unsigned op; // opcode value to match after the src_type-dependent low bits are masked off
        char name[30]; // mnemonic printed by the disassembler (NUL-terminated)
        enum fma_src_type src_type; // operand layout class; selects the mask used for matching
};
 116
 117 enum add_src_type {
 118         ADD_ONE_SRC,
 119         ADD_TWO_SRC,
 120         ADD_FADD,
 121         ADD_FMINMAX,
 122         ADD_FADD16,
 123         ADD_FMINMAX16,
 124         ADD_THREE_SRC,
 125         ADD_FADDMscale,
 126         ADD_FCMP,
 127         ADD_FCMP16,
 128         ADD_TEX_COMPACT, // texture instruction with embedded sampler
 129         ADD_TEX, // texture instruction with sampler/etc. in uniform port
 130         ADD_VARYING_INTERP,
 131         ADD_BLENDING,
 132         ADD_LOAD_ATTR,
 133         ADD_VARYING_ADDRESS,
 134         ADD_BRANCH,
 135 };
 136
// One row of the ADD opcode table; same shape as struct fma_op_info plus a
// data-register flag.
struct add_op_info {
        unsigned op; // opcode value to match
        char name[30]; // mnemonic printed by the disassembler (NUL-terminated)
        enum add_src_type src_type; // operand layout class
        bool has_data_reg; // presumably: the instruction also uses the clause's data register -- TODO confirm against the ADD dumper
};
 143
// Bit-field view of a texture control word. The fields add up to exactly
// 32 bits. NOTE(review): the code that fills this struct is outside this
// excerpt; the field notes below are the original author's.
struct bifrost_tex_ctrl {
        unsigned sampler_index : 4; // also used to signal indirects
        unsigned tex_index : 7;
        bool no_merge_index : 1; // whether to merge (direct) sampler & texture indices
        bool filter : 1; // use the usual filtering pipeline (0 for texelFetch & textureGather)
        unsigned unk0 : 2;
        bool texel_offset : 1; // *Offset()
        bool is_shadow : 1;
        bool is_array : 1;
        unsigned tex_type : 2; // 2D, 3D, Cube, Buffer
        bool compute_lod : 1; // 0 for *Lod()
        bool not_supply_lod : 1; // 0 for *Lod() or when a bias is applied
        bool calc_gradients : 1; // 0 for *Grad()
        unsigned unk1 : 1;
        unsigned result_type : 4; // integer, unsigned, float TODO: why is this 4 bits?
        unsigned unk2 : 4;
};
 161
// Bit-field view of a control word carrying two sampler/texture index pairs.
// The fields add up to exactly 32 bits; each index is only 2 bits wide.
struct bifrost_dual_tex_ctrl {
        unsigned sampler_index0 : 2; // sampler index of the first pair
        unsigned unk0 : 2; // meaning unknown
        unsigned tex_index0 : 2; // texture index of the first pair
        unsigned sampler_index1 : 2; // sampler index of the second pair
        unsigned tex_index1 : 2; // texture index of the second pair
        unsigned unk1 : 22; // meaning unknown
};
 170
// Size/lane selector for the operands of a branch comparison. The numeric
// values are explicit, so they presumably mirror the hardware encoding; the
// code that consumes them is outside this excerpt.
enum branch_bit_size {
        BR_SIZE_32 = 0,
        BR_SIZE_16XX = 1,
        BR_SIZE_16YY = 2,
        // For the above combinations of bitsize and location, an extra bit is
        // encoded via comparing the sources. The only possible source of ambiguity
        // would be if the sources were the same, but then the branch condition
        // would be always true or always false anyways, so we can ignore it. But
        // this no longer works when comparing the y component to the x component,
        // since it's valid to compare the y component of a source against its own
        // x component. Instead, the extra bit is encoded via an extra bitsize.
        BR_SIZE_16YX0 = 3,
        BR_SIZE_16YX1 = 4,
        BR_SIZE_32_AND_16X = 5,
        BR_SIZE_32_AND_16Y = 6,
        // Used for comparisons with zero and always-true, see below. I think this
        // only works for integer comparisons.
        BR_SIZE_ZERO = 7,
};
 190
/* Prototypes for the non-static dump functions. dump_header() is defined
 * right below; dump_instr() and dump_clause() are presumably defined later in
 * this file (not visible in this excerpt).
 */
void dump_header(FILE *fp, struct bifrost_header header, bool verbose);
void dump_instr(FILE *fp, const struct bifrost_alu_inst *instr,
                struct bifrost_regs next_regs, uint64_t *consts,
                unsigned data_reg, unsigned offset, bool verbose);
bool dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offset, bool verbose);
 196
 197 void dump_header(FILE *fp, struct bifrost_header header, bool verbose)
 198 {
 199         if (header.clause_type != 0) {
 200                 fprintf(fp, "id(%du) ", header.scoreboard_index);
 201         }
 202
 203         if (header.scoreboard_deps != 0) {
 204                 fprintf(fp, "next-wait(");
 205                 bool first = true;
 206                 for (unsigned i = 0; i < 8; i++) {
 207                         if (header.scoreboard_deps & (1 << i)) {
 208                                 if (!first) {
 209                                         fprintf(fp, ", ");
 210                                 }
 211                                 fprintf(fp, "%d", i);
 212                                 first = false;
 213                         }
 214                 }
 215                 fprintf(fp, ") ");
 216         }
 217
 218         if (header.datareg_writebarrier)
 219                 fprintf(fp, "data-reg-barrier ");
 220
 221         if (!header.no_end_of_shader)
 222                 fprintf(fp, "eos ");
 223
 224         if (!header.back_to_back) {
 225                 fprintf(fp, "nbb ");
 226                 if (header.branch_cond)
 227                         fprintf(fp, "branch-cond ");
 228                 else
 229                         fprintf(fp, "branch-uncond ");
 230         }
 231
 232         if (header.elide_writes)
 233                 fprintf(fp, "we ");
 234
 235         if (header.suppress_inf)
 236                 fprintf(fp, "suppress-inf ");
 237         if (header.suppress_nan)
 238                 fprintf(fp, "suppress-nan ");
 239
 240         if (header.unk0)
 241                 fprintf(fp, "unk0 ");
 242         if (header.unk1)
 243                 fprintf(fp, "unk1 ");
 244         if  (header.unk2)
 245                 fprintf(fp, "unk2 ");
 246         if (header.unk3)
 247                 fprintf(fp, "unk3 ");
 248         if (header.unk4)
 249                 fprintf(fp, "unk4 ");
 250
 251         fprintf(fp, "\n");
 252
 253         if (verbose) {
 254                 fprintf(fp, "# clause type %d, next clause type %d\n",
 255                        header.clause_type, header.next_clause_type);
 256         }
 257 }
 258
 259 static struct bifrost_reg_ctrl DecodeRegCtrl(FILE *fp, struct bifrost_regs regs)
 260 {
 261         struct bifrost_reg_ctrl decoded = {};
 262         unsigned ctrl;
 263         if (regs.ctrl == 0) {
 264                 ctrl = regs.reg1 >> 2;
 265                 decoded.read_reg0 = !(regs.reg1 & 0x2);
 266                 decoded.read_reg1 = false;
 267         } else {
 268                 ctrl = regs.ctrl;
 269                 decoded.read_reg0 = decoded.read_reg1 = true;
 270         }
 271         switch (ctrl) {
 272         case 1:
 273                 decoded.fma_write_unit = REG_WRITE_TWO;
 274                 break;
 275         case 2:
 276         case 3:
 277                 decoded.fma_write_unit = REG_WRITE_TWO;
 278                 decoded.read_reg3 = true;
 279                 break;
 280         case 4:
 281                 decoded.read_reg3 = true;
 282                 break;
 283         case 5:
 284                 decoded.add_write_unit = REG_WRITE_TWO;
 285                 break;
 286         case 6:
 287                 decoded.add_write_unit = REG_WRITE_TWO;
 288                 decoded.read_reg3 = true;
 289                 break;
 290         case 8:
 291                 decoded.clause_start = true;
 292                 break;
 293         case 9:
 294                 decoded.fma_write_unit = REG_WRITE_TWO;
 295                 decoded.clause_start = true;
 296                 break;
 297         case 11:
 298                 break;
 299         case 12:
 300                 decoded.read_reg3 = true;
 301                 decoded.clause_start = true;
 302                 break;
 303         case 13:
 304                 decoded.add_write_unit = REG_WRITE_TWO;
 305                 decoded.clause_start = true;
 306                 break;
 307
 308         case 7:
 309         case 15:
 310                 decoded.fma_write_unit = REG_WRITE_THREE;
 311                 decoded.add_write_unit = REG_WRITE_TWO;
 312                 break;
 313         default:
 314                 fprintf(fp, "# unknown reg ctrl %d\n", ctrl);
 315         }
 316
 317         return decoded;
 318 }
 319
 320 // Pass in the add_write_unit or fma_write_unit, and this returns which register
 321 // the ADD/FMA units are writing to
 322 static unsigned GetRegToWrite(enum bifrost_reg_write_unit unit, struct bifrost_regs regs)
 323 {
 324         switch (unit) {
 325         case REG_WRITE_TWO:
 326                 return regs.reg2;
 327         case REG_WRITE_THREE:
 328                 return regs.reg3;
 329         default: /* REG_WRITE_NONE */
 330                 assert(0);
 331                 return 0;
 332         }
 333 }
 334
 335 static void dump_regs(FILE *fp, struct bifrost_regs srcs)
 336 {
 337         struct bifrost_reg_ctrl ctrl = DecodeRegCtrl(fp, srcs);
 338         fprintf(fp, "# ");
 339         if (ctrl.read_reg0)
 340                 fprintf(fp, "port 0: R%d ", get_reg0(srcs));
 341         if (ctrl.read_reg1)
 342                 fprintf(fp, "port 1: R%d ", get_reg1(srcs));
 343
 344         if (ctrl.fma_write_unit == REG_WRITE_TWO)
 345                 fprintf(fp, "port 2: R%d (write FMA) ", srcs.reg2);
 346         else if (ctrl.add_write_unit == REG_WRITE_TWO)
 347                 fprintf(fp, "port 2: R%d (write ADD) ", srcs.reg2);
 348
 349         if (ctrl.fma_write_unit == REG_WRITE_THREE)
 350                 fprintf(fp, "port 3: R%d (write FMA) ", srcs.reg3);
 351         else if (ctrl.add_write_unit == REG_WRITE_THREE)
 352                 fprintf(fp, "port 3: R%d (write ADD) ", srcs.reg3);
 353         else if (ctrl.read_reg3)
 354                 fprintf(fp, "port 3: R%d (read) ", srcs.reg3);
 355
 356         if (srcs.uniform_const) {
 357                 if (srcs.uniform_const & 0x80) {
 358                         fprintf(fp, "uniform: U%d", (srcs.uniform_const & 0x7f) * 2);
 359                 }
 360         }
 361
 362         fprintf(fp, "\n");
 363 }
 364 static void dump_const_imm(FILE *fp, uint32_t imm)
 365 {
 366         union {
 367                 float f;
 368                 uint32_t i;
 369         } fi;
 370         fi.i = imm;
 371         fprintf(fp, "0x%08x /* %f */", imm, fi.f);
 372 }
 373
 374 static uint64_t get_const(uint64_t *consts, struct bifrost_regs srcs)
 375 {
 376         unsigned low_bits = srcs.uniform_const & 0xf;
 377         uint64_t imm;
 378         switch (srcs.uniform_const >> 4) {
 379         case 4:
 380                 imm = consts[0];
 381                 break;
 382         case 5:
 383                 imm = consts[1];
 384                 break;
 385         case 6:
 386                 imm = consts[2];
 387                 break;
 388         case 7:
 389                 imm = consts[3];
 390                 break;
 391         case 2:
 392                 imm = consts[4];
 393                 break;
 394         case 3:
 395                 imm = consts[5];
 396                 break;
 397         default:
 398                 assert(0);
 399                 break;
 400         }
 401         return imm | low_bits;
 402 }
 403
 404 static void dump_uniform_const_src(FILE *fp, struct bifrost_regs srcs, uint64_t *consts, bool high32)
 405 {
 406         if (srcs.uniform_const & 0x80) {
 407                 unsigned uniform = (srcs.uniform_const & 0x7f) * 2;
 408                 fprintf(fp, "U%d", uniform + (high32 ? 1 : 0));
 409         } else if (srcs.uniform_const >= 0x20) {
 410                 uint64_t imm = get_const(consts, srcs);
 411                 if (high32)
 412                         dump_const_imm(fp, imm >> 32);
 413                 else
 414                         dump_const_imm(fp, imm);
 415         } else {
 416                 switch (srcs.uniform_const) {
 417                 case 0:
 418                         fprintf(fp, "0");
 419                         break;
 420                 case 5:
 421                         fprintf(fp, "atest-data");
 422                         break;
 423                 case 6:
 424                         fprintf(fp, "sample-ptr");
 425                         break;
 426                 case 8:
 427                 case 9:
 428                 case 10:
 429                 case 11:
 430                 case 12:
 431                 case 13:
 432                 case 14:
 433                 case 15:
 434                         fprintf(fp, "blend-descriptor%u", (unsigned) srcs.uniform_const - 8);
 435                         break;
 436                 default:
 437                         fprintf(fp, "unkConst%u", (unsigned) srcs.uniform_const);
 438                         break;
 439                 }
 440
 441                 if (high32)
 442                         fprintf(fp, ".y");
 443                 else
 444                         fprintf(fp, ".x");
 445         }
 446 }
 447
 448 static void dump_src(FILE *fp, unsigned src, struct bifrost_regs srcs, uint64_t *consts, bool isFMA)
 449 {
 450         switch (src) {
 451         case 0:
 452                 fprintf(fp, "R%d", get_reg0(srcs));
 453                 break;
 454         case 1:
 455                 fprintf(fp, "R%d", get_reg1(srcs));
 456                 break;
 457         case 2:
 458                 fprintf(fp, "R%d", srcs.reg3);
 459                 break;
 460         case 3:
 461                 if (isFMA)
 462                         fprintf(fp, "0");
 463                 else
 464                         fprintf(fp, "T"); // i.e. the output of FMA this cycle
 465                 break;
 466         case 4:
 467                 dump_uniform_const_src(fp, srcs, consts, false);
 468                 break;
 469         case 5:
 470                 dump_uniform_const_src(fp, srcs, consts, true);
 471                 break;
 472         case 6:
 473                 fprintf(fp, "T0");
 474                 break;
 475         case 7:
 476                 fprintf(fp, "T1");
 477                 break;
 478         }
 479 }
 480
 481 static void dump_output_mod(FILE *fp, unsigned mod)
 482 {
 483         switch (mod) {
 484         case 0:
 485                 break;
 486         case 1:
 487                 fprintf(fp, ".clamp_0_inf");
 488                 break; // max(out, 0)
 489         case 2:
 490                 fprintf(fp, ".clamp_m1_1");
 491                 break; // clamp(out, -1, 1)
 492         case 3:
 493                 fprintf(fp, ".clamp_0_1");
 494                 break; // clamp(out, 0, 1)
 495         default:
 496                 break;
 497         }
 498 }
 499
 500 static void dump_minmax_mode(FILE *fp, unsigned mod)
 501 {
 502         switch (mod) {
 503         case 0:
 504                 /* Same as fmax() and fmin() -- return the other number if any
 505                  * number is NaN.  Also always return +0 if one argument is +0 and
 506                  * the other is -0.
 507                  */
 508                 break;
 509         case 1:
 510                 /* Instead of never returning a NaN, always return one. The
 511                  * "greater"/"lesser" NaN is always returned, first by checking the
 512                  * sign and then the mantissa bits.
 513                  */
 514                 fprintf(fp, ".nan_wins");
 515                 break;
 516         case 2:
 517                 /* For max, implement src0 > src1 ? src0 : src1
 518                  * For min, implement src0 < src1 ? src0 : src1
 519                  *
 520                  * This includes handling NaN's and signedness of 0 differently
 521                  * from above, since +0 and -0 compare equal and comparisons always
 522                  * return false for NaN's. As a result, this mode is *not*
 523                  * commutative.
 524                  */
 525                 fprintf(fp, ".src1_wins");
 526                 break;
 527         case 3:
 528                 /* For max, implement src0 < src1 ? src1 : src0
 529                  * For min, implement src0 > src1 ? src1 : src0
 530                  */
 531                 fprintf(fp, ".src0_wins");
 532                 break;
 533         default:
 534                 break;
 535         }
 536 }
 537
 538 static void dump_round_mode(FILE *fp, unsigned mod)
 539 {
 540         switch (mod) {
 541         case 0:
 542                 /* roundTiesToEven, the IEEE default. */
 543                 break;
 544         case 1:
 545                 /* roundTowardPositive in the IEEE spec. */
 546                 fprintf(fp, ".round_pos");
 547                 break;
 548         case 2:
 549                 /* roundTowardNegative in the IEEE spec. */
 550                 fprintf(fp, ".round_neg");
 551                 break;
 552         case 3:
 553                 /* roundTowardZero in the IEEE spec. */
 554                 fprintf(fp, ".round_zero");
 555                 break;
 556         default:
 557                 break;
 558         }
 559 }
 560
 561 static const char *
 562 csel_cond_name(enum bifrost_csel_cond cond)
 563 {
 564         switch (cond) {
 565         case BIFROST_FEQ_F: return "feq.f";
 566         case BIFROST_FGT_F: return "fgt.f";
 567         case BIFROST_FGE_F: return "fge.f";
 568         case BIFROST_IEQ_F: return "ieq.f";
 569         case BIFROST_IGT_I: return "igt.i";
 570         case BIFROST_IGE_I: return "uge.i";
 571         case BIFROST_UGT_I: return "ugt.i";
 572         case BIFROST_UGE_I: return "uge.i";
 573         default: return "invalid";
 574         }
 575 }
 576
// Table of known FMA opcodes, searched linearly by find_fma_op_info().
//
// Each row is { opcode, mnemonic, src_type }. The opcode is the value left
// after find_fma_op_info() has cleared the low bits that the row's src_type
// treats as operand/modifier bits, so rows of different src_type are compared
// under different masks. The first matching row wins, so the order of the
// rows matters.
static const struct fma_op_info FMAOpInfos[] = {
        { 0x00000, "FMA.f32",  FMA_FMA },
        { 0x40000, "MAX.f32", FMA_FMINMAX },
        { 0x44000, "MIN.f32", FMA_FMINMAX },
        { 0x48000, "FCMP.GL", FMA_FCMP },
        { 0x4c000, "FCMP.D3D", FMA_FCMP },
        { 0x4ff98, "ADD.i32", FMA_TWO_SRC },
        { 0x4ffd8, "SUB.i32", FMA_TWO_SRC },
        { 0x4fff0, "SUBB.i32", FMA_TWO_SRC },
        { 0x50000, "FMA_MSCALE", FMA_FMA_MSCALE },
        { 0x58000, "ADD.f32", FMA_FADD },
        { 0x5c000, "CSEL4", FMA_CSEL4 },
        { 0x5d8d0, "ICMP.D3D.GT.v2i16", FMA_TWO_SRC },
        { 0x5d9d0, "UCMP.D3D.GT.v2i16", FMA_TWO_SRC },
        { 0x5dad0, "ICMP.D3D.GE.v2i16", FMA_TWO_SRC },
        { 0x5dbd0, "UCMP.D3D.GE.v2i16", FMA_TWO_SRC },
        { 0x5dcd0, "ICMP.D3D.EQ.v2i16", FMA_TWO_SRC },
        { 0x5de40, "ICMP.GL.GT.i32", FMA_TWO_SRC }, // src0 > src1 ? 1 : 0
        { 0x5de48, "ICMP.GL.GE.i32", FMA_TWO_SRC },
        { 0x5de50, "UCMP.GL.GT.i32", FMA_TWO_SRC },
        { 0x5de58, "UCMP.GL.GE.i32", FMA_TWO_SRC },
        { 0x5de60, "ICMP.GL.EQ.i32", FMA_TWO_SRC },
        { 0x5dec0, "ICMP.D3D.GT.i32", FMA_TWO_SRC }, // src0 > src1 ? ~0 : 0
        { 0x5dec8, "ICMP.D3D.GE.i32", FMA_TWO_SRC },
        { 0x5ded0, "UCMP.D3D.GT.i32", FMA_TWO_SRC },
        { 0x5ded8, "UCMP.D3D.GE.i32", FMA_TWO_SRC },
        { 0x5dee0, "ICMP.D3D.EQ.i32", FMA_TWO_SRC },
        { 0x60000, "RSHIFT_NAND", FMA_SHIFT },
        { 0x61000, "RSHIFT_AND", FMA_SHIFT },
        { 0x62000, "LSHIFT_NAND", FMA_SHIFT },
        { 0x63000, "LSHIFT_AND", FMA_SHIFT }, // (src0 << src2) & src1
        { 0x64000, "RSHIFT_XOR", FMA_SHIFT },
        { 0x65200, "LSHIFT_ADD.i32", FMA_THREE_SRC },
        { 0x65600, "LSHIFT_SUB.i32", FMA_THREE_SRC }, // (src0 << src2) - src1
        { 0x65a00, "LSHIFT_RSUB.i32", FMA_THREE_SRC }, // src1 - (src0 << src2)
        { 0x65e00, "RSHIFT_ADD.i32", FMA_THREE_SRC },
        { 0x66200, "RSHIFT_SUB.i32", FMA_THREE_SRC },
        { 0x66600, "RSHIFT_RSUB.i32", FMA_THREE_SRC },
        { 0x66a00, "ARSHIFT_ADD.i32", FMA_THREE_SRC },
        { 0x66e00, "ARSHIFT_SUB.i32", FMA_THREE_SRC },
        { 0x67200, "ARSHIFT_RSUB.i32", FMA_THREE_SRC },
        { 0x80000, "FMA.v2f16",  FMA_FMA16 },
        { 0xc0000, "MAX.v2f16", FMA_FMINMAX16 },
        { 0xc4000, "MIN.v2f16", FMA_FMINMAX16 },
        { 0xc8000, "FCMP.GL", FMA_FCMP16 },
        { 0xcc000, "FCMP.D3D", FMA_FCMP16 },
        { 0xcf900, "ADD.v2i16", FMA_TWO_SRC },
        { 0xcfc10, "ADDC.i32", FMA_TWO_SRC },
        { 0xcfd80, "ADD.i32.i16.X", FMA_TWO_SRC },
        { 0xcfd90, "ADD.i32.u16.X", FMA_TWO_SRC },
        { 0xcfdc0, "ADD.i32.i16.Y", FMA_TWO_SRC },
        { 0xcfdd0, "ADD.i32.u16.Y", FMA_TWO_SRC },
        { 0xd8000, "ADD.v2f16", FMA_FADD16 },
        { 0xdc000, "CSEL4.v16", FMA_CSEL4 },
        { 0xdd000, "F32_TO_F16", FMA_TWO_SRC },
        { 0xe0046, "F16_TO_I16.XX", FMA_ONE_SRC },
        { 0xe0047, "F16_TO_U16.XX", FMA_ONE_SRC },
        { 0xe004e, "F16_TO_I16.YX", FMA_ONE_SRC },
        { 0xe004f, "F16_TO_U16.YX", FMA_ONE_SRC },
        { 0xe0056, "F16_TO_I16.XY", FMA_ONE_SRC },
        { 0xe0057, "F16_TO_U16.XY", FMA_ONE_SRC },
        { 0xe005e, "F16_TO_I16.YY", FMA_ONE_SRC },
        { 0xe005f, "F16_TO_U16.YY", FMA_ONE_SRC },
        { 0xe00c0, "I16_TO_F16.XX", FMA_ONE_SRC },
        { 0xe00c1, "U16_TO_F16.XX", FMA_ONE_SRC },
        { 0xe00c8, "I16_TO_F16.YX", FMA_ONE_SRC },
        { 0xe00c9, "U16_TO_F16.YX", FMA_ONE_SRC },
        { 0xe00d0, "I16_TO_F16.XY", FMA_ONE_SRC },
        { 0xe00d1, "U16_TO_F16.XY", FMA_ONE_SRC },
        { 0xe00d8, "I16_TO_F16.YY", FMA_ONE_SRC },
        { 0xe00d9, "U16_TO_F16.YY", FMA_ONE_SRC },
        { 0xe0136, "F32_TO_I32", FMA_ONE_SRC },
        { 0xe0137, "F32_TO_U32", FMA_ONE_SRC },
        { 0xe0178, "I32_TO_F32", FMA_ONE_SRC },
        { 0xe0179, "U32_TO_F32", FMA_ONE_SRC },
        { 0xe0198, "I16_TO_I32.X", FMA_ONE_SRC },
        { 0xe0199, "U16_TO_U32.X", FMA_ONE_SRC },
        { 0xe019a, "I16_TO_I32.Y", FMA_ONE_SRC },
        { 0xe019b, "U16_TO_U32.Y", FMA_ONE_SRC },
        { 0xe019c, "I16_TO_F32.X", FMA_ONE_SRC },
        { 0xe019d, "U16_TO_F32.X", FMA_ONE_SRC },
        { 0xe019e, "I16_TO_F32.Y", FMA_ONE_SRC },
        { 0xe019f, "U16_TO_F32.Y", FMA_ONE_SRC },
        { 0xe01a2, "F16_TO_F32.X", FMA_ONE_SRC },
        { 0xe01a3, "F16_TO_F32.Y", FMA_ONE_SRC },
        { 0xe032c, "NOP",  FMA_ONE_SRC },
        { 0xe032d, "MOV",  FMA_ONE_SRC },
        { 0xe032f, "SWZ.YY.v2i16",  FMA_ONE_SRC },
        { 0xe0345, "LOG_FREXPM", FMA_ONE_SRC },
        { 0xe0365, "FRCP_FREXPM", FMA_ONE_SRC },
        { 0xe0375, "FSQRT_FREXPM", FMA_ONE_SRC },
        { 0xe038d, "FRCP_FREXPE", FMA_ONE_SRC },
        { 0xe03a5, "FSQRT_FREXPE", FMA_ONE_SRC },
        { 0xe03ad, "FRSQ_FREXPE", FMA_ONE_SRC },
        { 0xe03c5, "LOG_FREXPE", FMA_ONE_SRC },
        { 0xe03fa, "CLZ", FMA_ONE_SRC },
        { 0xe0b80, "IMAX3", FMA_THREE_SRC },
        { 0xe0bc0, "UMAX3", FMA_THREE_SRC },
        { 0xe0c00, "IMIN3", FMA_THREE_SRC },
        { 0xe0c40, "UMIN3", FMA_THREE_SRC },
        { 0xe0ec5, "ROUND", FMA_ONE_SRC },
        { 0xe0f40, "CSEL", FMA_THREE_SRC }, // src2 != 0 ? src1 : src0
        { 0xe0fc0, "MUX.i32", FMA_THREE_SRC }, // see ADD comment
        { 0xe1805, "ROUNDEVEN", FMA_ONE_SRC },
        { 0xe1845, "CEIL", FMA_ONE_SRC },
        { 0xe1885, "FLOOR", FMA_ONE_SRC },
        { 0xe18c5, "TRUNC", FMA_ONE_SRC },
        { 0xe19b0, "ATAN_LDEXP.Y.f32", FMA_TWO_SRC },
        { 0xe19b8, "ATAN_LDEXP.X.f32", FMA_TWO_SRC },
        { 0xe1c80, "LSHIFT_ADD_LOW32.u32", FMA_SHIFT_ADD64 },
        { 0xe1cc0, "LSHIFT_ADD_LOW32.i64", FMA_SHIFT_ADD64 },
        { 0xe1d80, "LSHIFT_ADD_LOW32.i32", FMA_SHIFT_ADD64 },
        { 0xe1e00, "SEL.XX.i16", FMA_TWO_SRC },
        { 0xe1e08, "SEL.YX.i16", FMA_TWO_SRC },
        { 0xe1e10, "SEL.XY.i16", FMA_TWO_SRC },
        { 0xe1e18, "SEL.YY.i16", FMA_TWO_SRC },
        { 0xe7800, "IMAD", FMA_THREE_SRC },
        { 0xe78db, "POPCNT", FMA_ONE_SRC },
};
 696
 697 static struct fma_op_info find_fma_op_info(unsigned op)
 698 {
 699         for (unsigned i = 0; i < ARRAY_SIZE(FMAOpInfos); i++) {
 700                 unsigned opCmp = ~0;
 701                 switch (FMAOpInfos[i].src_type) {
 702                 case FMA_ONE_SRC:
 703                         opCmp = op;
 704                         break;
 705                 case FMA_TWO_SRC:
 706                         opCmp = op & ~0x7;
 707                         break;
 708                 case FMA_FCMP:
 709                 case FMA_FCMP16:
 710                         opCmp = op & ~0x1fff;
 711                         break;
 712                 case FMA_THREE_SRC:
 713                 case FMA_SHIFT_ADD64:
 714                         opCmp = op & ~0x3f;
 715                         break;
 716                 case FMA_FADD:
 717                 case FMA_FMINMAX:
 718                 case FMA_FADD16:
 719                 case FMA_FMINMAX16:
 720                         opCmp = op & ~0x3fff;
 721                         break;
 722                 case FMA_FMA:
 723                 case FMA_FMA16:
 724                         opCmp = op & ~0x3ffff;
 725                         break;
 726                 case FMA_CSEL4:
 727                 case FMA_SHIFT:
 728                         opCmp = op & ~0xfff;
 729                         break;
 730                 case FMA_FMA_MSCALE:
 731                         opCmp = op & ~0x7fff;
 732                         break;
 733                 default:
 734                         opCmp = ~0;
 735                         break;
 736                 }
 737                 if (FMAOpInfos[i].op == opCmp)
 738                         return FMAOpInfos[i];
 739         }
 740
 741         struct fma_op_info info;
 742         snprintf(info.name, sizeof(info.name), "op%04x", op);
 743         info.op = op;
 744         info.src_type = FMA_THREE_SRC;
 745         return info;
 746 }
 747
 748 static void dump_fcmp(FILE *fp, unsigned op)
 749 {
 750         switch (op) {
 751         case 0:
 752                 fprintf(fp, ".OEQ");
 753                 break;
 754         case 1:
 755                 fprintf(fp, ".OGT");
 756                 break;
 757         case 2:
 758                 fprintf(fp, ".OGE");
 759                 break;
 760         case 3:
 761                 fprintf(fp, ".UNE");
 762                 break;
 763         case 4:
 764                 fprintf(fp, ".OLT");
 765                 break;
 766         case 5:
 767                 fprintf(fp, ".OLE");
 768                 break;
 769         default:
 770                 fprintf(fp, ".unk%d", op);
 771                 break;
 772         }
 773 }
 774
 775 static void dump_16swizzle(FILE *fp, unsigned swiz)
 776 {
 777         if (swiz == 2)
 778                 return;
 779         fprintf(fp, ".%c%c", "xy"[swiz & 1], "xy"[(swiz >> 1) & 1]);
 780 }
 781
 782 static void dump_fma_expand_src0(FILE *fp, unsigned ctrl)
 783 {
 784         switch (ctrl) {
 785         case 3:
 786         case 4:
 787         case 6:
 788                 fprintf(fp, ".x");
 789                 break;
 790         case 5:
 791         case 7:
 792                 fprintf(fp, ".y");
 793                 break;
 794         case 0:
 795         case 1:
 796         case 2:
 797                 break;
 798         default:
 799                 fprintf(fp, ".unk");
 800                 break;
 801         }
 802 }
 803
 804 static void dump_fma_expand_src1(FILE *fp, unsigned ctrl)
 805 {
 806         switch (ctrl) {
 807         case 1:
 808         case 3:
 809                 fprintf(fp, ".x");
 810                 break;
 811         case 2:
 812         case 4:
 813         case 5:
 814                 fprintf(fp, ".y");
 815                 break;
 816         case 0:
 817         case 6:
 818         case 7:
 819                 break;
 820         default:
 821                 fprintf(fp, ".unk");
 822                 break;
 823         }
 824 }
 825
 826 static void dump_fma(FILE *fp, uint64_t word, struct bifrost_regs regs, struct bifrost_regs next_regs, uint64_t *consts, bool verbose)
 827 {
 828         if (verbose) {
 829                 fprintf(fp, "# FMA: %016" PRIx64 "\n", word);
 830         }
 831         struct bifrost_fma_inst FMA;
 832         memcpy((char *) &FMA, (char *) &word, sizeof(struct bifrost_fma_inst));
 833         struct fma_op_info info = find_fma_op_info(FMA.op);
 834
 835         fprintf(fp, "%s", info.name);
 836         if (info.src_type == FMA_FADD ||
 837             info.src_type == FMA_FMINMAX ||
 838             info.src_type == FMA_FMA ||
 839             info.src_type == FMA_FADD16 ||
 840             info.src_type == FMA_FMINMAX16 ||
 841             info.src_type == FMA_FMA16) {
 842                 dump_output_mod(fp, bits(FMA.op, 12, 14));
 843                 switch (info.src_type) {
 844                 case FMA_FADD:
 845                 case FMA_FMA:
 846                 case FMA_FADD16:
 847                 case FMA_FMA16:
 848                         dump_round_mode(fp, bits(FMA.op, 10, 12));
 849                         break;
 850                 case FMA_FMINMAX:
 851                 case FMA_FMINMAX16:
 852                         dump_minmax_mode(fp, bits(FMA.op, 10, 12));
 853                         break;
 854                 default:
 855                         assert(0);
 856                 }
 857         } else if (info.src_type == FMA_FCMP || info.src_type == FMA_FCMP16) {
 858                 dump_fcmp(fp, bits(FMA.op, 10, 13));
 859                 if (info.src_type == FMA_FCMP)
 860                         fprintf(fp, ".f32");
 861                 else
 862                         fprintf(fp, ".v2f16");
 863         } else if (info.src_type == FMA_FMA_MSCALE) {
 864                 if (FMA.op & (1 << 11)) {
 865                         switch ((FMA.op >> 9) & 0x3) {
 866                         case 0:
 867                                 /* This mode seems to do a few things:
 868                                  * - Makes 0 * infinity (and incidentally 0 * nan) return 0,
 869                                  *   since generating a nan would poison the result of
 870                                  *   1/infinity and 1/0.
 871                                  * - Fiddles with which nan is returned in nan * nan,
 872                                  *   presumably to make sure that the same exact nan is
 873                                  *   returned for 1/nan.
 874                                  */
 875                                 fprintf(fp, ".rcp_mode");
 876                                 break;
 877                         case 3:
 878                                 /* Similar to the above, but src0 always wins when multiplying
 879                                  * 0 by infinity.
 880                                  */
 881                                 fprintf(fp, ".sqrt_mode");
 882                                 break;
 883                         default:
 884                                 fprintf(fp, ".unk%d_mode", (int) (FMA.op >> 9) & 0x3);
 885                         }
 886                 } else {
 887                         dump_output_mod(fp, bits(FMA.op, 9, 11));
 888                 }
 889         } else if (info.src_type == FMA_SHIFT) {
 890                 struct bifrost_shift_fma shift;
 891                 memcpy(&shift, &FMA, sizeof(shift));
 892
 893                 if (shift.half == 0x7)
 894                         fprintf(fp, ".v2i16");
 895                 else if (shift.half == 0)
 896                         fprintf(fp, ".i32");
 897                 else if (shift.half == 0x4)
 898                         fprintf(fp, ".v4i8");
 899                 else
 900                         fprintf(fp, ".unk%u", shift.half);
 901
 902                 if (!shift.unk)
 903                         fprintf(fp, ".no_unk");
 904
 905                 if (shift.invert_1)
 906                         fprintf(fp, ".invert_1");
 907
 908                 if (shift.invert_2)
 909                         fprintf(fp, ".invert_2");
 910         }
 911
 912         fprintf(fp, " ");
 913
 914         struct bifrost_reg_ctrl next_ctrl = DecodeRegCtrl(fp, next_regs);
 915         if (next_ctrl.fma_write_unit != REG_WRITE_NONE) {
 916                 fprintf(fp, "{R%d, T0}, ", GetRegToWrite(next_ctrl.fma_write_unit, next_regs));
 917         } else {
 918                 fprintf(fp, "T0, ");
 919         }
 920
 921         switch (info.src_type) {
 922         case FMA_ONE_SRC:
 923                 dump_src(fp, FMA.src0, regs, consts, true);
 924                 break;
 925         case FMA_TWO_SRC:
 926                 dump_src(fp, FMA.src0, regs, consts, true);
 927                 fprintf(fp, ", ");
 928                 dump_src(fp, FMA.op & 0x7, regs, consts, true);
 929                 break;
 930         case FMA_FADD:
 931         case FMA_FMINMAX:
 932                 if (FMA.op & 0x10)
 933                         fprintf(fp, "-");
 934                 if (FMA.op & 0x200)
 935                         fprintf(fp, "abs(");
 936                 dump_src(fp, FMA.src0, regs, consts, true);
 937                 dump_fma_expand_src0(fp, (FMA.op >> 6) & 0x7);
 938                 if (FMA.op & 0x200)
 939                         fprintf(fp, ")");
 940                 fprintf(fp, ", ");
 941                 if (FMA.op & 0x20)
 942                         fprintf(fp, "-");
 943                 if (FMA.op & 0x8)
 944                         fprintf(fp, "abs(");
 945                 dump_src(fp, FMA.op & 0x7, regs, consts, true);
 946                 dump_fma_expand_src1(fp, (FMA.op >> 6) & 0x7);
 947                 if (FMA.op & 0x8)
 948                         fprintf(fp, ")");
 949                 break;
 950         case FMA_FADD16:
 951         case FMA_FMINMAX16: {
 952                 bool abs1 = FMA.op & 0x8;
 953                 bool abs2 = (FMA.op & 0x7) < FMA.src0;
 954                 if (FMA.op & 0x10)
 955                         fprintf(fp, "-");
 956                 if (abs1 || abs2)
 957                         fprintf(fp, "abs(");
 958                 dump_src(fp, FMA.src0, regs, consts, true);
 959                 dump_16swizzle(fp, (FMA.op >> 6) & 0x3);
 960                 if (abs1 || abs2)
 961                         fprintf(fp, ")");
 962                 fprintf(fp, ", ");
 963                 if (FMA.op & 0x20)
 964                         fprintf(fp, "-");
 965                 if (abs1 && abs2)
 966                         fprintf(fp, "abs(");
 967                 dump_src(fp, FMA.op & 0x7, regs, consts, true);
 968                 dump_16swizzle(fp, (FMA.op >> 8) & 0x3);
 969                 if (abs1 && abs2)
 970                         fprintf(fp, ")");
 971                 break;
 972         }
 973         case FMA_FCMP:
 974                 if (FMA.op & 0x200)
 975                         fprintf(fp, "abs(");
 976                 dump_src(fp, FMA.src0, regs, consts, true);
 977                 dump_fma_expand_src0(fp, (FMA.op >> 6) & 0x7);
 978                 if (FMA.op & 0x200)
 979                         fprintf(fp, ")");
 980                 fprintf(fp, ", ");
 981                 if (FMA.op & 0x20)
 982                         fprintf(fp, "-");
 983                 if (FMA.op & 0x8)
 984                         fprintf(fp, "abs(");
 985                 dump_src(fp, FMA.op & 0x7, regs, consts, true);
 986                 dump_fma_expand_src1(fp, (FMA.op >> 6) & 0x7);
 987                 if (FMA.op & 0x8)
 988                         fprintf(fp, ")");
 989                 break;
 990         case FMA_FCMP16:
 991                 dump_src(fp, FMA.src0, regs, consts, true);
 992                 // Note: this is kinda a guess, I haven't seen the blob set this to
 993                 // anything other than the identity, but it matches FMA_TWO_SRCFmod16
 994                 dump_16swizzle(fp, (FMA.op >> 6) & 0x3);
 995                 fprintf(fp, ", ");
 996                 dump_src(fp, FMA.op & 0x7, regs, consts, true);
 997                 dump_16swizzle(fp, (FMA.op >> 8) & 0x3);
 998                 break;
 999         case FMA_SHIFT_ADD64:
1000                 dump_src(fp, FMA.src0, regs, consts, true);
1001                 fprintf(fp, ", ");
1002                 dump_src(fp, FMA.op & 0x7, regs, consts, true);
1003                 fprintf(fp, ", ");
1004                 fprintf(fp, "shift:%u", (FMA.op >> 3) & 0x7);
1005                 break;
1006         case FMA_THREE_SRC:
1007                 dump_src(fp, FMA.src0, regs, consts, true);
1008                 fprintf(fp, ", ");
1009                 dump_src(fp, FMA.op & 0x7, regs, consts, true);
1010                 fprintf(fp, ", ");
1011                 dump_src(fp, (FMA.op >> 3) & 0x7, regs, consts, true);
1012                 break;
1013         case FMA_SHIFT: {
1014                 struct bifrost_shift_fma shift;
1015                 memcpy(&shift, &FMA, sizeof(shift));
1016
1017                 dump_src(fp, shift.src0, regs, consts, true);
1018                 fprintf(fp, ", ");
1019                 dump_src(fp, shift.src1, regs, consts, true);
1020                 fprintf(fp, ", ");
1021                 dump_src(fp, shift.src2, regs, consts, true);
1022                 break;
1023         }
1024         case FMA_FMA:
1025                 if (FMA.op & (1 << 14))
1026                         fprintf(fp, "-");
1027                 if (FMA.op & (1 << 9))
1028                         fprintf(fp, "abs(");
1029                 dump_src(fp, FMA.src0, regs, consts, true);
1030                 dump_fma_expand_src0(fp, (FMA.op >> 6) & 0x7);
1031                 if (FMA.op & (1 << 9))
1032                         fprintf(fp, ")");
1033                 fprintf(fp, ", ");
1034                 if (FMA.op & (1 << 16))
1035                         fprintf(fp, "abs(");
1036                 dump_src(fp, FMA.op & 0x7, regs, consts, true);
1037                 dump_fma_expand_src1(fp, (FMA.op >> 6) & 0x7);
1038                 if (FMA.op & (1 << 16))
1039                         fprintf(fp, ")");
1040                 fprintf(fp, ", ");
1041                 if (FMA.op & (1 << 15))
1042                         fprintf(fp, "-");
1043                 if (FMA.op & (1 << 17))
1044                         fprintf(fp, "abs(");
1045                 dump_src(fp, (FMA.op >> 3) & 0x7, regs, consts, true);
1046                 if (FMA.op & (1 << 17))
1047                         fprintf(fp, ")");
1048                 break;
1049         case FMA_FMA16:
1050                 if (FMA.op & (1 << 14))
1051                         fprintf(fp, "-");
1052                 dump_src(fp, FMA.src0, regs, consts, true);
1053                 dump_16swizzle(fp, (FMA.op >> 6) & 0x3);
1054                 fprintf(fp, ", ");
1055                 dump_src(fp, FMA.op & 0x7, regs, consts, true);
1056                 dump_16swizzle(fp, (FMA.op >> 8) & 0x3);
1057                 fprintf(fp, ", ");
1058                 if (FMA.op & (1 << 15))
1059                         fprintf(fp, "-");
1060                 dump_src(fp, (FMA.op >> 3) & 0x7, regs, consts, true);
1061                 dump_16swizzle(fp, (FMA.op >> 16) & 0x3);
1062                 break;
1063         case FMA_CSEL4: {
1064                 struct bifrost_csel4 csel;
1065                 memcpy(&csel, &FMA, sizeof(csel));
1066                 fprintf(fp, ".%s ", csel_cond_name(csel.cond));
1067
1068                 dump_src(fp, csel.src0, regs, consts, true);
1069                 fprintf(fp, ", ");
1070                 dump_src(fp, csel.src1, regs, consts, true);
1071                 fprintf(fp, ", ");
1072                 dump_src(fp, csel.src2, regs, consts, true);
1073                 fprintf(fp, ", ");
1074                 dump_src(fp, csel.src3, regs, consts, true);
1075                 break;
1076         }
1077         case FMA_FMA_MSCALE:
1078                 if (FMA.op & (1 << 12))
1079                         fprintf(fp, "abs(");
1080                 dump_src(fp, FMA.src0, regs, consts, true);
1081                 if (FMA.op & (1 << 12))
1082                         fprintf(fp, ")");
1083                 fprintf(fp, ", ");
1084                 if (FMA.op & (1 << 13))
1085                         fprintf(fp, "-");
1086                 dump_src(fp, FMA.op & 0x7, regs, consts, true);
1087                 fprintf(fp, ", ");
1088                 if (FMA.op & (1 << 14))
1089                         fprintf(fp, "-");
1090                 dump_src(fp, (FMA.op >> 3) & 0x7, regs, consts, true);
1091                 fprintf(fp, ", ");
1092                 dump_src(fp, (FMA.op >> 6) & 0x7, regs, consts, true);
1093                 break;
1094         }
1095         fprintf(fp, "\n");
1096 }
1097
1098 static const struct add_op_info add_op_infos[] = {
1099         { 0x00000, "MAX.f32", ADD_FMINMAX },
1100         { 0x02000, "MIN.f32", ADD_FMINMAX },
1101         { 0x04000, "ADD.f32", ADD_FADD },
1102         { 0x06000, "FCMP.GL", ADD_FCMP },
1103         { 0x07000, "FCMP.D3D", ADD_FCMP },
1104         { 0x07856, "F16_TO_I16", ADD_ONE_SRC },
1105         { 0x07857, "F16_TO_U16", ADD_ONE_SRC },
1106         { 0x078c0, "I16_TO_F16.XX", ADD_ONE_SRC },
1107         { 0x078c1, "U16_TO_F16.XX", ADD_ONE_SRC },
1108         { 0x078c8, "I16_TO_F16.YX", ADD_ONE_SRC },
1109         { 0x078c9, "U16_TO_F16.YX", ADD_ONE_SRC },
1110         { 0x078d0, "I16_TO_F16.XY", ADD_ONE_SRC },
1111         { 0x078d1, "U16_TO_F16.XY", ADD_ONE_SRC },
1112         { 0x078d8, "I16_TO_F16.YY", ADD_ONE_SRC },
1113         { 0x078d9, "U16_TO_F16.YY", ADD_ONE_SRC },
1114         { 0x07936, "F32_TO_I32", ADD_ONE_SRC },
1115         { 0x07937, "F32_TO_U32", ADD_ONE_SRC },
1116         { 0x07978, "I32_TO_F32", ADD_ONE_SRC },
1117         { 0x07979, "U32_TO_F32", ADD_ONE_SRC },
1118         { 0x07998, "I16_TO_I32.X", ADD_ONE_SRC },
1119         { 0x07999, "U16_TO_U32.X", ADD_ONE_SRC },
1120         { 0x0799a, "I16_TO_I32.Y", ADD_ONE_SRC },
1121         { 0x0799b, "U16_TO_U32.Y", ADD_ONE_SRC },
1122         { 0x0799c, "I16_TO_F32.X", ADD_ONE_SRC },
1123         { 0x0799d, "U16_TO_F32.X", ADD_ONE_SRC },
1124         { 0x0799e, "I16_TO_F32.Y", ADD_ONE_SRC },
1125         { 0x0799f, "U16_TO_F32.Y", ADD_ONE_SRC },
1126         { 0x079a2, "F16_TO_F32.X", ADD_ONE_SRC },
1127         { 0x079a3, "F16_TO_F32.Y", ADD_ONE_SRC },
1128         { 0x07b2b, "SWZ.YX.v2i16",  ADD_ONE_SRC },
1129         { 0x07b2c, "NOP",  ADD_ONE_SRC },
1130         { 0x07b29, "SWZ.XX.v2i16",  ADD_ONE_SRC },
1131         { 0x07b2d, "MOV",  ADD_ONE_SRC },
1132         { 0x07b2f, "SWZ.YY.v2i16",  ADD_ONE_SRC },
1133         { 0x07b65, "FRCP_FREXPM", ADD_ONE_SRC },
1134         { 0x07b75, "FSQRT_FREXPM", ADD_ONE_SRC },
1135         { 0x07b8d, "FRCP_FREXPE", ADD_ONE_SRC },
1136         { 0x07ba5, "FSQRT_FREXPE", ADD_ONE_SRC },
1137         { 0x07bad, "FRSQ_FREXPE", ADD_ONE_SRC },
1138         { 0x07bc5, "FLOG_FREXPE", ADD_ONE_SRC },
1139         { 0x07d45, "CEIL", ADD_ONE_SRC },
1140         { 0x07d85, "FLOOR", ADD_ONE_SRC },
1141         { 0x07dc5, "TRUNC", ADD_ONE_SRC },
1142         { 0x07f18, "LSHIFT_ADD_HIGH32.i32", ADD_TWO_SRC },
1143         { 0x08000, "LD_ATTR.f16", ADD_LOAD_ATTR, true },
1144         { 0x08100, "LD_ATTR.v2f16", ADD_LOAD_ATTR, true },
1145         { 0x08200, "LD_ATTR.v3f16", ADD_LOAD_ATTR, true },
1146         { 0x08300, "LD_ATTR.v4f16", ADD_LOAD_ATTR, true },
1147         { 0x08400, "LD_ATTR.f32", ADD_LOAD_ATTR, true },
1148         { 0x08500, "LD_ATTR.v3f32", ADD_LOAD_ATTR, true },
1149         { 0x08600, "LD_ATTR.v3f32", ADD_LOAD_ATTR, true },
1150         { 0x08700, "LD_ATTR.v4f32", ADD_LOAD_ATTR, true },
1151         { 0x08800, "LD_ATTR.i32", ADD_LOAD_ATTR, true },
1152         { 0x08900, "LD_ATTR.v3i32", ADD_LOAD_ATTR, true },
1153         { 0x08a00, "LD_ATTR.v3i32", ADD_LOAD_ATTR, true },
1154         { 0x08b00, "LD_ATTR.v4i32", ADD_LOAD_ATTR, true },
1155         { 0x08c00, "LD_ATTR.u32", ADD_LOAD_ATTR, true },
1156         { 0x08d00, "LD_ATTR.v3u32", ADD_LOAD_ATTR, true },
1157         { 0x08e00, "LD_ATTR.v3u32", ADD_LOAD_ATTR, true },
1158         { 0x08f00, "LD_ATTR.v4u32", ADD_LOAD_ATTR, true },
1159         { 0x0a000, "LD_VAR.32", ADD_VARYING_INTERP, true },
1160         { 0x0b000, "TEX", ADD_TEX_COMPACT, true },
1161         { 0x0c188, "LOAD.i32", ADD_TWO_SRC, true },
1162         { 0x0c1a0, "LD_UBO.i32", ADD_TWO_SRC, true },
1163         { 0x0c1b8, "LD_SCRATCH.v2i32", ADD_TWO_SRC, true },
1164         { 0x0c1c8, "LOAD.v2i32", ADD_TWO_SRC, true },
1165         { 0x0c1e0, "LD_UBO.v2i32", ADD_TWO_SRC, true },
1166         { 0x0c1f8, "LD_SCRATCH.v2i32", ADD_TWO_SRC, true },
1167         { 0x0c208, "LOAD.v4i32", ADD_TWO_SRC, true },
1168         { 0x0c220, "LD_UBO.v4i32", ADD_TWO_SRC, true },
1169         { 0x0c238, "LD_SCRATCH.v4i32", ADD_TWO_SRC, true },
1170         { 0x0c248, "STORE.v4i32", ADD_TWO_SRC, true },
1171         { 0x0c278, "ST_SCRATCH.v4i32", ADD_TWO_SRC, true },
1172         { 0x0c588, "STORE.i32", ADD_TWO_SRC, true },
1173         { 0x0c5b8, "ST_SCRATCH.i32", ADD_TWO_SRC, true },
1174         { 0x0c5c8, "STORE.v2i32", ADD_TWO_SRC, true },
1175         { 0x0c5f8, "ST_SCRATCH.v2i32", ADD_TWO_SRC, true },
1176         { 0x0c648, "LOAD.u16", ADD_TWO_SRC, true }, // zero-extends
1177         { 0x0ca88, "LOAD.v3i32", ADD_TWO_SRC, true },
1178         { 0x0caa0, "LD_UBO.v3i32", ADD_TWO_SRC, true },
1179         { 0x0cab8, "LD_SCRATCH.v3i32", ADD_TWO_SRC, true },
1180         { 0x0cb88, "STORE.v3i32", ADD_TWO_SRC, true },
1181         { 0x0cbb8, "ST_SCRATCH.v3i32", ADD_TWO_SRC, true },
1182         { 0x0cc00, "FRCP_FAST.f32", ADD_ONE_SRC },
1183         { 0x0cc20, "FRSQ_FAST.f32", ADD_ONE_SRC },
1184         { 0x0ce00, "FRCP_TABLE", ADD_ONE_SRC },
1185         { 0x0ce10, "FRCP_FAST.f16.X", ADD_ONE_SRC },
1186         { 0x0ce20, "FRSQ_TABLE", ADD_ONE_SRC },
1187         { 0x0ce30, "FRCP_FAST.f16.Y", ADD_ONE_SRC },
1188         { 0x0ce50, "FRSQ_FAST.f16.X", ADD_ONE_SRC },
1189         { 0x0ce60, "FRCP_APPROX", ADD_ONE_SRC },
1190         { 0x0ce70, "FRSQ_FAST.f16.Y", ADD_ONE_SRC },
1191         { 0x0cf40, "ATAN_ASSIST", ADD_TWO_SRC },
1192         { 0x0cf48, "ATAN_TABLE", ADD_TWO_SRC },
1193         { 0x0cf50, "SIN_TABLE", ADD_ONE_SRC },
1194         { 0x0cf51, "COS_TABLE", ADD_ONE_SRC },
1195         { 0x0cf58, "EXP_TABLE", ADD_ONE_SRC },
1196         { 0x0cf60, "FLOG2_TABLE", ADD_ONE_SRC },
1197         { 0x0cf64, "FLOGE_TABLE", ADD_ONE_SRC },
1198         { 0x0d000, "BRANCH", ADD_BRANCH },
1199         { 0x0e8c0, "MUX", ADD_THREE_SRC },
1200         { 0x0e9b0, "ATAN_LDEXP.Y.f32", ADD_TWO_SRC },
1201         { 0x0e9b8, "ATAN_LDEXP.X.f32", ADD_TWO_SRC },
1202         { 0x0ea60, "SEL.XX.i16", ADD_TWO_SRC },
1203         { 0x0ea70, "SEL.XY.i16", ADD_TWO_SRC },
1204         { 0x0ea68, "SEL.YX.i16", ADD_TWO_SRC },
1205         { 0x0ea78, "SEL.YY.i16", ADD_TWO_SRC },
1206         { 0x0ec00, "F32_TO_F16", ADD_TWO_SRC },
1207         { 0x0f640, "ICMP.GL.GT", ADD_TWO_SRC }, // src0 > src1 ? 1 : 0
1208         { 0x0f648, "ICMP.GL.GE", ADD_TWO_SRC },
1209         { 0x0f650, "UCMP.GL.GT", ADD_TWO_SRC },
1210         { 0x0f658, "UCMP.GL.GE", ADD_TWO_SRC },
1211         { 0x0f660, "ICMP.GL.EQ", ADD_TWO_SRC },
1212         { 0x0f669, "ICMP.GL.NEQ", ADD_TWO_SRC },
1213         { 0x0f6c0, "ICMP.D3D.GT", ADD_TWO_SRC }, // src0 > src1 ? ~0 : 0
1214         { 0x0f6c8, "ICMP.D3D.GE", ADD_TWO_SRC },
1215         { 0x0f6d0, "UCMP.D3D.GT", ADD_TWO_SRC },
1216         { 0x0f6d8, "UCMP.D3D.GE", ADD_TWO_SRC },
1217         { 0x0f6e0, "ICMP.D3D.EQ", ADD_TWO_SRC },
1218         { 0x10000, "MAX.v2f16", ADD_FMINMAX16 },
1219         { 0x11000, "ADD_MSCALE.f32", ADD_FADDMscale },
1220         { 0x12000, "MIN.v2f16", ADD_FMINMAX16 },
1221         { 0x14000, "ADD.v2f16", ADD_FADD16 },
1222         { 0x17000, "FCMP.D3D", ADD_FCMP16 },
1223         { 0x178c0, "ADD.i32",  ADD_TWO_SRC },
1224         { 0x17900, "ADD.v2i16", ADD_TWO_SRC },
1225         { 0x17ac0, "SUB.i32",  ADD_TWO_SRC },
1226         { 0x17c10, "ADDC.i32", ADD_TWO_SRC }, // adds src0 to the bottom bit of src1
1227         { 0x17d80, "ADD.i32.i16.X", ADD_TWO_SRC },
1228         { 0x17d90, "ADD.i32.u16.X", ADD_TWO_SRC },
1229         { 0x17dc0, "ADD.i32.i16.Y", ADD_TWO_SRC },
1230         { 0x17dd0, "ADD.i32.u16.Y", ADD_TWO_SRC },
1231         { 0x18000, "LD_VAR_ADDR.f16", ADD_VARYING_ADDRESS, true },
1232         { 0x18100, "LD_VAR_ADDR.f32", ADD_VARYING_ADDRESS, true },
1233         { 0x18200, "LD_VAR_ADDR.i32", ADD_VARYING_ADDRESS, true },
1234         { 0x18300, "LD_VAR_ADDR.u32", ADD_VARYING_ADDRESS, true },
1235         { 0x19181, "DISCARD.FEQ.f32", ADD_TWO_SRC, true },
1236         { 0x19189, "DISCARD.FNE.f32", ADD_TWO_SRC, true },
1237         { 0x1918C, "DISCARD.GL.f32", ADD_TWO_SRC, true }, /* Consumes ICMP.GL/etc with fixed 0 argument */
1238         { 0x19190, "DISCARD.FLE.f32", ADD_TWO_SRC, true },
1239         { 0x19198, "DISCARD.FLT.f32", ADD_TWO_SRC, true },
1240         { 0x191e8, "ATEST.f32", ADD_TWO_SRC, true },
1241         { 0x191f0, "ATEST.X.f16", ADD_TWO_SRC, true },
1242         { 0x191f8, "ATEST.Y.f16", ADD_TWO_SRC, true },
1243         { 0x19300, "ST_VAR.v1", ADD_THREE_SRC, true },
1244         { 0x19340, "ST_VAR.v2", ADD_THREE_SRC, true },
1245         { 0x19380, "ST_VAR.v3", ADD_THREE_SRC, true },
1246         { 0x193c0, "ST_VAR.v4", ADD_THREE_SRC, true },
1247         { 0x1952c, "BLEND", ADD_BLENDING, true },
1248         { 0x1a000, "LD_VAR.16", ADD_VARYING_INTERP, true },
1249         { 0x1ae60, "TEX", ADD_TEX, true },
1250         { 0x1c000, "RSHIFT_NAND.i32", ADD_THREE_SRC },
1251         { 0x1c300, "RSHIFT_OR.i32", ADD_THREE_SRC },
1252         { 0x1c400, "RSHIFT_AND.i32", ADD_THREE_SRC },
1253         { 0x1c700, "RSHIFT_NOR.i32", ADD_THREE_SRC },
1254         { 0x1c800, "LSHIFT_NAND.i32", ADD_THREE_SRC },
1255         { 0x1cb00, "LSHIFT_OR.i32", ADD_THREE_SRC },
1256         { 0x1cc00, "LSHIFT_AND.i32", ADD_THREE_SRC },
1257         { 0x1cf00, "LSHIFT_NOR.i32", ADD_THREE_SRC },
1258         { 0x1d000, "RSHIFT_XOR.i32", ADD_THREE_SRC },
1259         { 0x1d100, "RSHIFT_XNOR.i32", ADD_THREE_SRC },
1260         { 0x1d200, "LSHIFT_XOR.i32", ADD_THREE_SRC },
1261         { 0x1d300, "LSHIFT_XNOR.i32", ADD_THREE_SRC },
1262         { 0x1d400, "LSHIFT_ADD.i32", ADD_THREE_SRC },
1263         { 0x1d500, "LSHIFT_SUB.i32", ADD_THREE_SRC },
1264         { 0x1d500, "LSHIFT_RSUB.i32", ADD_THREE_SRC },
1265         { 0x1d700, "RSHIFT_ADD.i32", ADD_THREE_SRC },
1266         { 0x1d800, "RSHIFT_SUB.i32", ADD_THREE_SRC },
1267         { 0x1d900, "RSHIFT_RSUB.i32", ADD_THREE_SRC },
1268         { 0x1da00, "ARSHIFT_ADD.i32", ADD_THREE_SRC },
1269         { 0x1db00, "ARSHIFT_SUB.i32", ADD_THREE_SRC },
1270         { 0x1dc00, "ARSHIFT_RSUB.i32", ADD_THREE_SRC },
1271         { 0x1dd18, "OR.i32",  ADD_TWO_SRC },
1272         { 0x1dd20, "AND.i32",  ADD_TWO_SRC },
1273         { 0x1dd60, "LSHIFT.i32", ADD_TWO_SRC },
1274         { 0x1dd50, "XOR.i32",  ADD_TWO_SRC },
1275         { 0x1dd80, "RSHIFT.i32", ADD_TWO_SRC },
1276         { 0x1dda0, "ARSHIFT.i32", ADD_TWO_SRC },
1277 };
1278
1279 static struct add_op_info find_add_op_info(unsigned op)
1280 {
1281         for (unsigned i = 0; i < ARRAY_SIZE(add_op_infos); i++) {
1282                 unsigned opCmp = ~0;
1283                 switch (add_op_infos[i].src_type) {
1284                 case ADD_ONE_SRC:
1285                 case ADD_BLENDING:
1286                         opCmp = op;
1287                         break;
1288                 case ADD_TWO_SRC:
1289                         opCmp = op & ~0x7;
1290                         break;
1291                 case ADD_THREE_SRC:
1292                         opCmp = op & ~0x3f;
1293                         break;
1294                 case ADD_TEX:
1295                         opCmp = op & ~0xf;
1296                         break;
1297                 case ADD_FADD:
1298                 case ADD_FMINMAX:
1299                 case ADD_FADD16:
1300                         opCmp = op & ~0x1fff;
1301                         break;
1302                 case ADD_FMINMAX16:
1303                 case ADD_FADDMscale:
1304                         opCmp = op & ~0xfff;
1305                         break;
1306                 case ADD_FCMP:
1307                 case ADD_FCMP16:
1308                         opCmp = op & ~0x7ff;
1309                         break;
1310                 case ADD_TEX_COMPACT:
1311                         opCmp = op & ~0x3ff;
1312                         break;
1313                 case ADD_VARYING_INTERP:
1314                         opCmp = op & ~0x7ff;
1315                         break;
1316                 case ADD_VARYING_ADDRESS:
1317                         opCmp = op & ~0xff;
1318                         break;
1319                 case ADD_LOAD_ATTR:
1320                         opCmp = op & ~0x7f;
1321                         break;
1322                 case ADD_BRANCH:
1323                         opCmp = op & ~0xfff;
1324                         break;
1325                 default:
1326                         opCmp = ~0;
1327                         break;
1328                 }
1329                 if (add_op_infos[i].op == opCmp)
1330                         return add_op_infos[i];
1331         }
1332
1333         struct add_op_info info;
1334         snprintf(info.name, sizeof(info.name), "op%04x", op);
1335         info.op = op;
1336         info.src_type = ADD_TWO_SRC;
1337         info.has_data_reg = true;
1338         return info;
1339 }
1340
1341 static void dump_add(FILE *fp, uint64_t word, struct bifrost_regs regs,
1342                      struct bifrost_regs next_regs, uint64_t *consts,
1343                      unsigned data_reg, unsigned offset, bool verbose)
1344 {
1345         if (verbose) {
1346                 fprintf(fp, "# ADD: %016" PRIx64 "\n", word);
1347         }
1348         struct bifrost_add_inst ADD;
1349         memcpy((char *) &ADD, (char *) &word, sizeof(ADD));
1350         struct add_op_info info = find_add_op_info(ADD.op);
1351
1352         fprintf(fp, "%s", info.name);
1353
1354         // float16 seems like it doesn't support output modifiers
1355         if (info.src_type == ADD_FADD || info.src_type == ADD_FMINMAX) {
1356                 // output modifiers
1357                 dump_output_mod(fp, bits(ADD.op, 8, 10));
1358                 if (info.src_type == ADD_FADD)
1359                         dump_round_mode(fp, bits(ADD.op, 10, 12));
1360                 else
1361                         dump_minmax_mode(fp, bits(ADD.op, 10, 12));
1362         } else if (info.src_type == ADD_FCMP || info.src_type == ADD_FCMP16) {
1363                 dump_fcmp(fp, bits(ADD.op, 3, 6));
1364                 if (info.src_type == ADD_FCMP)
1365                         fprintf(fp, ".f32");
1366                 else
1367                         fprintf(fp, ".v2f16");
1368         } else if (info.src_type == ADD_FADDMscale) {
1369                 switch ((ADD.op >> 6) & 0x7) {
1370                 case 0:
1371                         break;
1372                 // causes GPU hangs on G71
1373                 case 1:
1374                         fprintf(fp, ".invalid");
1375                         break;
1376                 // Same as usual outmod value.
1377                 case 2:
1378                         fprintf(fp, ".clamp_0_1");
1379                         break;
1380                 // If src0 is infinite or NaN, flush it to zero so that the other
1381                 // source is passed through unmodified.
1382                 case 3:
1383                         fprintf(fp, ".flush_src0_inf_nan");
1384                         break;
1385                 // Vice versa.
1386                 case 4:
1387                         fprintf(fp, ".flush_src1_inf_nan");
1388                         break;
1389                 // Every other case seems to behave the same as the above?
1390                 default:
1391                         fprintf(fp, ".unk%d", (ADD.op >> 6) & 0x7);
1392                         break;
1393                 }
1394         } else if (info.src_type == ADD_VARYING_INTERP) {
1395                 if (ADD.op & 0x200)
1396                         fprintf(fp, ".reuse");
1397                 if (ADD.op & 0x400)
1398                         fprintf(fp, ".flat");
1399                 switch ((ADD.op >> 7) & 0x3) {
1400                 case 0:
1401                         fprintf(fp, ".per_frag");
1402                         break;
1403                 case 1:
1404                         fprintf(fp, ".centroid");
1405                         break;
1406                 case 2:
1407                         break;
1408                 case 3:
1409                         fprintf(fp, ".explicit");
1410                         break;
1411                 }
1412                 fprintf(fp, ".v%d", ((ADD.op >> 5) & 0x3) + 1);
1413         } else if (info.src_type == ADD_BRANCH) {
1414                 enum branch_code branchCode = (enum branch_code) ((ADD.op >> 6) & 0x3f);
1415                 if (branchCode == BR_ALWAYS) {
1416                         // unconditional branch
1417                 } else {
1418                         enum branch_cond cond = (enum branch_cond) ((ADD.op >> 6) & 0x7);
1419                         enum branch_bit_size size = (enum branch_bit_size) ((ADD.op >> 9) & 0x7);
1420                         bool portSwapped = (ADD.op & 0x7) < ADD.src0;
1421                         // See the comment in branch_bit_size
1422                         if (size == BR_SIZE_16YX0)
1423                                 portSwapped = true;
1424                         if (size == BR_SIZE_16YX1)
1425                                 portSwapped = false;
1426                         // These sizes are only for floating point comparisons, so the
1427                         // non-floating-point comparisons are reused to encode the flipped
1428                         // versions.
1429                         if (size == BR_SIZE_32_AND_16X || size == BR_SIZE_32_AND_16Y)
1430                                 portSwapped = false;
1431                         // There's only one argument, so we reuse the extra argument to
1432                         // encode this.
1433                         if (size == BR_SIZE_ZERO)
1434                                 portSwapped = !(ADD.op & 1);
1435
1436                         switch (cond) {
1437                         case BR_COND_LT:
1438                                 if (portSwapped)
1439                                         fprintf(fp, ".LT.u");
1440                                 else
1441                                         fprintf(fp, ".LT.i");
1442                                 break;
1443                         case BR_COND_LE:
1444                                 if (size == BR_SIZE_32_AND_16X || size == BR_SIZE_32_AND_16Y) {
1445                                         fprintf(fp, ".UNE.f");
1446                                 } else {
1447                                         if (portSwapped)
1448                                                 fprintf(fp, ".LE.u");
1449                                         else
1450                                                 fprintf(fp, ".LE.i");
1451                                 }
1452                                 break;
1453                         case BR_COND_GT:
1454                                 if (portSwapped)
1455                                         fprintf(fp, ".GT.u");
1456                                 else
1457                                         fprintf(fp, ".GT.i");
1458                                 break;
1459                         case BR_COND_GE:
1460                                 if (portSwapped)
1461                                         fprintf(fp, ".GE.u");
1462                                 else
1463                                         fprintf(fp, ".GE.i");
1464                                 break;
1465                         case BR_COND_EQ:
1466                                 if (portSwapped)
1467                                         fprintf(fp, ".NE.i");
1468                                 else
1469                                         fprintf(fp, ".EQ.i");
1470                                 break;
1471                         case BR_COND_OEQ:
1472                                 if (portSwapped)
1473                                         fprintf(fp, ".UNE.f");
1474                                 else
1475                                         fprintf(fp, ".OEQ.f");
1476                                 break;
1477                         case BR_COND_OGT:
1478                                 if (portSwapped)
1479                                         fprintf(fp, ".OGT.unk.f");
1480                                 else
1481                                         fprintf(fp, ".OGT.f");
1482                                 break;
1483                         case BR_COND_OLT:
1484                                 if (portSwapped)
1485                                         fprintf(fp, ".OLT.unk.f");
1486                                 else
1487                                         fprintf(fp, ".OLT.f");
1488                                 break;
1489                         }
1490                         switch (size) {
1491                         case BR_SIZE_32:
1492                         case BR_SIZE_32_AND_16X:
1493                         case BR_SIZE_32_AND_16Y:
1494                                 fprintf(fp, "32");
1495                                 break;
1496                         case BR_SIZE_16XX:
1497                         case BR_SIZE_16YY:
1498                         case BR_SIZE_16YX0:
1499                         case BR_SIZE_16YX1:
1500                                 fprintf(fp, "16");
1501                                 break;
1502                         case BR_SIZE_ZERO: {
1503                                 unsigned ctrl = (ADD.op >> 1) & 0x3;
1504                                 if (ctrl == 0)
1505                                         fprintf(fp, "32.Z");
1506                                 else
1507                                         fprintf(fp, "16.Z");
1508                                 break;
1509                         }
1510                         }
1511                 }
1512         }
1513         fprintf(fp, " ");
1514
1515         struct bifrost_reg_ctrl next_ctrl = DecodeRegCtrl(fp, next_regs);
1516         if (next_ctrl.add_write_unit != REG_WRITE_NONE) {
1517                 fprintf(fp, "{R%d, T1}, ", GetRegToWrite(next_ctrl.add_write_unit, next_regs));
1518         } else {
1519                 fprintf(fp, "T1, ");
1520         }
1521
1522         switch (info.src_type) {
1523         case ADD_BLENDING:
1524                 // Note: in this case, regs.uniform_const == location | 0x8
1525                 // This probably means we can't load uniforms or immediates in the
1526                 // same instruction. This re-uses the encoding that normally means
1527                 // "disabled", where the low 4 bits are ignored. Perhaps the extra
1528                 // 0x8 or'd in indicates this is happening.
1529                 fprintf(fp, "location:%d, ", regs.uniform_const & 0x7);
1530         // fallthrough
1531         case ADD_ONE_SRC:
1532                 dump_src(fp, ADD.src0, regs, consts, false);
1533                 break;
1534         case ADD_TEX:
1535         case ADD_TEX_COMPACT: {
1536                 int tex_index;
1537                 int sampler_index;
1538                 bool dualTex = false;
1539                 if (info.src_type == ADD_TEX_COMPACT) {
1540                         tex_index = (ADD.op >> 3) & 0x7;
1541                         sampler_index = (ADD.op >> 7) & 0x7;
1542                         bool unknown = (ADD.op & 0x40);
1543                         // TODO: figure out if the unknown bit is ever 0
1544                         if (!unknown)
1545                                 fprintf(fp, "unknown ");
1546                 } else {
1547                         uint64_t constVal = get_const(consts, regs);
1548                         uint32_t controlBits = (ADD.op & 0x8) ? (constVal >> 32) : constVal;
1549                         struct bifrost_tex_ctrl ctrl;
1550                         memcpy((char *) &ctrl, (char *) &controlBits, sizeof(ctrl));
1551
1552                         // TODO: figure out what actually triggers dual-tex
1553                         if (ctrl.result_type == 9) {
1554                                 struct bifrost_dual_tex_ctrl dualCtrl;
1555                                 memcpy((char *) &dualCtrl, (char *) &controlBits, sizeof(ctrl));
1556                                 fprintf(fp, "(dualtex) tex0:%d samp0:%d tex1:%d samp1:%d ",
1557                                        dualCtrl.tex_index0, dualCtrl.sampler_index0,
1558                                        dualCtrl.tex_index1, dualCtrl.sampler_index1);
1559                                 if (dualCtrl.unk0 != 3)
1560                                         fprintf(fp, "unk:%d ", dualCtrl.unk0);
1561                                 dualTex = true;
1562                         } else {
1563                                 if (ctrl.no_merge_index) {
1564                                         tex_index = ctrl.tex_index;
1565                                         sampler_index = ctrl.sampler_index;
1566                                 } else {
1567                                         tex_index = sampler_index = ctrl.tex_index;
1568                                         unsigned unk = ctrl.sampler_index >> 2;
1569                                         if (unk != 3)
1570                                                 fprintf(fp, "unk:%d ", unk);
1571                                         if (ctrl.sampler_index & 1)
1572                                                 tex_index = -1;
1573                                         if (ctrl.sampler_index & 2)
1574                                                 sampler_index = -1;
1575                                 }
1576
1577                                 if (ctrl.unk0 != 3)
1578                                         fprintf(fp, "unk0:%d ", ctrl.unk0);
1579                                 if (ctrl.unk1)
1580                                         fprintf(fp, "unk1 ");
1581                                 if (ctrl.unk2 != 0xf)
1582                                         fprintf(fp, "unk2:%x ", ctrl.unk2);
1583
1584                                 switch (ctrl.result_type) {
1585                                 case 0x4:
1586                                         fprintf(fp, "f32 ");
1587                                         break;
1588                                 case 0xe:
1589                                         fprintf(fp, "i32 ");
1590                                         break;
1591                                 case 0xf:
1592                                         fprintf(fp, "u32 ");
1593                                         break;
1594                                 default:
1595                                         fprintf(fp, "unktype(%x) ", ctrl.result_type);
1596                                 }
1597
1598                                 switch (ctrl.tex_type) {
1599                                 case 0:
1600                                         fprintf(fp, "cube ");
1601                                         break;
1602                                 case 1:
1603                                         fprintf(fp, "buffer ");
1604                                         break;
1605                                 case 2:
1606                                         fprintf(fp, "2D ");
1607                                         break;
1608                                 case 3:
1609                                         fprintf(fp, "3D ");
1610                                         break;
1611                                 }
1612
1613                                 if (ctrl.is_shadow)
1614                                         fprintf(fp, "shadow ");
1615                                 if (ctrl.is_array)
1616                                         fprintf(fp, "array ");
1617
1618                                 if (!ctrl.filter) {
1619                                         if (ctrl.calc_gradients) {
1620                                                 int comp = (controlBits >> 20) & 0x3;
1621                                                 fprintf(fp, "txg comp:%d ", comp);
1622                                         } else {
1623                                                 fprintf(fp, "txf ");
1624                                         }
1625                                 } else {
1626                                         if (!ctrl.not_supply_lod) {
1627                                                 if (ctrl.compute_lod)
1628                                                         fprintf(fp, "lod_bias ");
1629                                                 else
1630                                                         fprintf(fp, "lod ");
1631                                         }
1632
1633                                         if (!ctrl.calc_gradients)
1634                                                 fprintf(fp, "grad ");
1635                                 }
1636
1637                                 if (ctrl.texel_offset)
1638                                         fprintf(fp, "offset ");
1639                         }
1640                 }
1641
1642                 if (!dualTex) {
1643                         if (tex_index == -1)
1644                                 fprintf(fp, "tex:indirect ");
1645                         else
1646                                 fprintf(fp, "tex:%d ", tex_index);
1647
1648                         if (sampler_index == -1)
1649                                 fprintf(fp, "samp:indirect ");
1650                         else
1651                                 fprintf(fp, "samp:%d ", sampler_index);
1652                 }
1653                 break;
1654         }
1655         case ADD_VARYING_INTERP: {
1656                 unsigned addr = ADD.op & 0x1f;
1657                 if (addr < 0b10100) {
1658                         // direct addr
1659                         fprintf(fp, "%d", addr);
1660                 } else if (addr < 0b11000) {
1661                         if (addr == 22)
1662                                 fprintf(fp, "fragw");
1663                         else if (addr == 23)
1664                                 fprintf(fp, "fragz");
1665                         else
1666                                 fprintf(fp, "unk%d", addr);
1667                 } else {
1668                         dump_src(fp, ADD.op & 0x7, regs, consts, false);
1669                 }
1670                 fprintf(fp, ", ");
1671                 dump_src(fp, ADD.src0, regs, consts, false);
1672                 break;
1673         }
1674         case ADD_VARYING_ADDRESS: {
1675                 dump_src(fp, ADD.src0, regs, consts, false);
1676                 fprintf(fp, ", ");
1677                 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1678                 fprintf(fp, ", ");
1679                 unsigned location = (ADD.op >> 3) & 0x1f;
1680                 if (location < 16) {
1681                         fprintf(fp, "location:%d", location);
1682                 } else if (location == 20) {
1683                         fprintf(fp, "location:%u", (uint32_t) get_const(consts, regs));
1684                 } else if (location == 21) {
1685                         fprintf(fp, "location:%u", (uint32_t) (get_const(consts, regs) >> 32));
1686                 } else {
1687                         fprintf(fp, "location:%d(unk)", location);
1688                 }
1689                 break;
1690         }
1691         case ADD_LOAD_ATTR:
1692                 fprintf(fp, "location:%d, ", (ADD.op >> 3) & 0xf);
1693         case ADD_TWO_SRC:
1694                 dump_src(fp, ADD.src0, regs, consts, false);
1695                 fprintf(fp, ", ");
1696                 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1697                 break;
1698         case ADD_THREE_SRC:
1699                 dump_src(fp, ADD.src0, regs, consts, false);
1700                 fprintf(fp, ", ");
1701                 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1702                 fprintf(fp, ", ");
1703                 dump_src(fp, (ADD.op >> 3) & 0x7, regs, consts, false);
1704                 break;
1705         case ADD_FADD:
1706         case ADD_FMINMAX:
1707                 if (ADD.op & 0x10)
1708                         fprintf(fp, "-");
1709                 if (ADD.op & 0x1000)
1710                         fprintf(fp, "abs(");
1711                 dump_src(fp, ADD.src0, regs, consts, false);
1712                 switch ((ADD.op >> 6) & 0x3) {
1713                 case 3:
1714                         fprintf(fp, ".x");
1715                         break;
1716                 default:
1717                         break;
1718                 }
1719                 if (ADD.op & 0x1000)
1720                         fprintf(fp, ")");
1721                 fprintf(fp, ", ");
1722                 if (ADD.op & 0x20)
1723                         fprintf(fp, "-");
1724                 if (ADD.op & 0x8)
1725                         fprintf(fp, "abs(");
1726                 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1727                 switch ((ADD.op >> 6) & 0x3) {
1728                 case 1:
1729                 case 3:
1730                         fprintf(fp, ".x");
1731                         break;
1732                 case 2:
1733                         fprintf(fp, ".y");
1734                         break;
1735                 case 0:
1736                         break;
1737                 default:
1738                         fprintf(fp, ".unk");
1739                         break;
1740                 }
1741                 if (ADD.op & 0x8)
1742                         fprintf(fp, ")");
1743                 break;
1744         case ADD_FADD16:
1745                 if (ADD.op & 0x10)
1746                         fprintf(fp, "-");
1747                 if (ADD.op & 0x1000)
1748                         fprintf(fp, "abs(");
1749                 dump_src(fp, ADD.src0, regs, consts, false);
1750                 if (ADD.op & 0x1000)
1751                         fprintf(fp, ")");
1752                 dump_16swizzle(fp, (ADD.op >> 6) & 0x3);
1753                 fprintf(fp, ", ");
1754                 if (ADD.op & 0x20)
1755                         fprintf(fp, "-");
1756                 if (ADD.op & 0x8)
1757                         fprintf(fp, "abs(");
1758                 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1759                 dump_16swizzle(fp, (ADD.op >> 8) & 0x3);
1760                 if (ADD.op & 0x8)
1761                         fprintf(fp, ")");
1762                 break;
1763         case ADD_FMINMAX16: {
1764                 bool abs1 = ADD.op & 0x8;
1765                 bool abs2 = (ADD.op & 0x7) < ADD.src0;
1766                 if (ADD.op & 0x10)
1767                         fprintf(fp, "-");
1768                 if (abs1 || abs2)
1769                         fprintf(fp, "abs(");
1770                 dump_src(fp, ADD.src0, regs, consts, false);
1771                 dump_16swizzle(fp, (ADD.op >> 6) & 0x3);
1772                 if (abs1 || abs2)
1773                         fprintf(fp, ")");
1774                 fprintf(fp, ", ");
1775                 if (ADD.op & 0x20)
1776                         fprintf(fp, "-");
1777                 if (abs1 && abs2)
1778                         fprintf(fp, "abs(");
1779                 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1780                 dump_16swizzle(fp, (ADD.op >> 8) & 0x3);
1781                 if (abs1 && abs2)
1782                         fprintf(fp, ")");
1783                 break;
1784         }
1785         case ADD_FADDMscale: {
1786                 if (ADD.op & 0x400)
1787                         fprintf(fp, "-");
1788                 if (ADD.op & 0x200)
1789                         fprintf(fp, "abs(");
1790                 dump_src(fp, ADD.src0, regs, consts, false);
1791                 if (ADD.op & 0x200)
1792                         fprintf(fp, ")");
1793
1794                 fprintf(fp, ", ");
1795
1796                 if (ADD.op & 0x800)
1797                         fprintf(fp, "-");
1798                 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1799
1800                 fprintf(fp, ", ");
1801
1802                 dump_src(fp, (ADD.op >> 3) & 0x7, regs, consts, false);
1803                 break;
1804         }
1805         case ADD_FCMP:
1806                 if (ADD.op & 0x400) {
1807                         fprintf(fp, "-");
1808                 }
1809                 if (ADD.op & 0x100) {
1810                         fprintf(fp, "abs(");
1811                 }
1812                 dump_src(fp, ADD.src0, regs, consts, false);
1813                 switch ((ADD.op >> 6) & 0x3) {
1814                 case 3:
1815                         fprintf(fp, ".x");
1816                         break;
1817                 default:
1818                         break;
1819                 }
1820                 if (ADD.op & 0x100) {
1821                         fprintf(fp, ")");
1822                 }
1823                 fprintf(fp, ", ");
1824                 if (ADD.op & 0x200) {
1825                         fprintf(fp, "abs(");
1826                 }
1827                 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1828                 switch ((ADD.op >> 6) & 0x3) {
1829                 case 1:
1830                 case 3:
1831                         fprintf(fp, ".x");
1832                         break;
1833                 case 2:
1834                         fprintf(fp, ".y");
1835                         break;
1836                 case 0:
1837                         break;
1838                 default:
1839                         fprintf(fp, ".unk");
1840                         break;
1841                 }
1842                 if (ADD.op & 0x200) {
1843                         fprintf(fp, ")");
1844                 }
1845                 break;
1846         case ADD_FCMP16:
1847                 dump_src(fp, ADD.src0, regs, consts, false);
1848                 dump_16swizzle(fp, (ADD.op >> 6) & 0x3);
1849                 fprintf(fp, ", ");
1850                 dump_src(fp, ADD.op & 0x7, regs, consts, false);
1851                 dump_16swizzle(fp, (ADD.op >> 8) & 0x3);
1852                 break;
1853         case ADD_BRANCH: {
1854                 enum branch_code code = (enum branch_code) ((ADD.op >> 6) & 0x3f);
1855                 enum branch_bit_size size = (enum branch_bit_size) ((ADD.op >> 9) & 0x7);
1856                 if (code != BR_ALWAYS) {
1857                         dump_src(fp, ADD.src0, regs, consts, false);
1858                         switch (size) {
1859                         case BR_SIZE_16XX:
1860                                 fprintf(fp, ".x");
1861                                 break;
1862                         case BR_SIZE_16YY:
1863                         case BR_SIZE_16YX0:
1864                         case BR_SIZE_16YX1:
1865                                 fprintf(fp, ".y");
1866                                 break;
1867                         case BR_SIZE_ZERO: {
1868                                 unsigned ctrl = (ADD.op >> 1) & 0x3;
1869                                 switch (ctrl) {
1870                                 case 1:
1871                                         fprintf(fp, ".y");
1872                                         break;
1873                                 case 2:
1874                                         fprintf(fp, ".x");
1875                                         break;
1876                                 default:
1877                                         break;
1878                                 }
1879                         }
1880                         default:
1881                                 break;
1882                         }
1883                         fprintf(fp, ", ");
1884                 }
1885                 if (code != BR_ALWAYS && size != BR_SIZE_ZERO) {
1886                         dump_src(fp, ADD.op & 0x7, regs, consts, false);
1887                         switch (size) {
1888                         case BR_SIZE_16XX:
1889                         case BR_SIZE_16YX0:
1890                         case BR_SIZE_16YX1:
1891                         case BR_SIZE_32_AND_16X:
1892                                 fprintf(fp, ".x");
1893                                 break;
1894                         case BR_SIZE_16YY:
1895                         case BR_SIZE_32_AND_16Y:
1896                                 fprintf(fp, ".y");
1897                                 break;
1898                         default:
1899                                 break;
1900                         }
1901                         fprintf(fp, ", ");
1902                 }
1903                 // I haven't had the chance to test if this actually specifies the
1904                 // branch offset, since I couldn't get it to produce values other
1905                 // than 5 (uniform/const high), but these three bits are always
1906                 // consistent across branch instructions, so it makes sense...
1907                 int offsetSrc = (ADD.op >> 3) & 0x7;
1908                 if (offsetSrc == 4 || offsetSrc == 5) {
1909                         // If the offset is known/constant, we can decode it
1910                         uint32_t raw_offset;
1911                         if (offsetSrc == 4)
1912                                 raw_offset = get_const(consts, regs);
1913                         else
1914                                 raw_offset = get_const(consts, regs) >> 32;
1915                         // The high 4 bits are flags, while the rest is the
1916                         // twos-complement offset in bytes (here we convert to
1917                         // clauses).
1918                         int32_t branch_offset = ((int32_t) raw_offset << 4) >> 8;
1919
1920                         // If high4 is the high 4 bits of the last 64-bit constant,
1921                         // this is calculated as (high4 + 4) & 0xf, or 0 if the branch
1922                         // offset itself is the last constant. Not sure if this is
1923                         // actually used, or just garbage in unused bits, but in any
1924                         // case, we can just ignore it here since it's redundant. Note
1925                         // that if there is any padding, this will be 4 since the
1926                         // padding counts as the last constant.
1927                         unsigned flags = raw_offset >> 28;
1928                         (void) flags;
1929
1930                         // Note: the offset is in bytes, relative to the beginning of the
1931                         // current clause, so a zero offset would be a loop back to the
1932                         // same clause (annoyingly different from Midgard).
1933                         fprintf(fp, "clause_%d", offset + branch_offset);
1934                 } else {
1935                         dump_src(fp, offsetSrc, regs, consts, false);
1936                 }
1937         }
1938         }
1939         if (info.has_data_reg) {
1940                 fprintf(fp, ", R%d", data_reg);
1941         }
1942         fprintf(fp, "\n");
1943 }
1944
1945 void dump_instr(FILE *fp, const struct bifrost_alu_inst *instr,
1946                 struct bifrost_regs next_regs, uint64_t *consts,
1947                 unsigned data_reg, unsigned offset, bool verbose)
1948 {
1949         struct bifrost_regs regs;
1950         memcpy((char *) &regs, (char *) &instr->reg_bits, sizeof(regs));
1951
1952         if (verbose) {
1953                 fprintf(fp, "# regs: %016" PRIx64 "\n", instr->reg_bits);
1954                 dump_regs(fp, regs);
1955         }
1956         dump_fma(fp, instr->fma_bits, regs, next_regs, consts, verbose);
1957         dump_add(fp, instr->add_bits, regs, next_regs, consts, data_reg, offset, verbose);
1958 }
1959
1960 bool dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offset, bool verbose)
1961 {
1962         // State for a decoded clause
1963         struct bifrost_alu_inst instrs[8] = {};
1964         uint64_t consts[6] = {};
1965         unsigned num_instrs = 0;
1966         unsigned num_consts = 0;
1967         uint64_t header_bits = 0;
1968         bool stopbit = false;
1969
1970         unsigned i;
1971         for (i = 0; ; i++, words += 4) {
1972                 if (verbose) {
1973                         fprintf(fp, "# ");
1974                         for (int j = 0; j < 4; j++)
1975                                 fprintf(fp, "%08x ", words[3 - j]); // low bit on the right
1976                         fprintf(fp, "\n");
1977                 }
1978                 unsigned tag = bits(words[0], 0, 8);
1979
1980                 // speculatively decode some things that are common between many formats, so we can share some code
1981                 struct bifrost_alu_inst main_instr = {};
1982                 // 20 bits
1983                 main_instr.add_bits = bits(words[2], 2, 32 - 13);
1984                 // 23 bits
1985                 main_instr.fma_bits = bits(words[1], 11, 32) | bits(words[2], 0, 2) << (32 - 11);
1986                 // 35 bits
1987                 main_instr.reg_bits = ((uint64_t) bits(words[1], 0, 11)) << 24 | (uint64_t) bits(words[0], 8, 32);
1988
1989                 uint64_t const0 = bits(words[0], 8, 32) << 4 | (uint64_t) words[1] << 28 | bits(words[2], 0, 4) << 60;
1990                 uint64_t const1 = bits(words[2], 4, 32) << 4 | (uint64_t) words[3] << 32;
1991
1992                 bool stop = tag & 0x40;
1993
1994                 if (verbose) {
1995                         fprintf(fp, "# tag: 0x%02x\n", tag);
1996                 }
1997                 if (tag & 0x80) {
1998                         unsigned idx = stop ? 5 : 2;
1999                         main_instr.add_bits |= ((tag >> 3) & 0x7) << 17;
2000                         instrs[idx + 1] = main_instr;
2001                         instrs[idx].add_bits = bits(words[3], 0, 17) | ((tag & 0x7) << 17);
2002                         instrs[idx].fma_bits |= bits(words[2], 19, 32) << 10;
2003                         consts[0] = bits(words[3], 17, 32) << 4;
2004                 } else {
2005                         bool done = false;
2006                         switch ((tag >> 3) & 0x7) {
2007                         case 0x0:
2008                                 switch (tag & 0x7) {
2009                                 case 0x3:
2010                                         main_instr.add_bits |= bits(words[3], 29, 32) << 17;
2011                                         instrs[1] = main_instr;
2012                                         num_instrs = 2;
2013                                         done = stop;
2014                                         break;
2015                                 case 0x4:
2016                                         instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
2017                                         instrs[2].fma_bits |= bits(words[2], 19, 32) << 10;
2018                                         consts[0] = const0;
2019                                         num_instrs = 3;
2020                                         num_consts = 1;
2021                                         done = stop;
2022                                         break;
2023                                 case 0x1:
2024                                 case 0x5:
2025                                         instrs[2].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
2026                                         instrs[2].fma_bits |= bits(words[2], 19, 32) << 10;
2027                                         main_instr.add_bits |= bits(words[3], 26, 29) << 17;
2028                                         instrs[3] = main_instr;
2029                                         if ((tag & 0x7) == 0x5) {
2030                                                 num_instrs = 4;
2031                                                 done = stop;
2032                                         }
2033                                         break;
2034                                 case 0x6:
2035                                         instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
2036                                         instrs[5].fma_bits |= bits(words[2], 19, 32) << 10;
2037                                         consts[0] = const0;
2038                                         num_instrs = 6;
2039                                         num_consts = 1;
2040                                         done = stop;
2041                                         break;
2042                                 case 0x7:
2043                                         instrs[5].add_bits = bits(words[3], 0, 17) | bits(words[3], 29, 32) << 17;
2044                                         instrs[5].fma_bits |= bits(words[2], 19, 32) << 10;
2045                                         main_instr.add_bits |= bits(words[3], 26, 29) << 17;
2046                                         instrs[6] = main_instr;
2047                                         num_instrs = 7;
2048                                         done = stop;
2049                                         break;
2050                                 default:
2051                                         fprintf(fp, "unknown tag bits 0x%02x\n", tag);
2052                                 }
2053                                 break;
2054                         case 0x2:
2055                         case 0x3: {
2056                                 unsigned idx = ((tag >> 3) & 0x7) == 2 ? 4 : 7;
2057                                 main_instr.add_bits |= (tag & 0x7) << 17;
2058                                 instrs[idx] = main_instr;
2059                                 consts[0] |= (bits(words[2], 19, 32) | ((uint64_t) words[3] << 13)) << 19;
2060                                 num_consts = 1;
2061                                 num_instrs = idx + 1;
2062                                 done = stop;
2063                                 break;
2064                         }
2065                         case 0x4: {
2066                                 unsigned idx = stop ? 4 : 1;
2067                                 main_instr.add_bits |= (tag & 0x7) << 17;
2068                                 instrs[idx] = main_instr;
2069                                 instrs[idx + 1].fma_bits |= bits(words[3], 22, 32);
2070                                 instrs[idx + 1].reg_bits = bits(words[2], 19, 32) | (bits(words[3], 0, 22) << (32 - 19));
2071                                 break;
2072                         }
2073                         case 0x1:
2074                                 // only constants can come after this
2075                                 num_instrs = 1;
2076                                 done = stop;
2077                         case 0x5:
2078                                 header_bits = bits(words[2], 19, 32) | ((uint64_t) words[3] << (32 - 19));
2079                                 main_instr.add_bits |= (tag & 0x7) << 17;
2080                                 instrs[0] = main_instr;
2081                                 break;
2082                         case 0x6:
2083                         case 0x7: {
2084                                 unsigned pos = tag & 0xf;
2085                                 // note that `pos' encodes both the total number of
2086                                 // instructions and the position in the constant stream,
2087                                 // presumably because decoded constants and instructions
2088                                 // share a buffer in the decoder, but we only care about
2089                                 // the position in the constant stream; the total number of
2090                                 // instructions is redundant.
2091                                 unsigned const_idx = 0;
2092                                 switch (pos) {
2093                                 case 0:
2094                                 case 1:
2095                                 case 2:
2096                                 case 6:
2097                                         const_idx = 0;
2098                                         break;
2099                                 case 3:
2100                                 case 4:
2101                                 case 7:
2102                                 case 9:
2103                                         const_idx = 1;
2104                                         break;
2105                                 case 5:
2106                                 case 0xa:
2107                                         const_idx = 2;
2108                                         break;
2109                                 case 8:
2110                                 case 0xb:
2111                                 case 0xc:
2112                                         const_idx = 3;
2113                                         break;
2114                                 case 0xd:
2115                                         const_idx = 4;
2116                                         break;
2117                                 default:
2118                                         fprintf(fp, "# unknown pos 0x%x\n", pos);
2119                                         break;
2120                                 }
2121
2122                                 if (num_consts < const_idx + 2)
2123                                         num_consts = const_idx + 2;
2124
2125                                 consts[const_idx] = const0;
2126                                 consts[const_idx + 1] = const1;
2127                                 done = stop;
2128                                 break;
2129                         }
2130                         default:
2131                                 break;
2132                         }
2133
2134                         if (done)
2135                                 break;
2136                 }
2137         }
2138
2139         *size = i + 1;
2140
2141         if (verbose) {
2142                 fprintf(fp, "# header: %012" PRIx64 "\n", header_bits);
2143         }
2144
2145         struct bifrost_header header;
2146         memcpy((char *) &header, (char *) &header_bits, sizeof(struct bifrost_header));
2147         dump_header(fp, header, verbose);
2148         if (!header.no_end_of_shader)
2149                 stopbit = true;
2150
2151         fprintf(fp, "{\n");
2152         for (i = 0; i < num_instrs; i++) {
2153                 struct bifrost_regs next_regs;
2154                 if (i + 1 == num_instrs) {
2155                         memcpy((char *) &next_regs, (char *) &instrs[0].reg_bits,
2156                                sizeof(next_regs));
2157                 } else {
2158                         memcpy((char *) &next_regs, (char *) &instrs[i + 1].reg_bits,
2159                                sizeof(next_regs));
2160                 }
2161
2162                 dump_instr(fp, &instrs[i], next_regs, consts, header.datareg, offset, verbose);
2163         }
2164         fprintf(fp, "}\n");
2165
2166         if (verbose) {
2167                 for (unsigned i = 0; i < num_consts; i++) {
2168                         fprintf(fp, "# const%d: %08" PRIx64 "\n", 2 * i, consts[i] & 0xffffffff);
2169                         fprintf(fp, "# const%d: %08" PRIx64 "\n", 2 * i + 1, consts[i] >> 32);
2170                 }
2171         }
2172         return stopbit;
2173 }
2174
// Disassemble a Bifrost shader binary, printing each clause to `fp`.
//
// fp:      stream the disassembly is written to
// code:    start of the shader binary
// size:    length of `code` in bytes; only whole 16-byte quadwords are
//          decoded, a trailing partial quadword is ignored
// verbose: forwarded to dump_clause() to enable its extra raw output
//
// Decoding stops at the first of: the end of the buffer, an all-zero
// quadword, a clause for which dump_clause() returns true, or a clause
// whose reported length would run past the end of the buffer.
void disassemble_bifrost(FILE *fp, uint8_t *code, size_t size, bool verbose)
{
        static const uint32_t zero[4] = {0};
        uint32_t *words = (uint32_t *) code;
        // Work in units of quadwords (4 x 32-bit words) so that a buffer whose
        // length is not a multiple of 16 bytes can never make us read a
        // partial quadword or step over the end marker.
        const size_t num_quadwords = size / sizeof(zero);
        // Position of the current clause in quadwords; also used for
        // displaying branch targets.
        unsigned offset = 0;

        while (offset < num_quadwords) {
                // we don't know what the program-end bit is quite yet, so for
                // now just assume that an all-0 quadword is padding
                if (memcmp(words, zero, sizeof(zero)) == 0)
                        break;

                fprintf(fp, "clause_%u:\n", offset);

                // Length of the clause just decoded, in quadwords.
                unsigned clause_size;
                if (dump_clause(fp, words, &clause_size, offset, verbose))
                        break;

                // A clause claiming to extend beyond the buffer means the
                // input is truncated or malformed; stop rather than advance
                // the cursor out of bounds.
                if (clause_size > num_quadwords - offset)
                        break;

                words += (size_t) clause_size * 4;
                offset += clause_size;
        }
}
2196