X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fpanfrost%2Fbifrost%2Fdisassemble.c;h=38fcb8adc85f32122fbd0d231c26ea34c0aab629;hb=5c03340fd1a12c42fce43ba4060f39706663d541;hp=94bdd2501ea0a0514694ca760ccf6780aff5e235;hpb=7c96bd2dc5a0d3d5c7d620e0fe56f1931db52a3c;p=mesa.git diff --git a/src/panfrost/bifrost/disassemble.c b/src/panfrost/bifrost/disassemble.c index 94bdd2501ea..38fcb8adc85 100644 --- a/src/panfrost/bifrost/disassemble.c +++ b/src/panfrost/bifrost/disassemble.c @@ -31,8 +31,8 @@ #include #include "bifrost.h" -#include "bifrost_ops.h" #include "disassemble.h" +#include "bi_print.h" #include "util/macros.h" // return bits (high, lo] @@ -52,15 +52,6 @@ struct bifrost_alu_inst { uint64_t reg_bits; }; -struct bifrost_regs { - unsigned uniform_const : 8; - unsigned reg2 : 6; - unsigned reg3 : 6; - unsigned reg0 : 5; - unsigned reg1 : 6; - unsigned ctrl : 4; -}; - static unsigned get_reg0(struct bifrost_regs regs) { if (regs.ctrl == 0) @@ -74,12 +65,6 @@ static unsigned get_reg1(struct bifrost_regs regs) return regs.reg0 <= regs.reg1 ? regs.reg1 : 63 - regs.reg1; } -enum bifrost_reg_write_unit { - REG_WRITE_NONE = 0, // don't write - REG_WRITE_TWO, // write using reg2 - REG_WRITE_THREE, // write using reg3 -}; - // this represents the decoded version of the ctrl register field. struct bifrost_reg_ctrl { bool read_reg0; @@ -100,6 +85,7 @@ enum fma_src_type { FMA_FCMP, FMA_FCMP16, FMA_THREE_SRC, + FMA_SHIFT, FMA_FMA, FMA_FMA16, FMA_CSEL4, @@ -108,6 +94,7 @@ enum fma_src_type { }; struct fma_op_info { + bool extended; unsigned op; char name[30]; enum fma_src_type src_type; @@ -121,6 +108,7 @@ enum add_src_type { ADD_FADD16, ADD_FMINMAX16, ADD_THREE_SRC, + ADD_SHIFT, ADD_FADDMscale, ADD_FCMP, ADD_FCMP16, @@ -140,53 +128,6 @@ struct add_op_info { bool has_data_reg; }; -struct bifrost_tex_ctrl { - unsigned sampler_index : 4; // also used to signal indirects - unsigned tex_index : 7; - bool no_merge_index : 1; // whether to merge (direct) sampler & texture indices - bool filter : 1; // use the usual filtering pipeline (0 for texelFetch & textureGather) - unsigned unk0 : 2; - bool texel_offset : 1; // *Offset() - bool is_shadow : 1; - bool is_array : 1; - unsigned tex_type : 2; // 2D, 3D, Cube, Buffer - bool compute_lod : 1; // 0 for *Lod() - bool not_supply_lod : 1; // 0 for *Lod() or when a bias is applied - bool calc_gradients : 1; // 0 for *Grad() - unsigned unk1 : 1; - unsigned result_type : 4; // integer, unsigned, float TODO: why is this 4 bits? - unsigned unk2 : 4; -}; - -struct bifrost_dual_tex_ctrl { - unsigned sampler_index0 : 2; - unsigned unk0 : 2; - unsigned tex_index0 : 2; - unsigned sampler_index1 : 2; - unsigned tex_index1 : 2; - unsigned unk1 : 22; -}; - -enum branch_bit_size { - BR_SIZE_32 = 0, - BR_SIZE_16XX = 1, - BR_SIZE_16YY = 2, - // For the above combinations of bitsize and location, an extra bit is - // encoded via comparing the sources. The only possible source of ambiguity - // would be if the sources were the same, but then the branch condition - // would be always true or always false anyways, so we can ignore it. But - // this no longer works when comparing the y component to the x component, - // since it's valid to compare the y component of a source against its own - // x component. Instead, the extra bit is encoded via an extra bitsize. - BR_SIZE_16YX0 = 3, - BR_SIZE_16YX1 = 4, - BR_SIZE_32_AND_16X = 5, - BR_SIZE_32_AND_16Y = 6, - // Used for comparisons with zero and always-true, see below. I think this - // only works for integer comparisons. - BR_SIZE_ZERO = 7, -}; - void dump_header(FILE *fp, struct bifrost_header header, bool verbose); void dump_instr(FILE *fp, const struct bifrost_alu_inst *instr, struct bifrost_regs next_regs, uint64_t *consts, @@ -195,8 +136,15 @@ bool dump_clause(FILE *fp, uint32_t *words, unsigned *size, unsigned offset, boo void dump_header(FILE *fp, struct bifrost_header header, bool verbose) { + fprintf(fp, "id(%du) ", header.scoreboard_index); + if (header.clause_type != 0) { - fprintf(fp, "id(%du) ", header.scoreboard_index); + const char *name = bi_clause_type_name(header.clause_type); + + if (name[0] == '?') + fprintf(fp, "unk%u ", header.clause_type); + else + fprintf(fp, "%s ", name); } if (header.scoreboard_deps != 0) { @@ -477,245 +425,152 @@ static void dump_src(FILE *fp, unsigned src, struct bifrost_regs srcs, uint64_t } } -static void dump_output_mod(FILE *fp, unsigned mod) -{ - switch (mod) { - case 0: - break; - case 1: - fprintf(fp, ".clamp_0_inf"); - break; // max(out, 0) - case 2: - fprintf(fp, ".clamp_m1_1"); - break; // clamp(out, -1, 1) - case 3: - fprintf(fp, ".clamp_0_1"); - break; // clamp(out, 0, 1) - default: - break; - } -} - -static void dump_minmax_mode(FILE *fp, unsigned mod) -{ - switch (mod) { - case 0: - /* Same as fmax() and fmin() -- return the other number if any - * number is NaN. Also always return +0 if one argument is +0 and - * the other is -0. - */ - break; - case 1: - /* Instead of never returning a NaN, always return one. The - * "greater"/"lesser" NaN is always returned, first by checking the - * sign and then the mantissa bits. - */ - fprintf(fp, ".nan_wins"); - break; - case 2: - /* For max, implement src0 > src1 ? src0 : src1 - * For min, implement src0 < src1 ? src0 : src1 - * - * This includes handling NaN's and signedness of 0 differently - * from above, since +0 and -0 compare equal and comparisons always - * return false for NaN's. As a result, this mode is *not* - * commutative. - */ - fprintf(fp, ".src1_wins"); - break; - case 3: - /* For max, implement src0 < src1 ? src1 : src0 - * For min, implement src0 > src1 ? src1 : src0 - */ - fprintf(fp, ".src0_wins"); - break; - default: - break; - } -} - -static void dump_round_mode(FILE *fp, unsigned mod) -{ - switch (mod) { - case 0: - /* roundTiesToEven, the IEEE default. */ - break; - case 1: - /* roundTowardPositive in the IEEE spec. */ - fprintf(fp, ".round_pos"); - break; - case 2: - /* roundTowardNegative in the IEEE spec. */ - fprintf(fp, ".round_neg"); - break; - case 3: - /* roundTowardZero in the IEEE spec. */ - fprintf(fp, ".round_zero"); - break; - default: - break; - } -} - -static const char * -csel_cond_name(enum bifrost_csel_cond cond) -{ - switch (cond) { - case BIFROST_FEQ_F: return "feq.f"; - case BIFROST_FGT_F: return "fgt.f"; - case BIFROST_FGE_F: return "fge.f"; - case BIFROST_IEQ_F: return "ieq.f"; - case BIFROST_IGT_I: return "igt.i"; - case BIFROST_IGE_I: return "uge.i"; - case BIFROST_UGT_I: return "ugt.i"; - case BIFROST_UGE_I: return "uge.i"; - default: return "invalid"; - } -} - static const struct fma_op_info FMAOpInfos[] = { - { 0x00000, "FMA.f32", FMA_FMA }, - { 0x40000, "MAX.f32", FMA_FMINMAX }, - { 0x44000, "MIN.f32", FMA_FMINMAX }, - { 0x48000, "FCMP.GL", FMA_FCMP }, - { 0x4c000, "FCMP.D3D", FMA_FCMP }, - { 0x4ff98, "ADD.i32", FMA_TWO_SRC }, - { 0x4ffd8, "SUB.i32", FMA_TWO_SRC }, - { 0x4fff0, "SUBB.i32", FMA_TWO_SRC }, - { 0x50000, "FMA_MSCALE", FMA_FMA_MSCALE }, - { 0x58000, "ADD.f32", FMA_FADD }, - { 0x5c000, "CSEL4", FMA_CSEL4 }, - { 0x5d8d0, "ICMP.D3D.GT.v2i16", FMA_TWO_SRC }, - { 0x5d9d0, "UCMP.D3D.GT.v2i16", FMA_TWO_SRC }, - { 0x5dad0, "ICMP.D3D.GE.v2i16", FMA_TWO_SRC }, - { 0x5dbd0, "UCMP.D3D.GE.v2i16", FMA_TWO_SRC }, - { 0x5dcd0, "ICMP.D3D.EQ.v2i16", FMA_TWO_SRC }, - { 0x5de40, "ICMP.GL.GT.i32", FMA_TWO_SRC }, // src0 > src1 ? 1 : 0 - { 0x5de48, "ICMP.GL.GE.i32", FMA_TWO_SRC }, - { 0x5de50, "UCMP.GL.GT.i32", FMA_TWO_SRC }, - { 0x5de58, "UCMP.GL.GE.i32", FMA_TWO_SRC }, - { 0x5de60, "ICMP.GL.EQ.i32", FMA_TWO_SRC }, - { 0x5dec0, "ICMP.D3D.GT.i32", FMA_TWO_SRC }, // src0 > src1 ? ~0 : 0 - { 0x5dec8, "ICMP.D3D.GE.i32", FMA_TWO_SRC }, - { 0x5ded0, "UCMP.D3D.GT.i32", FMA_TWO_SRC }, - { 0x5ded8, "UCMP.D3D.GE.i32", FMA_TWO_SRC }, - { 0x5dee0, "ICMP.D3D.EQ.i32", FMA_TWO_SRC }, - { 0x60200, "RSHIFT_NAND.i32", FMA_THREE_SRC }, - { 0x603c0, "RSHIFT_NAND.v2i16", FMA_THREE_SRC }, - { 0x60e00, "RSHIFT_OR.i32", FMA_THREE_SRC }, - { 0x60fc0, "RSHIFT_OR.v2i16", FMA_THREE_SRC }, - { 0x61200, "RSHIFT_AND.i32", FMA_THREE_SRC }, - { 0x613c0, "RSHIFT_AND.v2i16", FMA_THREE_SRC }, - { 0x61e00, "RSHIFT_NOR.i32", FMA_THREE_SRC }, // ~((src0 << src2) | src1) - { 0x61fc0, "RSHIFT_NOR.v2i16", FMA_THREE_SRC }, // ~((src0 << src2) | src1) - { 0x62200, "LSHIFT_NAND.i32", FMA_THREE_SRC }, - { 0x623c0, "LSHIFT_NAND.v2i16", FMA_THREE_SRC }, - { 0x62e00, "LSHIFT_OR.i32", FMA_THREE_SRC }, // (src0 << src2) | src1 - { 0x62fc0, "LSHIFT_OR.v2i16", FMA_THREE_SRC }, // (src0 << src2) | src1 - { 0x63200, "LSHIFT_AND.i32", FMA_THREE_SRC }, // (src0 << src2) & src1 - { 0x633c0, "LSHIFT_AND.v2i16", FMA_THREE_SRC }, - { 0x63e00, "LSHIFT_NOR.i32", FMA_THREE_SRC }, - { 0x63fc0, "LSHIFT_NOR.v2i16", FMA_THREE_SRC }, - { 0x64200, "RSHIFT_XOR.i32", FMA_THREE_SRC }, - { 0x643c0, "RSHIFT_XOR.v2i16", FMA_THREE_SRC }, - { 0x64600, "RSHIFT_XNOR.i32", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1) - { 0x647c0, "RSHIFT_XNOR.v2i16", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1) - { 0x64a00, "LSHIFT_XOR.i32", FMA_THREE_SRC }, - { 0x64bc0, "LSHIFT_XOR.v2i16", FMA_THREE_SRC }, - { 0x64e00, "LSHIFT_XNOR.i32", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1) - { 0x64fc0, "LSHIFT_XNOR.v2i16", FMA_THREE_SRC }, // ~((src0 >> src2) ^ src1) - { 0x65200, "LSHIFT_ADD.i32", FMA_THREE_SRC }, - { 0x65600, "LSHIFT_SUB.i32", FMA_THREE_SRC }, // (src0 << src2) - src1 - { 0x65a00, "LSHIFT_RSUB.i32", FMA_THREE_SRC }, // src1 - (src0 << src2) - { 0x65e00, "RSHIFT_ADD.i32", FMA_THREE_SRC }, - { 0x66200, "RSHIFT_SUB.i32", FMA_THREE_SRC }, - { 0x66600, "RSHIFT_RSUB.i32", FMA_THREE_SRC }, - { 0x66a00, "ARSHIFT_ADD.i32", FMA_THREE_SRC }, - { 0x66e00, "ARSHIFT_SUB.i32", FMA_THREE_SRC }, - { 0x67200, "ARSHIFT_RSUB.i32", FMA_THREE_SRC }, - { 0x80000, "FMA.v2f16", FMA_FMA16 }, - { 0xc0000, "MAX.v2f16", FMA_FMINMAX16 }, - { 0xc4000, "MIN.v2f16", FMA_FMINMAX16 }, - { 0xc8000, "FCMP.GL", FMA_FCMP16 }, - { 0xcc000, "FCMP.D3D", FMA_FCMP16 }, - { 0xcf900, "ADD.v2i16", FMA_TWO_SRC }, - { 0xcfc10, "ADDC.i32", FMA_TWO_SRC }, - { 0xcfd80, "ADD.i32.i16.X", FMA_TWO_SRC }, - { 0xcfd90, "ADD.i32.u16.X", FMA_TWO_SRC }, - { 0xcfdc0, "ADD.i32.i16.Y", FMA_TWO_SRC }, - { 0xcfdd0, "ADD.i32.u16.Y", FMA_TWO_SRC }, - { 0xd8000, "ADD.v2f16", FMA_FADD16 }, - { 0xdc000, "CSEL4.v16", FMA_CSEL4 }, - { 0xdd000, "F32_TO_F16", FMA_TWO_SRC }, - { 0xe0046, "F16_TO_I16.XX", FMA_ONE_SRC }, - { 0xe0047, "F16_TO_U16.XX", FMA_ONE_SRC }, - { 0xe004e, "F16_TO_I16.YX", FMA_ONE_SRC }, - { 0xe004f, "F16_TO_U16.YX", FMA_ONE_SRC }, - { 0xe0056, "F16_TO_I16.XY", FMA_ONE_SRC }, - { 0xe0057, "F16_TO_U16.XY", FMA_ONE_SRC }, - { 0xe005e, "F16_TO_I16.YY", FMA_ONE_SRC }, - { 0xe005f, "F16_TO_U16.YY", FMA_ONE_SRC }, - { 0xe00c0, "I16_TO_F16.XX", FMA_ONE_SRC }, - { 0xe00c1, "U16_TO_F16.XX", FMA_ONE_SRC }, - { 0xe00c8, "I16_TO_F16.YX", FMA_ONE_SRC }, - { 0xe00c9, "U16_TO_F16.YX", FMA_ONE_SRC }, - { 0xe00d0, "I16_TO_F16.XY", FMA_ONE_SRC }, - { 0xe00d1, "U16_TO_F16.XY", FMA_ONE_SRC }, - { 0xe00d8, "I16_TO_F16.YY", FMA_ONE_SRC }, - { 0xe00d9, "U16_TO_F16.YY", FMA_ONE_SRC }, - { 0xe0136, "F32_TO_I32", FMA_ONE_SRC }, - { 0xe0137, "F32_TO_U32", FMA_ONE_SRC }, - { 0xe0178, "I32_TO_F32", FMA_ONE_SRC }, - { 0xe0179, "U32_TO_F32", FMA_ONE_SRC }, - { 0xe0198, "I16_TO_I32.X", FMA_ONE_SRC }, - { 0xe0199, "U16_TO_U32.X", FMA_ONE_SRC }, - { 0xe019a, "I16_TO_I32.Y", FMA_ONE_SRC }, - { 0xe019b, "U16_TO_U32.Y", FMA_ONE_SRC }, - { 0xe019c, "I16_TO_F32.X", FMA_ONE_SRC }, - { 0xe019d, "U16_TO_F32.X", FMA_ONE_SRC }, - { 0xe019e, "I16_TO_F32.Y", FMA_ONE_SRC }, - { 0xe019f, "U16_TO_F32.Y", FMA_ONE_SRC }, - { 0xe01a2, "F16_TO_F32.X", FMA_ONE_SRC }, - { 0xe01a3, "F16_TO_F32.Y", FMA_ONE_SRC }, - { 0xe032c, "NOP", FMA_ONE_SRC }, - { 0xe032d, "MOV", FMA_ONE_SRC }, - { 0xe032f, "SWZ.YY.v2i16", FMA_ONE_SRC }, - { 0xe0345, "LOG_FREXPM", FMA_ONE_SRC }, - { 0xe0365, "FRCP_FREXPM", FMA_ONE_SRC }, - { 0xe0375, "FSQRT_FREXPM", FMA_ONE_SRC }, - { 0xe038d, "FRCP_FREXPE", FMA_ONE_SRC }, - { 0xe03a5, "FSQRT_FREXPE", FMA_ONE_SRC }, - { 0xe03ad, "FRSQ_FREXPE", FMA_ONE_SRC }, - { 0xe03c5, "LOG_FREXPE", FMA_ONE_SRC }, - { 0xe03fa, "CLZ", FMA_ONE_SRC }, - { 0xe0b80, "IMAX3", FMA_THREE_SRC }, - { 0xe0bc0, "UMAX3", FMA_THREE_SRC }, - { 0xe0c00, "IMIN3", FMA_THREE_SRC }, - { 0xe0c40, "UMIN3", FMA_THREE_SRC }, - { 0xe0ec5, "ROUND", FMA_ONE_SRC }, - { 0xe0f40, "CSEL", FMA_THREE_SRC }, // src2 != 0 ? src1 : src0 - { 0xe0fc0, "MUX.i32", FMA_THREE_SRC }, // see ADD comment - { 0xe1805, "ROUNDEVEN", FMA_ONE_SRC }, - { 0xe1845, "CEIL", FMA_ONE_SRC }, - { 0xe1885, "FLOOR", FMA_ONE_SRC }, - { 0xe18c5, "TRUNC", FMA_ONE_SRC }, - { 0xe19b0, "ATAN_LDEXP.Y.f32", FMA_TWO_SRC }, - { 0xe19b8, "ATAN_LDEXP.X.f32", FMA_TWO_SRC }, - { 0xe1c80, "LSHIFT_ADD_LOW32.u32", FMA_SHIFT_ADD64 }, - { 0xe1cc0, "LSHIFT_ADD_LOW32.i64", FMA_SHIFT_ADD64 }, - { 0xe1d80, "LSHIFT_ADD_LOW32.i32", FMA_SHIFT_ADD64 }, - { 0xe1e00, "SEL.XX.i16", FMA_TWO_SRC }, - { 0xe1e08, "SEL.YX.i16", FMA_TWO_SRC }, - { 0xe1e10, "SEL.XY.i16", FMA_TWO_SRC }, - { 0xe1e18, "SEL.YY.i16", FMA_TWO_SRC }, - { 0xe7800, "IMAD", FMA_THREE_SRC }, - { 0xe78db, "POPCNT", FMA_ONE_SRC }, + { false, 0x00000, "FMA.f32", FMA_FMA }, + { false, 0x40000, "MAX.f32", FMA_FMINMAX }, + { false, 0x44000, "MIN.f32", FMA_FMINMAX }, + { false, 0x48000, "FCMP.GL", FMA_FCMP }, + { false, 0x4c000, "FCMP.D3D", FMA_FCMP }, + { false, 0x4ff98, "ADD.i32", FMA_TWO_SRC }, + { false, 0x4ffd8, "SUB.i32", FMA_TWO_SRC }, + { false, 0x4fff0, "SUBB.i32", FMA_TWO_SRC }, + { false, 0x50000, "FMA_MSCALE", FMA_FMA_MSCALE }, + { false, 0x58000, "ADD.f32", FMA_FADD }, + { false, 0x5c000, "CSEL4", FMA_CSEL4 }, + { false, 0x5d8d0, "ICMP.D3D.GT.v2i16", FMA_TWO_SRC }, + { false, 0x5d9d0, "UCMP.D3D.GT.v2i16", FMA_TWO_SRC }, + { false, 0x5dad0, "ICMP.D3D.GE.v2i16", FMA_TWO_SRC }, + { false, 0x5dbd0, "UCMP.D3D.GE.v2i16", FMA_TWO_SRC }, + { false, 0x5dcd0, "ICMP.D3D.EQ.v2i16", FMA_TWO_SRC }, + { false, 0x5de40, "ICMP.GL.GT.i32", FMA_TWO_SRC }, // src0 > src1 ? 1 : 0 + { false, 0x5de48, "ICMP.GL.GE.i32", FMA_TWO_SRC }, + { false, 0x5de50, "UCMP.GL.GT.i32", FMA_TWO_SRC }, + { false, 0x5de58, "UCMP.GL.GE.i32", FMA_TWO_SRC }, + { false, 0x5de60, "ICMP.GL.EQ.i32", FMA_TWO_SRC }, + { false, 0x5dec0, "ICMP.D3D.GT.i32", FMA_TWO_SRC }, // src0 > src1 ? ~0 : 0 + { false, 0x5dec8, "ICMP.D3D.GE.i32", FMA_TWO_SRC }, + { false, 0x5ded0, "UCMP.D3D.GT.i32", FMA_TWO_SRC }, + { false, 0x5ded8, "UCMP.D3D.GE.i32", FMA_TWO_SRC }, + { false, 0x5dee0, "ICMP.D3D.EQ.i32", FMA_TWO_SRC }, + { false, 0x60000, "RSHIFT_NAND", FMA_SHIFT }, + { false, 0x61000, "RSHIFT_AND", FMA_SHIFT }, + { false, 0x62000, "LSHIFT_NAND", FMA_SHIFT }, + { false, 0x63000, "LSHIFT_AND", FMA_SHIFT }, // (src0 << src2) & src1 + { false, 0x64000, "RSHIFT_XOR", FMA_SHIFT }, + { false, 0x65200, "LSHIFT_ADD.i32", FMA_THREE_SRC }, + { false, 0x65600, "LSHIFT_SUB.i32", FMA_THREE_SRC }, // (src0 << src2) - src1 + { false, 0x65a00, "LSHIFT_RSUB.i32", FMA_THREE_SRC }, // src1 - (src0 << src2) + { false, 0x65e00, "RSHIFT_ADD.i32", FMA_THREE_SRC }, + { false, 0x66200, "RSHIFT_SUB.i32", FMA_THREE_SRC }, + { false, 0x66600, "RSHIFT_RSUB.i32", FMA_THREE_SRC }, + { false, 0x66a00, "ARSHIFT_ADD.i32", FMA_THREE_SRC }, + { false, 0x66e00, "ARSHIFT_SUB.i32", FMA_THREE_SRC }, + { false, 0x67200, "ARSHIFT_RSUB.i32", FMA_THREE_SRC }, + { false, 0x80000, "FMA.v2f16", FMA_FMA16 }, + { false, 0xc0000, "MAX.v2f16", FMA_FMINMAX16 }, + { false, 0xc4000, "MIN.v2f16", FMA_FMINMAX16 }, + { false, 0xc8000, "FCMP.GL", FMA_FCMP16 }, + { false, 0xcc000, "FCMP.D3D", FMA_FCMP16 }, + { false, 0xcf900, "ADD.v2i16", FMA_TWO_SRC }, + { false, 0xcfc10, "ADDC.i32", FMA_TWO_SRC }, + { false, 0xcfd80, "ADD.i32.i16.X", FMA_TWO_SRC }, + { false, 0xcfd90, "ADD.i32.u16.X", FMA_TWO_SRC }, + { false, 0xcfdc0, "ADD.i32.i16.Y", FMA_TWO_SRC }, + { false, 0xcfdd0, "ADD.i32.u16.Y", FMA_TWO_SRC }, + { false, 0xd8000, "ADD.v2f16", FMA_FADD16 }, + { false, 0xdc000, "CSEL4.v16", FMA_CSEL4 }, + { false, 0xdd000, "F32_TO_F16", FMA_TWO_SRC }, + + /* TODO: Combine to bifrost_fma_f2i_i2f16 */ + { true, 0x00046, "F16_TO_I16.XX", FMA_ONE_SRC }, + { true, 0x00047, "F16_TO_U16.XX", FMA_ONE_SRC }, + { true, 0x0004e, "F16_TO_I16.YX", FMA_ONE_SRC }, + { true, 0x0004f, "F16_TO_U16.YX", FMA_ONE_SRC }, + { true, 0x00056, "F16_TO_I16.XY", FMA_ONE_SRC }, + { true, 0x00057, "F16_TO_U16.XY", FMA_ONE_SRC }, + { true, 0x0005e, "F16_TO_I16.YY", FMA_ONE_SRC }, + { true, 0x0005f, "F16_TO_U16.YY", FMA_ONE_SRC }, + { true, 0x000c0, "I16_TO_F16.XX", FMA_ONE_SRC }, + { true, 0x000c1, "U16_TO_F16.XX", FMA_ONE_SRC }, + { true, 0x000c8, "I16_TO_F16.YX", FMA_ONE_SRC }, + { true, 0x000c9, "U16_TO_F16.YX", FMA_ONE_SRC }, + { true, 0x000d0, "I16_TO_F16.XY", FMA_ONE_SRC }, + { true, 0x000d1, "U16_TO_F16.XY", FMA_ONE_SRC }, + { true, 0x000d8, "I16_TO_F16.YY", FMA_ONE_SRC }, + { true, 0x000d9, "U16_TO_F16.YY", FMA_ONE_SRC }, + + { true, 0x00136, "F32_TO_I32", FMA_ONE_SRC }, + { true, 0x00137, "F32_TO_U32", FMA_ONE_SRC }, + { true, 0x00178, "I32_TO_F32", FMA_ONE_SRC }, + { true, 0x00179, "U32_TO_F32", FMA_ONE_SRC }, + + /* TODO: cleanup to use bifrost_fma_int16_to_32 */ + { true, 0x00198, "I16_TO_I32.X", FMA_ONE_SRC }, + { true, 0x00199, "U16_TO_U32.X", FMA_ONE_SRC }, + { true, 0x0019a, "I16_TO_I32.Y", FMA_ONE_SRC }, + { true, 0x0019b, "U16_TO_U32.Y", FMA_ONE_SRC }, + { true, 0x0019c, "I16_TO_F32.X", FMA_ONE_SRC }, + { true, 0x0019d, "U16_TO_F32.X", FMA_ONE_SRC }, + { true, 0x0019e, "I16_TO_F32.Y", FMA_ONE_SRC }, + { true, 0x0019f, "U16_TO_F32.Y", FMA_ONE_SRC }, + + { true, 0x001a2, "F16_TO_F32.X", FMA_ONE_SRC }, + { true, 0x001a3, "F16_TO_F32.Y", FMA_ONE_SRC }, + + { true, 0x0032c, "NOP", FMA_ONE_SRC }, + { true, 0x0032d, "MOV", FMA_ONE_SRC }, + { true, 0x0032f, "SWZ.YY.v2i16", FMA_ONE_SRC }, + { true, 0x00345, "LOG_FREXPM", FMA_ONE_SRC }, + { true, 0x00365, "FRCP_FREXPM", FMA_ONE_SRC }, + { true, 0x00375, "FSQRT_FREXPM", FMA_ONE_SRC }, + { true, 0x0038d, "FRCP_FREXPE", FMA_ONE_SRC }, + { true, 0x003a5, "FSQRT_FREXPE", FMA_ONE_SRC }, + { true, 0x003ad, "FRSQ_FREXPE", FMA_ONE_SRC }, + { true, 0x003c5, "LOG_FREXPE", FMA_ONE_SRC }, + { true, 0x003fa, "CLZ", FMA_ONE_SRC }, + { true, 0x00b80, "IMAX3", FMA_THREE_SRC }, + { true, 0x00bc0, "UMAX3", FMA_THREE_SRC }, + { true, 0x00c00, "IMIN3", FMA_THREE_SRC }, + { true, 0x00c40, "UMIN3", FMA_THREE_SRC }, + { true, 0x00ec2, "ROUND.v2f16", FMA_ONE_SRC }, + { true, 0x00ec5, "ROUND.f32", FMA_ONE_SRC }, + { true, 0x00f40, "CSEL", FMA_THREE_SRC }, // src2 != 0 ? src1 : src0 + { true, 0x00fc0, "MUX.i32", FMA_THREE_SRC }, // see ADD comment + { true, 0x01802, "ROUNDEVEN.v2f16", FMA_ONE_SRC }, + { true, 0x01805, "ROUNDEVEN.f32", FMA_ONE_SRC }, + { true, 0x01842, "CEIL.v2f16", FMA_ONE_SRC }, + { true, 0x01845, "CEIL.f32", FMA_ONE_SRC }, + { true, 0x01882, "FLOOR.v2f16", FMA_ONE_SRC }, + { true, 0x01885, "FLOOR.f32", FMA_ONE_SRC }, + { true, 0x018c2, "TRUNC.v2f16", FMA_ONE_SRC }, + { true, 0x018c5, "TRUNC.f32", FMA_ONE_SRC }, + { true, 0x019b0, "ATAN_LDEXP.Y.f32", FMA_TWO_SRC }, + { true, 0x019b8, "ATAN_LDEXP.X.f32", FMA_TWO_SRC }, + { true, 0x01c80, "LSHIFT_ADD_LOW32.u32", FMA_SHIFT_ADD64 }, + { true, 0x01cc0, "LSHIFT_ADD_LOW32.i64", FMA_SHIFT_ADD64 }, + { true, 0x01d80, "LSHIFT_ADD_LOW32.i32", FMA_SHIFT_ADD64 }, + { true, 0x01e00, "SEL.XX.i16", FMA_TWO_SRC }, + { true, 0x01e08, "SEL.YX.i16", FMA_TWO_SRC }, + { true, 0x01e10, "SEL.XY.i16", FMA_TWO_SRC }, + { true, 0x01e18, "SEL.YY.i16", FMA_TWO_SRC }, + { true, 0x01e80, "ADD_FREXPM.f32", FMA_TWO_SRC }, + { true, 0x02000, "SWZ.XXXX.v4i8", FMA_ONE_SRC }, + { true, 0x03e00, "SWZ.ZZZZ.v4i8", FMA_ONE_SRC }, + { true, 0x00800, "IMAD", FMA_THREE_SRC }, + { true, 0x078db, "POPCNT", FMA_ONE_SRC }, }; -static struct fma_op_info find_fma_op_info(unsigned op) +static struct fma_op_info find_fma_op_info(unsigned op, bool extended) { for (unsigned i = 0; i < ARRAY_SIZE(FMAOpInfos); i++) { unsigned opCmp = ~0; + + if (FMAOpInfos[i].extended != extended) + continue; + + if (extended) + op &= ~0xe0000; + switch (FMAOpInfos[i].src_type) { case FMA_ONE_SRC: opCmp = op; @@ -742,6 +597,7 @@ static struct fma_op_info find_fma_op_info(unsigned op) opCmp = op & ~0x3ffff; break; case FMA_CSEL4: + case FMA_SHIFT: opCmp = op & ~0xfff; break; case FMA_FMA_MSCALE: @@ -847,7 +703,7 @@ static void dump_fma(FILE *fp, uint64_t word, struct bifrost_regs regs, struct b } struct bifrost_fma_inst FMA; memcpy((char *) &FMA, (char *) &word, sizeof(struct bifrost_fma_inst)); - struct fma_op_info info = find_fma_op_info(FMA.op); + struct fma_op_info info = find_fma_op_info(FMA.op, (FMA.op & 0xe0000) == 0xe0000); fprintf(fp, "%s", info.name); if (info.src_type == FMA_FADD || @@ -856,17 +712,17 @@ static void dump_fma(FILE *fp, uint64_t word, struct bifrost_regs regs, struct b info.src_type == FMA_FADD16 || info.src_type == FMA_FMINMAX16 || info.src_type == FMA_FMA16) { - dump_output_mod(fp, bits(FMA.op, 12, 14)); + fprintf(fp, "%s", bi_output_mod_name(bits(FMA.op, 12, 14))); switch (info.src_type) { case FMA_FADD: case FMA_FMA: case FMA_FADD16: case FMA_FMA16: - dump_round_mode(fp, bits(FMA.op, 10, 12)); + fprintf(fp, "%s", bi_round_mode_name(bits(FMA.op, 10, 12))); break; case FMA_FMINMAX: case FMA_FMINMAX16: - dump_minmax_mode(fp, bits(FMA.op, 10, 12)); + fprintf(fp, "%s", bi_minmax_mode_name(bits(FMA.op, 10, 12))); break; default: assert(0); @@ -901,8 +757,29 @@ static void dump_fma(FILE *fp, uint64_t word, struct bifrost_regs regs, struct b fprintf(fp, ".unk%d_mode", (int) (FMA.op >> 9) & 0x3); } } else { - dump_output_mod(fp, bits(FMA.op, 9, 11)); + fprintf(fp, "%s", bi_output_mod_name(bits(FMA.op, 9, 11))); } + } else if (info.src_type == FMA_SHIFT) { + struct bifrost_shift_fma shift; + memcpy(&shift, &FMA, sizeof(shift)); + + if (shift.half == 0x7) + fprintf(fp, ".v2i16"); + else if (shift.half == 0) + fprintf(fp, ".i32"); + else if (shift.half == 0x4) + fprintf(fp, ".v4i8"); + else + fprintf(fp, ".unk%u", shift.half); + + if (!shift.unk) + fprintf(fp, ".no_unk"); + + if (shift.invert_1) + fprintf(fp, ".invert_1"); + + if (shift.invert_2) + fprintf(fp, ".invert_2"); } fprintf(fp, " "); @@ -1006,6 +883,17 @@ static void dump_fma(FILE *fp, uint64_t word, struct bifrost_regs regs, struct b fprintf(fp, ", "); dump_src(fp, (FMA.op >> 3) & 0x7, regs, consts, true); break; + case FMA_SHIFT: { + struct bifrost_shift_fma shift; + memcpy(&shift, &FMA, sizeof(shift)); + + dump_src(fp, shift.src0, regs, consts, true); + fprintf(fp, ", "); + dump_src(fp, shift.src1, regs, consts, true); + fprintf(fp, ", "); + dump_src(fp, shift.src2, regs, consts, true); + break; + } case FMA_FMA: if (FMA.op & (1 << 14)) fprintf(fp, "-"); @@ -1048,7 +936,7 @@ static void dump_fma(FILE *fp, uint64_t word, struct bifrost_regs regs, struct b case FMA_CSEL4: { struct bifrost_csel4 csel; memcpy(&csel, &FMA, sizeof(csel)); - fprintf(fp, ".%s ", csel_cond_name(csel.cond)); + fprintf(fp, ".%s ", bi_csel_cond_name(csel.cond)); dump_src(fp, csel.src0, regs, consts, true); fprintf(fp, ", "); @@ -1096,8 +984,10 @@ static const struct add_op_info add_op_infos[] = { { 0x078d1, "U16_TO_F16.XY", ADD_ONE_SRC }, { 0x078d8, "I16_TO_F16.YY", ADD_ONE_SRC }, { 0x078d9, "U16_TO_F16.YY", ADD_ONE_SRC }, + { 0x07909, "B1_TO_F16", ADD_ONE_SRC }, { 0x07936, "F32_TO_I32", ADD_ONE_SRC }, { 0x07937, "F32_TO_U32", ADD_ONE_SRC }, + { 0x07971, "B1_TO_F32", ADD_ONE_SRC }, { 0x07978, "I32_TO_F32", ADD_ONE_SRC }, { 0x07979, "U32_TO_F32", ADD_ONE_SRC }, { 0x07998, "I16_TO_I32.X", ADD_ONE_SRC }, @@ -1108,51 +998,27 @@ static const struct add_op_info add_op_infos[] = { { 0x0799d, "U16_TO_F32.X", ADD_ONE_SRC }, { 0x0799e, "I16_TO_F32.Y", ADD_ONE_SRC }, { 0x0799f, "U16_TO_F32.Y", ADD_ONE_SRC }, - // take the low 16 bits, and expand it to a 32-bit float { 0x079a2, "F16_TO_F32.X", ADD_ONE_SRC }, - // take the high 16 bits, ... { 0x079a3, "F16_TO_F32.Y", ADD_ONE_SRC }, { 0x07b2b, "SWZ.YX.v2i16", ADD_ONE_SRC }, { 0x07b2c, "NOP", ADD_ONE_SRC }, { 0x07b29, "SWZ.XX.v2i16", ADD_ONE_SRC }, - // Logically, this should be SWZ.XY, but that's equivalent to a move, and - // this seems to be the canonical way the blob generates a MOV. { 0x07b2d, "MOV", ADD_ONE_SRC }, { 0x07b2f, "SWZ.YY.v2i16", ADD_ONE_SRC }, - // Given a floating point number m * 2^e, returns m ^ 2^{-1}. { 0x07b65, "FRCP_FREXPM", ADD_ONE_SRC }, { 0x07b75, "FSQRT_FREXPM", ADD_ONE_SRC }, { 0x07b8d, "FRCP_FREXPE", ADD_ONE_SRC }, { 0x07ba5, "FSQRT_FREXPE", ADD_ONE_SRC }, { 0x07bad, "FRSQ_FREXPE", ADD_ONE_SRC }, - // From the ARM patent US20160364209A1: - // "Decompose v (the input) into numbers x1 and s such that v = x1 * 2^s, - // and x1 is a floating point value in a predetermined range where the - // value 1 is within the range and not at one extremity of the range (e.g. - // choose a range where 1 is towards middle of range)." - // - // This computes s. { 0x07bc5, "FLOG_FREXPE", ADD_ONE_SRC }, - { 0x07d45, "CEIL", ADD_ONE_SRC }, - { 0x07d85, "FLOOR", ADD_ONE_SRC }, - { 0x07dc5, "TRUNC", ADD_ONE_SRC }, + { 0x07d42, "CEIL.v2f16", ADD_ONE_SRC }, + { 0x07d45, "CEIL.f32", ADD_ONE_SRC }, + { 0x07d82, "FLOOR.v2f16", ADD_ONE_SRC }, + { 0x07d85, "FLOOR.f32", ADD_ONE_SRC }, + { 0x07dc2, "TRUNC.v2f16", ADD_ONE_SRC }, + { 0x07dc5, "TRUNC.f32", ADD_ONE_SRC }, { 0x07f18, "LSHIFT_ADD_HIGH32.i32", ADD_TWO_SRC }, - { 0x08000, "LD_ATTR.f16", ADD_LOAD_ATTR, true }, - { 0x08100, "LD_ATTR.v2f16", ADD_LOAD_ATTR, true }, - { 0x08200, "LD_ATTR.v3f16", ADD_LOAD_ATTR, true }, - { 0x08300, "LD_ATTR.v4f16", ADD_LOAD_ATTR, true }, - { 0x08400, "LD_ATTR.f32", ADD_LOAD_ATTR, true }, - { 0x08500, "LD_ATTR.v3f32", ADD_LOAD_ATTR, true }, - { 0x08600, "LD_ATTR.v3f32", ADD_LOAD_ATTR, true }, - { 0x08700, "LD_ATTR.v4f32", ADD_LOAD_ATTR, true }, - { 0x08800, "LD_ATTR.i32", ADD_LOAD_ATTR, true }, - { 0x08900, "LD_ATTR.v3i32", ADD_LOAD_ATTR, true }, - { 0x08a00, "LD_ATTR.v3i32", ADD_LOAD_ATTR, true }, - { 0x08b00, "LD_ATTR.v4i32", ADD_LOAD_ATTR, true }, - { 0x08c00, "LD_ATTR.u32", ADD_LOAD_ATTR, true }, - { 0x08d00, "LD_ATTR.v3u32", ADD_LOAD_ATTR, true }, - { 0x08e00, "LD_ATTR.v3u32", ADD_LOAD_ATTR, true }, - { 0x08f00, "LD_ATTR.v4u32", ADD_LOAD_ATTR, true }, + { 0x08000, "LD_ATTR", ADD_LOAD_ATTR, true }, { 0x0a000, "LD_VAR.32", ADD_VARYING_INTERP, true }, { 0x0b000, "TEX", ADD_TEX_COMPACT, true }, { 0x0c188, "LOAD.i32", ADD_TWO_SRC, true }, @@ -1162,7 +1028,6 @@ static const struct add_op_info add_op_infos[] = { { 0x0c1e0, "LD_UBO.v2i32", ADD_TWO_SRC, true }, { 0x0c1f8, "LD_SCRATCH.v2i32", ADD_TWO_SRC, true }, { 0x0c208, "LOAD.v4i32", ADD_TWO_SRC, true }, - // src0 = offset, src1 = binding { 0x0c220, "LD_UBO.v4i32", ADD_TWO_SRC, true }, { 0x0c238, "LD_SCRATCH.v4i32", ADD_TWO_SRC, true }, { 0x0c248, "STORE.v4i32", ADD_TWO_SRC, true }, @@ -1177,25 +1042,15 @@ static const struct add_op_info add_op_infos[] = { { 0x0cab8, "LD_SCRATCH.v3i32", ADD_TWO_SRC, true }, { 0x0cb88, "STORE.v3i32", ADD_TWO_SRC, true }, { 0x0cbb8, "ST_SCRATCH.v3i32", ADD_TWO_SRC, true }, - // *_FAST does not exist on G71 (added to G51, G72, and everything after) { 0x0cc00, "FRCP_FAST.f32", ADD_ONE_SRC }, { 0x0cc20, "FRSQ_FAST.f32", ADD_ONE_SRC }, - // Given a floating point number m * 2^e, produces a table-based - // approximation of 2/m using the top 17 bits. Includes special cases for - // infinity, NaN, and zero, and copies the sign bit. + { 0x0cc68, "FLOG2_U.f32", ADD_ONE_SRC }, + { 0x0cd58, "FEXP2_FAST.f32", ADD_ONE_SRC }, { 0x0ce00, "FRCP_TABLE", ADD_ONE_SRC }, - // Exists on G71 { 0x0ce10, "FRCP_FAST.f16.X", ADD_ONE_SRC }, - // A similar table for inverse square root, using the high 17 bits of the - // mantissa as well as the low bit of the exponent. { 0x0ce20, "FRSQ_TABLE", ADD_ONE_SRC }, { 0x0ce30, "FRCP_FAST.f16.Y", ADD_ONE_SRC }, { 0x0ce50, "FRSQ_FAST.f16.X", ADD_ONE_SRC }, - // Used in the argument reduction for log. Given a floating-point number - // m * 2^e, uses the top 4 bits of m to produce an approximation to 1/m - // with the exponent forced to 0 and only the top 5 bits are nonzero. 0, - // infinity, and NaN all return 1.0. - // See the ARM patent for more information. { 0x0ce60, "FRCP_APPROX", ADD_ONE_SRC }, { 0x0ce70, "FRSQ_FAST.f16.Y", ADD_ONE_SRC }, { 0x0cf40, "ATAN_ASSIST", ADD_TWO_SRC }, @@ -1206,8 +1061,6 @@ static const struct add_op_info add_op_infos[] = { { 0x0cf60, "FLOG2_TABLE", ADD_ONE_SRC }, { 0x0cf64, "FLOGE_TABLE", ADD_ONE_SRC }, { 0x0d000, "BRANCH", ADD_BRANCH }, - // For each bit i, return src2[i] ? src0[i] : src1[i]. In other words, this - // is the same as (src2 & src0) | (~src2 & src1). { 0x0e8c0, "MUX", ADD_THREE_SRC }, { 0x0e9b0, "ATAN_LDEXP.Y.f32", ADD_TWO_SRC }, { 0x0e9b8, "ATAN_LDEXP.X.f32", ADD_TWO_SRC }, @@ -1216,22 +1069,36 @@ static const struct add_op_info add_op_infos[] = { { 0x0ea68, "SEL.YX.i16", ADD_TWO_SRC }, { 0x0ea78, "SEL.YY.i16", ADD_TWO_SRC }, { 0x0ec00, "F32_TO_F16", ADD_TWO_SRC }, + { 0x0e840, "CSEL.64", ADD_THREE_SRC }, // u2u32(src2) ? src0 : src1 + { 0x0e940, "CSEL.8", ADD_THREE_SRC }, // (src2 != 0) ? src0 : src1 { 0x0f640, "ICMP.GL.GT", ADD_TWO_SRC }, // src0 > src1 ? 1 : 0 { 0x0f648, "ICMP.GL.GE", ADD_TWO_SRC }, { 0x0f650, "UCMP.GL.GT", ADD_TWO_SRC }, { 0x0f658, "UCMP.GL.GE", ADD_TWO_SRC }, { 0x0f660, "ICMP.GL.EQ", ADD_TWO_SRC }, { 0x0f669, "ICMP.GL.NEQ", ADD_TWO_SRC }, + { 0x0f690, "UCMP.8.GT", ADD_TWO_SRC }, + { 0x0f698, "UCMP.8.GE", ADD_TWO_SRC }, + { 0x0f6a8, "ICMP.8.NE", ADD_TWO_SRC }, { 0x0f6c0, "ICMP.D3D.GT", ADD_TWO_SRC }, // src0 > src1 ? ~0 : 0 { 0x0f6c8, "ICMP.D3D.GE", ADD_TWO_SRC }, { 0x0f6d0, "UCMP.D3D.GT", ADD_TWO_SRC }, { 0x0f6d8, "UCMP.D3D.GE", ADD_TWO_SRC }, { 0x0f6e0, "ICMP.D3D.EQ", ADD_TWO_SRC }, + { 0x0f700, "ICMP.64.GT.PT1", ADD_TWO_SRC }, + { 0x0f708, "ICMP.64.GE.PT1", ADD_TWO_SRC }, + { 0x0f710, "UCMP.64.GT.PT1", ADD_TWO_SRC }, + { 0x0f718, "UCMP.64.GE.PT1", ADD_TWO_SRC }, + { 0x0f720, "ICMP.64.EQ.PT1", ADD_TWO_SRC }, + { 0x0f728, "ICMP.64.NE.PT1", ADD_TWO_SRC }, + { 0x0f7c0, "ICMP.64.PT2", ADD_THREE_SRC }, // src3 = result of PT1 { 0x10000, "MAX.v2f16", ADD_FMINMAX16 }, { 0x11000, "ADD_MSCALE.f32", ADD_FADDMscale }, { 0x12000, "MIN.v2f16", ADD_FMINMAX16 }, { 0x14000, "ADD.v2f16", ADD_FADD16 }, + { 0x16000, "FCMP.GL", ADD_FCMP16 }, { 0x17000, "FCMP.D3D", ADD_FCMP16 }, + { 0x17880, "ADD.v4i8", ADD_TWO_SRC }, { 0x178c0, "ADD.i32", ADD_TWO_SRC }, { 0x17900, "ADD.v2i16", ADD_TWO_SRC }, { 0x17ac0, "SUB.i32", ADD_TWO_SRC }, @@ -1240,62 +1107,33 @@ static const struct add_op_info add_op_infos[] = { { 0x17d90, "ADD.i32.u16.X", ADD_TWO_SRC }, { 0x17dc0, "ADD.i32.i16.Y", ADD_TWO_SRC }, { 0x17dd0, "ADD.i32.u16.Y", ADD_TWO_SRC }, - // Compute varying address and datatype (for storing in the vertex shader), - // and store the vec3 result in the data register. The result is passed as - // the 3 normal arguments to ST_VAR. - { 0x18000, "LD_VAR_ADDR.f16", ADD_VARYING_ADDRESS, true }, - { 0x18100, "LD_VAR_ADDR.f32", ADD_VARYING_ADDRESS, true }, - { 0x18200, "LD_VAR_ADDR.i32", ADD_VARYING_ADDRESS, true }, - { 0x18300, "LD_VAR_ADDR.u32", ADD_VARYING_ADDRESS, true }, - // Conditional discards (discard_if) in NIR. Compares the first two - // sources and discards if the result is true - { 0x19181, "DISCARD.FEQ.f32", ADD_TWO_SRC, true }, - { 0x19189, "DISCARD.FNE.f32", ADD_TWO_SRC, true }, - { 0x1918C, "DISCARD.GL.f32", ADD_TWO_SRC, true }, /* Consumes ICMP.GL/etc with fixed 0 argument */ - { 0x19190, "DISCARD.FLE.f32", ADD_TWO_SRC, true }, - { 0x19198, "DISCARD.FLT.f32", ADD_TWO_SRC, true }, - // Implements alpha-to-coverage, as well as possibly the late depth and - // stencil tests. The first source is the existing sample mask in R60 - // (possibly modified by gl_SampleMask), and the second source is the alpha - // value. The sample mask is written right away based on the - // alpha-to-coverage result using the normal register write mechanism, - // since that doesn't need to read from any memory, and then written again - // later based on the result of the stencil and depth tests using the - // special register. + { 0x18000, "LD_VAR_ADDR", ADD_VARYING_ADDRESS, false }, + { 0x19100, "DISCARD.FEQ.f16", ADD_TWO_SRC, false }, + { 0x19188, "DISCARD.FNE.f16", ADD_TWO_SRC, false }, + { 0x19110, "DISCARD.FLE.f16", ADD_TWO_SRC, false }, + { 0x19118, "DISCARD.FLT.f16", ADD_TWO_SRC, false }, + { 0x19180, "DISCARD.FEQ.f32", ADD_TWO_SRC, false }, + { 0x19188, "DISCARD.FNE.f32", ADD_TWO_SRC, false }, + { 0x19190, "DISCARD.FLE.f32", ADD_TWO_SRC, false }, + { 0x19198, "DISCARD.FLT.f32", ADD_TWO_SRC, false }, { 0x191e8, "ATEST.f32", ADD_TWO_SRC, true }, { 0x191f0, "ATEST.X.f16", ADD_TWO_SRC, true }, { 0x191f8, "ATEST.Y.f16", ADD_TWO_SRC, true }, - // store a varying given the address and datatype from LD_VAR_ADDR { 0x19300, "ST_VAR.v1", ADD_THREE_SRC, true }, { 0x19340, "ST_VAR.v2", ADD_THREE_SRC, true }, { 0x19380, "ST_VAR.v3", ADD_THREE_SRC, true }, { 0x193c0, "ST_VAR.v4", ADD_THREE_SRC, true }, - // This takes the sample coverage mask (computed by ATEST above) as a - // regular argument, in addition to the vec4 color in the special register. { 0x1952c, "BLEND", ADD_BLENDING, true }, { 0x1a000, "LD_VAR.16", ADD_VARYING_INTERP, true }, { 0x1ae60, "TEX", ADD_TEX, true }, - { 0x1c000, "RSHIFT_NAND.i32", ADD_THREE_SRC }, - { 0x1c300, "RSHIFT_OR.i32", ADD_THREE_SRC }, - { 0x1c400, "RSHIFT_AND.i32", ADD_THREE_SRC }, - { 0x1c700, "RSHIFT_NOR.i32", ADD_THREE_SRC }, - { 0x1c800, "LSHIFT_NAND.i32", ADD_THREE_SRC }, - { 0x1cb00, "LSHIFT_OR.i32", ADD_THREE_SRC }, - { 0x1cc00, "LSHIFT_AND.i32", ADD_THREE_SRC }, - { 0x1cf00, "LSHIFT_NOR.i32", ADD_THREE_SRC }, - { 0x1d000, "RSHIFT_XOR.i32", ADD_THREE_SRC }, - { 0x1d100, "RSHIFT_XNOR.i32", ADD_THREE_SRC }, - { 0x1d200, "LSHIFT_XOR.i32", ADD_THREE_SRC }, - { 0x1d300, "LSHIFT_XNOR.i32", ADD_THREE_SRC }, - { 0x1d400, "LSHIFT_ADD.i32", ADD_THREE_SRC }, - { 0x1d500, "LSHIFT_SUB.i32", ADD_THREE_SRC }, - { 0x1d500, "LSHIFT_RSUB.i32", ADD_THREE_SRC }, - { 0x1d700, "RSHIFT_ADD.i32", ADD_THREE_SRC }, - { 0x1d800, "RSHIFT_SUB.i32", ADD_THREE_SRC }, - { 0x1d900, "RSHIFT_RSUB.i32", ADD_THREE_SRC }, - { 0x1da00, "ARSHIFT_ADD.i32", ADD_THREE_SRC }, - { 0x1db00, "ARSHIFT_SUB.i32", ADD_THREE_SRC }, - { 0x1dc00, "ARSHIFT_RSUB.i32", ADD_THREE_SRC }, + { 0x1b000, "TEX.f16", ADD_TEX_COMPACT, true }, + { 0x1c000, "RSHIFT_NAND.i32", ADD_SHIFT }, + { 0x1c400, "RSHIFT_AND.i32", ADD_SHIFT }, + { 0x1c800, "LSHIFT_NAND.i32", ADD_SHIFT }, + { 0x1cc00, "LSHIFT_AND.i32", ADD_SHIFT }, + { 0x1d000, "RSHIFT_XOR.i32", ADD_SHIFT }, + { 0x1d400, "LSHIFT_ADD.i32", ADD_SHIFT }, + { 0x1d800, "RSHIFT_SUB.i32", ADD_SHIFT }, { 0x1dd18, "OR.i32", ADD_TWO_SRC }, { 0x1dd20, "AND.i32", ADD_TWO_SRC }, { 0x1dd60, "LSHIFT.i32", ADD_TWO_SRC }, @@ -1319,6 +1157,9 @@ static struct add_op_info find_add_op_info(unsigned op) case ADD_THREE_SRC: opCmp = op & ~0x3f; break; + case ADD_SHIFT: + opCmp = op & ~0x3ff; + break; case ADD_TEX: opCmp = op & ~0xf; break; @@ -1342,11 +1183,9 @@ static struct add_op_info find_add_op_info(unsigned op) opCmp = op & ~0x7ff; break; case ADD_VARYING_ADDRESS: - opCmp = op & ~0xff; + opCmp = op & ~0xfff; break; case ADD_LOAD_ATTR: - opCmp = op & ~0x7f; - break; case ADD_BRANCH: opCmp = op & ~0xfff; break; @@ -1382,11 +1221,11 @@ static void dump_add(FILE *fp, uint64_t word, struct bifrost_regs regs, // float16 seems like it doesn't support output modifiers if (info.src_type == ADD_FADD || info.src_type == ADD_FMINMAX) { // output modifiers - dump_output_mod(fp, bits(ADD.op, 8, 10)); + fprintf(fp, "%s", bi_output_mod_name(bits(ADD.op, 8, 10))); if (info.src_type == ADD_FADD) - dump_round_mode(fp, bits(ADD.op, 10, 12)); + fprintf(fp, "%s", bi_round_mode_name(bits(ADD.op, 10, 12))); else - dump_minmax_mode(fp, bits(ADD.op, 10, 12)); + fprintf(fp, "%s", bi_minmax_mode_name(bits(ADD.op, 10, 12))); } else if (info.src_type == ADD_FCMP || info.src_type == ADD_FCMP16) { dump_fcmp(fp, bits(ADD.op, 3, 6)); if (info.src_type == ADD_FCMP) @@ -1424,26 +1263,14 @@ static void dump_add(FILE *fp, uint64_t word, struct bifrost_regs regs, fprintf(fp, ".reuse"); if (ADD.op & 0x400) fprintf(fp, ".flat"); - switch ((ADD.op >> 7) & 0x3) { - case 0: - fprintf(fp, ".per_frag"); - break; - case 1: - fprintf(fp, ".centroid"); - break; - case 2: - break; - case 3: - fprintf(fp, ".explicit"); - break; - } + fprintf(fp, "%s", bi_interp_mode_name((ADD.op >> 7) & 0x3)); fprintf(fp, ".v%d", ((ADD.op >> 5) & 0x3) + 1); } else if (info.src_type == ADD_BRANCH) { - enum branch_code branchCode = (enum branch_code) ((ADD.op >> 6) & 0x3f); + enum bifrost_branch_code branchCode = (enum bifrost_branch_code) ((ADD.op >> 6) & 0x3f); if (branchCode == BR_ALWAYS) { // unconditional branch } else { - enum branch_cond cond = (enum branch_cond) ((ADD.op >> 6) & 0x7); + enum bifrost_branch_cond cond = (enum bifrost_branch_cond) ((ADD.op >> 6) & 0x7); enum branch_bit_size size = (enum branch_bit_size) ((ADD.op >> 9) & 0x7); bool portSwapped = (ADD.op & 0x7) < ADD.src0; // See the comment in branch_bit_size @@ -1537,7 +1364,32 @@ static void dump_add(FILE *fp, uint64_t word, struct bifrost_regs regs, } } } + } else if (info.src_type == ADD_SHIFT) { + struct bifrost_shift_add shift; + memcpy(&shift, &ADD, sizeof(ADD)); + + if (shift.invert_1) + fprintf(fp, ".invert_1"); + + if (shift.invert_2) + fprintf(fp, ".invert_2"); + + if (shift.zero) + fprintf(fp, ".unk%u", shift.zero); + } else if (info.src_type == ADD_VARYING_ADDRESS) { + struct bifrost_ld_var_addr ld; + memcpy(&ld, &ADD, sizeof(ADD)); + fprintf(fp, ".%s", bi_ldst_type_name(ld.type)); + } else if (info.src_type == ADD_LOAD_ATTR) { + struct bifrost_ld_attr ld; + memcpy(&ld, &ADD, sizeof(ADD)); + + if (ld.channels) + fprintf(fp, ".v%d%s", ld.channels + 1, bi_ldst_type_name(ld.type)); + else + fprintf(fp, ".%s", bi_ldst_type_name(ld.type)); } + fprintf(fp, " "); struct bifrost_reg_ctrl next_ctrl = DecodeRegCtrl(fp, next_regs); @@ -1564,6 +1416,13 @@ static void dump_add(FILE *fp, uint64_t word, struct bifrost_regs regs, int tex_index; int sampler_index; bool dualTex = false; + + fprintf(fp, "coords <"); + dump_src(fp, ADD.src0, regs, consts, false); + fprintf(fp, ", "); + dump_src(fp, ADD.op & 0x7, regs, consts, false); + fprintf(fp, ">, "); + if (info.src_type == ADD_TEX_COMPACT) { tex_index = (ADD.op >> 3) & 0x7; sampler_index = (ADD.op >> 7) & 0x7; @@ -1577,13 +1436,20 @@ static void dump_add(FILE *fp, uint64_t word, struct bifrost_regs regs, struct bifrost_tex_ctrl ctrl; memcpy((char *) &ctrl, (char *) &controlBits, sizeof(ctrl)); - // TODO: figure out what actually triggers dual-tex - if (ctrl.result_type == 9) { + /* Dual-tex triggered for adjacent texturing + * instructions with the same coordinates to different + * textures/samplers. Observed for the compact + * (2D/normal) case. */ + + if ((ctrl.result_type & 7) == 1) { + bool f32 = ctrl.result_type & 8; + struct bifrost_dual_tex_ctrl dualCtrl; memcpy((char *) &dualCtrl, (char *) &controlBits, sizeof(ctrl)); - fprintf(fp, "(dualtex) tex0:%d samp0:%d tex1:%d samp1:%d ", + fprintf(fp, "(dualtex) tex0:%d samp0:%d tex1:%d samp1:%d %s", dualCtrl.tex_index0, dualCtrl.sampler_index0, - dualCtrl.tex_index1, dualCtrl.sampler_index1); + dualCtrl.tex_index1, dualCtrl.sampler_index1, + f32 ? "f32" : "f16"); if (dualCtrl.unk0 != 3) fprintf(fp, "unk:%d ", dualCtrl.unk0); dualTex = true; @@ -1717,7 +1583,7 @@ static void dump_add(FILE *fp, uint64_t word, struct bifrost_regs regs, break; } case ADD_LOAD_ATTR: - fprintf(fp, "location:%d, ", (ADD.op >> 3) & 0xf); + fprintf(fp, "location:%d, ", (ADD.op >> 3) & 0x1f); case ADD_TWO_SRC: dump_src(fp, ADD.src0, regs, consts, false); fprintf(fp, ", "); @@ -1730,6 +1596,16 @@ static void dump_add(FILE *fp, uint64_t word, struct bifrost_regs regs, fprintf(fp, ", "); dump_src(fp, (ADD.op >> 3) & 0x7, regs, consts, false); break; + case ADD_SHIFT: { + struct bifrost_shift_add shift; + memcpy(&shift, &ADD, sizeof(ADD)); + dump_src(fp, shift.src0, regs, consts, false); + fprintf(fp, ", "); + dump_src(fp, shift.src1, regs, consts, false); + fprintf(fp, ", "); + dump_src(fp, shift.src2, regs, consts, false); + break; + } case ADD_FADD: case ADD_FMINMAX: if (ADD.op & 0x10) @@ -1808,6 +1684,7 @@ static void dump_add(FILE *fp, uint64_t word, struct bifrost_regs regs, dump_16swizzle(fp, (ADD.op >> 8) & 0x3); if (abs1 && abs2) fprintf(fp, ")"); + fprintf(fp, "/* %X */\n", (ADD.op >> 10) & 0x3); /* mode */ break; } case ADD_FADDMscale: { @@ -1879,7 +1756,7 @@ static void dump_add(FILE *fp, uint64_t word, struct bifrost_regs regs, dump_16swizzle(fp, (ADD.op >> 8) & 0x3); break; case ADD_BRANCH: { - enum branch_code code = (enum branch_code) ((ADD.op >> 6) & 0x3f); + enum bifrost_branch_code code = (enum bifrost_branch_code) ((ADD.op >> 6) & 0x3f); enum branch_bit_size size = (enum branch_bit_size) ((ADD.op >> 9) & 0x7); if (code != BR_ALWAYS) { dump_src(fp, ADD.src0, regs, consts, false);