typedef enum {
/* category 0: */
OPC_NOP = _OPC(0, 0),
- OPC_BR = _OPC(0, 1),
+ OPC_B = _OPC(0, 1),
OPC_JUMP = _OPC(0, 2),
OPC_CALL = _OPC(0, 3),
OPC_RET = _OPC(0, 4),
OPC_CHSH = _OPC(0, 10),
OPC_FLOW_REV = _OPC(0, 11),
+ OPC_BKT = _OPC(0, 16),
+ OPC_STKS = _OPC(0, 17),
+ OPC_STKR = _OPC(0, 18),
+ OPC_XSET = _OPC(0, 19),
+ OPC_XCLR = _OPC(0, 20),
+ OPC_GETONE = _OPC(0, 21),
+ OPC_DBG = _OPC(0, 22),
+ OPC_SHPS = _OPC(0, 23), /* shader prologue start */
+ OPC_SHPE = _OPC(0, 24), /* shader prologue end */
+
+ OPC_PREDT = _OPC(0, 29), /* predicated true */
+ OPC_PREDF = _OPC(0, 30), /* predicated false */
+ OPC_PREDE = _OPC(0, 31), /* predicated end */
+
/* category 1: */
OPC_MOV = _OPC(1, 0),
OPC_CMPV_U = _OPC(2, 33),
OPC_CMPV_S = _OPC(2, 34),
/* 35-47 - invalid */
- OPC_MUL_U = _OPC(2, 48),
- OPC_MUL_S = _OPC(2, 49),
+ OPC_MUL_U24 = _OPC(2, 48), /* 24b mul into 32b result */
+ OPC_MUL_S24 = _OPC(2, 49), /* 24b mul into 32b result with sign extension */
OPC_MULL_U = _OPC(2, 50),
OPC_BFREV_B = _OPC(2, 51),
OPC_CLZ_S = _OPC(2, 52),
OPC_SIN = _OPC(4, 4),
OPC_COS = _OPC(4, 5),
OPC_SQRT = _OPC(4, 6),
- // 7-63 - invalid
+ /* NOTE that these are 8+opc from their highp equivs, so it's possible
+ * that the high order bit in the opc field has been repurposed for
+ * half-precision use? But note that other ops (rcp/lsin/cos/sqrt)
+ * still use the same opc as highp
+ */
+ OPC_HRSQ = _OPC(4, 9),
+ OPC_HLOG2 = _OPC(4, 10),
+ OPC_HEXP2 = _OPC(4, 11),
/* category 5: */
OPC_ISAM = _OPC(5, 0),
OPC_STG = _OPC(6, 3), /* store-global */
OPC_STL = _OPC(6, 4),
OPC_STP = _OPC(6, 5),
- OPC_STI = _OPC(6, 6),
+ OPC_LDIB = _OPC(6, 6),
OPC_G2L = _OPC(6, 7),
OPC_L2G = _OPC(6, 8),
OPC_PREFETCH = _OPC(6, 9),
/* meta instructions (category -1): */
/* placeholder instr to mark shader inputs: */
OPC_META_INPUT = _OPC(-1, 0),
- /* The "fan-in" and "fan-out" instructions are used for keeping
+ /* The "collect" and "split" instructions are used for keeping
* track of instructions that write to multiple dst registers
- * (fan-out) like texture sample instructions, or read multiple
- * consecutive scalar registers (fan-in) (bary.f, texture samp)
+ * (split) like texture sample instructions, or read multiple
+ * consecutive scalar registers (collect) (bary.f, texture samp)
+ *
+ * A "split" extracts a scalar component from a vecN, and a
+ * "collect" gathers multiple scalar components into a vecN
*/
- OPC_META_FO = _OPC(-1, 2),
- OPC_META_FI = _OPC(-1, 3),
+ OPC_META_SPLIT = _OPC(-1, 2),
+ OPC_META_COLLECT = _OPC(-1, 3),
+
+ /* placeholder for texture fetches that run before FS invocation
+ * starts:
+ */
+ OPC_META_TEX_PREFETCH = _OPC(-1, 4),
} opc_t;
uint32_t dummy12 : 12;
uint32_t dummy13 : 13;
uint32_t dummy8 : 8;
+ int32_t idummy13 : 13;
+ int32_t idummy8 : 8;
} reg_t;
/* special registers: */
return (reg.num == REG_A0) || (reg.num == REG_P0);
}
+typedef enum {
+ BRANCH_PLAIN = 0, /* br */
+ BRANCH_OR = 1, /* brao */
+ BRANCH_AND = 2, /* braa */
+ BRANCH_CONST = 3, /* brac */
+ BRANCH_ANY = 4, /* bany */
+ BRANCH_ALL = 5, /* ball */
+ BRANCH_X = 6, /* brax ??? */
+} brtype_t;
+
typedef struct PACKED {
/* dword0: */
union PACKED {
};
/* dword1: */
- uint32_t dummy2 : 8;
+ uint32_t idx : 5; /* brac.N index */
+ uint32_t brtype : 3; /* branch type, see brtype_t */
uint32_t repeat : 3;
uint32_t dummy3 : 1;
uint32_t ss : 1;
- uint32_t dummy4 : 7;
- uint32_t inv : 1;
- uint32_t comp : 2;
+ uint32_t inv1 : 1;
+ uint32_t comp1 : 2;
+ uint32_t eq : 1;
+ uint32_t opc_hi : 1; /* at least one bit */
+ uint32_t dummy4 : 2;
+ uint32_t inv0 : 1;
+ uint32_t comp0 : 2; /* component for first src */
uint32_t opc : 4;
uint32_t jmp_tgt : 1;
uint32_t sync : 1;
uint32_t dst : 8;
uint32_t repeat : 2;
uint32_t sat : 1;
- uint32_t src1_r : 1;
+ uint32_t src1_r : 1; /* doubles as nop0 if repeat==0 */
uint32_t ss : 1;
uint32_t ul : 1; /* dunno */
uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */
uint32_t ei : 1;
uint32_t cond : 3;
- uint32_t src2_r : 1;
+ uint32_t src2_r : 1; /* doubles as nop1 if repeat==0 */
uint32_t full : 1; /* not half */
uint32_t opc : 6;
uint32_t jmp_tgt : 1;
uint32_t must_be_zero1: 2;
uint32_t src2_c : 1;
uint32_t src1_neg : 1;
- uint32_t src2_r : 1;
+ uint32_t src2_r : 1; /* doubles as nop1 if repeat==0 */
};
struct PACKED {
uint32_t src1 : 10;
uint32_t dst : 8;
uint32_t repeat : 2;
uint32_t sat : 1;
- uint32_t src1_r : 1;
+ uint32_t src1_r : 1; /* doubles as nop0 if repeat==0 */
uint32_t ss : 1;
uint32_t ul : 1;
uint32_t dst_half : 1; /* or widen/narrow.. ie. dst hrN <-> rN */
uint32_t opc_cat : 3;
} instr_cat4_t;
+/* With is_bindless_s2en = 1, this determines whether bindless is enabled and
+ * if so, how to get the (base, index) pair for both sampler and texture.
+ * There is a single base embedded in the instruction, which is always used
+ * for the texture.
+ */
+typedef enum {
+ /* Use traditional GL binding model, get texture and sampler index
+ * from src3 which is not presumed to be uniform. This is
+ * backwards-compatible with earlier generations, where this field was
+ * always 0 and nonuniform-indexed sampling always worked.
+ */
+ CAT5_NONUNIFORM = 0,
+
+ /* The sampler base comes from the low 3 bits of a1.x, and the sampler
+ * and texture index come from src3 which is presumed to be uniform.
+ */
+ CAT5_BINDLESS_A1_UNIFORM = 1,
+
+ /* The texture and sampler share the same base, and the sampler and
+ * texture index come from src3 which is *not* presumed to be uniform.
+ */
+ CAT5_BINDLESS_NONUNIFORM = 2,
+
+ /* The sampler base comes from the low 3 bits of a1.x, and the sampler
+ * and texture index come from src3 which is *not* presumed to be
+ * uniform.
+ */
+ CAT5_BINDLESS_A1_NONUNIFORM = 3,
+
+ /* Use traditional GL binding model, get texture and sampler index
+ * from src3 which is presumed to be uniform.
+ */
+ CAT5_UNIFORM = 4,
+
+ /* The texture and sampler share the same base, and the sampler and
+ * texture index come from src3 which is presumed to be uniform.
+ */
+ CAT5_BINDLESS_UNIFORM = 5,
+
+ /* The texture and sampler share the same base, get sampler index from low
+ * 4 bits of src3 and texture index from high 4 bits.
+ */
+ CAT5_BINDLESS_IMM = 6,
+
+ /* The sampler base comes from the low 3 bits of a1.x, and the texture
+ * index comes from the next 8 bits of a1.x. The sampler index is an
+ * immediate in src3.
+ */
+ CAT5_BINDLESS_A1_IMM = 7,
+} cat5_desc_mode_t;
+
typedef struct PACKED {
/* dword0: */
union PACKED {
} norm;
/* s2en case: */
struct PACKED {
- uint32_t full : 1; /* not half */
- uint32_t src1 : 8;
- uint32_t src2 : 11;
- uint32_t dummy1 : 1;
- uint32_t src3 : 8;
- uint32_t dummy2 : 3;
- } s2en;
+ uint32_t full : 1; /* not half */
+ uint32_t src1 : 8;
+ uint32_t src2 : 8;
+ uint32_t dummy1 : 2;
+ uint32_t base_hi : 2;
+ uint32_t src3 : 8;
+ uint32_t desc_mode : 3;
+ } s2en_bindless;
/* same in either case: */
// XXX I think, confirm this
struct PACKED {
uint32_t full : 1; /* not half */
uint32_t src1 : 8;
- uint32_t pad : 23;
+ uint32_t src2 : 8;
+ uint32_t pad : 15;
};
};
/* dword1: */
- uint32_t dst : 8;
- uint32_t wrmask : 4; /* write-mask */
- uint32_t type : 3;
- uint32_t dummy2 : 1; /* seems to be ignored */
- uint32_t is_3d : 1;
-
- uint32_t is_a : 1;
- uint32_t is_s : 1;
- uint32_t is_s2en : 1;
- uint32_t is_o : 1;
- uint32_t is_p : 1;
-
- uint32_t opc : 5;
- uint32_t jmp_tgt : 1;
- uint32_t sync : 1;
- uint32_t opc_cat : 3;
+ uint32_t dst : 8;
+ uint32_t wrmask : 4; /* write-mask */
+ uint32_t type : 3;
+ uint32_t base_lo : 1; /* used with bindless */
+ uint32_t is_3d : 1;
+
+ uint32_t is_a : 1;
+ uint32_t is_s : 1;
+ uint32_t is_s2en_bindless : 1;
+ uint32_t is_o : 1;
+ uint32_t is_p : 1;
+
+ uint32_t opc : 5;
+ uint32_t jmp_tgt : 1;
+ uint32_t sync : 1;
+ uint32_t opc_cat : 3;
} instr_cat5_t;
/* dword0 encoding for src_off: [src1 + off], src2: */
uint32_t src_ssbo : 8;
uint32_t pad2 : 3; // type
uint32_t g : 1;
- uint32_t pad3 : 1;
+ uint32_t src_ssbo_im : 1;
uint32_t pad4 : 10; // opc/jmp_tgt/sync/opc_cat
} instr_cat6ldgb_t;
};
} instr_cat6_t;
+/* Similar to cat5_desc_mode_t, describes how the descriptor is loaded.
+ */
+typedef enum {
+ /* Use old GL binding model with an immediate index. */
+ CAT6_IMM = 0,
+
+ CAT6_UNIFORM = 1,
+
+ CAT6_NONUNIFORM = 2,
+
+ /* Use the bindless model, with an immediate index.
+ */
+ CAT6_BINDLESS_IMM = 4,
+
+ /* Use the bindless model, with a uniform register index.
+ */
+ CAT6_BINDLESS_UNIFORM = 5,
+
+ /* Use the bindless model, with a register index that isn't guaranteed
+ * to be uniform. This presumably checks if the indices are equal and
+ * splits up the load/store, because it works the way you would
+ * expect.
+ */
+ CAT6_BINDLESS_NONUNIFORM = 6,
+} cat6_desc_mode_t;
+
+/**
+ * For atomic ops (which return a value):
+ *
+ * pad1=1, pad3=c, pad5=3
+ * src1 - vecN offset/coords
+ * src2.x - is actually dest register
+ * src2.y - is 'data' except for cmpxchg where src2.y is 'compare'
+ * and src2.z is 'data'
+ *
+ * For stib (which does not return a value):
+ * pad1=0, pad3=c, pad5=2
+ * src1 - vecN offset/coords
+ * src2 - value to store
+ *
+ * For ldib:
+ * pad1=1, pad3=c, pad5=2
+ * src1 - vecN offset/coords
+ *
+ * for ldc (load from UBO using descriptor):
+ * pad1=0, pad3=8, pad5=2
+ *
+ * pad2 and pad5 are only observed to be 0.
+ */
+typedef struct PACKED {
+ /* dword0: */
+ uint32_t pad1 : 1;
+ uint32_t base : 3;
+ uint32_t pad2 : 2;
+ uint32_t desc_mode : 3;
+ uint32_t d : 2;
+ uint32_t typed : 1;
+ uint32_t type_size : 2;
+ uint32_t opc : 5;
+ uint32_t pad3 : 5;
+ uint32_t src1 : 8; /* coordinate/offset */
+
+ /* dword1: */
+ uint32_t src2 : 8; /* or the dst for load instructions */
+ uint32_t pad4 : 1; //mustbe0 ??
+ uint32_t ssbo : 8; /* ssbo/image binding point */
+ uint32_t type : 3;
+ uint32_t pad5 : 7;
+ uint32_t jmp_tgt : 1;
+ uint32_t sync : 1;
+ uint32_t opc_cat : 3;
+} instr_cat6_a6xx_t;
+
typedef struct PACKED {
/* dword0: */
uint32_t pad1 : 32;
instr_cat4_t cat4;
instr_cat5_t cat5;
instr_cat6_t cat6;
+ instr_cat6_a6xx_t cat6_a6xx;
instr_cat7_t cat7;
struct PACKED {
/* dword0: */
}
}
-static inline uint32_t instr_opc(instr_t *instr)
+/* We can probably drop the gpu_id arg, but keeping it for now so we can
+ * assert if we see something we think should be new encoding on an older
+ * gpu.
+ */
+static inline bool is_cat6_legacy(instr_t *instr, unsigned gpu_id)
+{
+ instr_cat6_a6xx_t *cat6 = &instr->cat6_a6xx;
+
+ /* At least one of these two bits is pad in all the possible
+ * "legacy" cat6 encodings, and a analysis of all the pre-a6xx
+ * cmdstream traces I have indicates that the pad bit is zero
+ * in all cases. So we can use this to detect new encoding:
+ */
+ if ((cat6->pad3 & 0x8) && (cat6->pad5 & 0x2)) {
+ assert(gpu_id >= 600);
+ assert(instr->cat6.opc == 0);
+ return false;
+ }
+
+ return true;
+}
+
+static inline uint32_t instr_opc(instr_t *instr, unsigned gpu_id)
{
switch (instr->opc_cat) {
- case 0: return instr->cat0.opc;
+ case 0: return instr->cat0.opc | instr->cat0.opc_hi << 4;
case 1: return 0;
case 2: return instr->cat2.opc;
case 3: return instr->cat3.opc;
case 4: return instr->cat4.opc;
case 5: return instr->cat5.opc;
- case 6: return instr->cat6.opc;
+ case 6:
+ if (!is_cat6_legacy(instr, gpu_id))
+ return instr->cat6_a6xx.opc;
+ return instr->cat6.opc;
case 7: return instr->cat7.opc;
default: return 0;
}
}
}
-int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out);
+static inline bool is_isam(opc_t opc)
+{
+ switch (opc) {
+ case OPC_ISAM:
+ case OPC_ISAML:
+ case OPC_ISAMM:
+ return true;
+ default:
+ return false;
+ }
+}
+
+
+static inline bool is_cat2_float(opc_t opc)
+{
+ switch (opc) {
+ case OPC_ADD_F:
+ case OPC_MIN_F:
+ case OPC_MAX_F:
+ case OPC_MUL_F:
+ case OPC_SIGN_F:
+ case OPC_CMPS_F:
+ case OPC_ABSNEG_F:
+ case OPC_CMPV_F:
+ case OPC_FLOOR_F:
+ case OPC_CEIL_F:
+ case OPC_RNDNE_F:
+ case OPC_RNDAZ_F:
+ case OPC_TRUNC_F:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+static inline bool is_cat3_float(opc_t opc)
+{
+ switch (opc) {
+ case OPC_MAD_F16:
+ case OPC_MAD_F32:
+ case OPC_SEL_F16:
+ case OPC_SEL_F32:
+ return true;
+ default:
+ return false;
+ }
+}
+
+int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out, unsigned gpu_id);
#endif /* INSTR_A3XX_H_ */