X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fcompiler%2Faco_opcodes.py;h=a0ecc9c578845a113436782b08eeb79ae1ed78ed;hb=ddffcf362770940cfc6300ff4d90c0443937ccbb;hp=a4b02507eda7c2149b2281d246c1d0e94e211757;hpb=389ee819c04f3375358d0253bdb1f6094f2423c6;p=mesa.git diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index a4b02507eda..a0ecc9c5788 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -49,15 +49,15 @@ class Format(Enum): PSEUDO_BRANCH = 16 PSEUDO_BARRIER = 17 PSEUDO_REDUCTION = 18 + VOP3P = 19 VOP1 = 1 << 8 VOP2 = 1 << 9 VOPC = 1 << 10 VOP3A = 1 << 11 VOP3B = 1 << 11 - VOP3P = 1 << 12 - VINTRP = 1 << 13 - DPP = 1 << 14 - SDWA = 1 << 15 + VINTRP = 1 << 12 + DPP = 1 << 13 + SDWA = 1 << 14 def get_builder_fields(self): if self == Format.SOPK: @@ -77,7 +77,6 @@ class Format(Enum): elif self == Format.MTBUF: return [('unsigned', 'dfmt', None), ('unsigned', 'nfmt', None), - ('unsigned', 'img_format', None), ('unsigned', 'offset', None), ('bool', 'offen', None), ('bool', 'idxen', 'false'), @@ -85,12 +84,12 @@ class Format(Enum): ('bool', 'glc', 'false'), ('bool', 'dlc', 'false'), ('bool', 'slc', 'false'), - ('bool', 'tfe', 'false'), - ('bool', 'lds', 'false')] + ('bool', 'tfe', 'false')] elif self == Format.MUBUF: return [('unsigned', 'offset', None), ('bool', 'offen', None), ('bool', 'idxen', 'false'), + ('bool', 'addr64', 'false'), ('bool', 'disable_wqm', 'false'), ('bool', 'glc', 'false'), ('bool', 'dlc', 'false'), @@ -150,12 +149,21 @@ class Format(Enum): def get_builder_field_decls(self): return [('%s %s=%s' % (f[0], f[1], f[2]) if f[2] != None else '%s %s' % (f[0], f[1])) for f in self.get_builder_fields()] + def get_builder_initialization(self, num_operands): + res = '' + if self == Format.SDWA: + for i in range(min(num_operands, 2)): + res += 'instr->sel[{0}] = op{0}.op.bytes() == 2 ? sdwa_uword : (op{0}.op.bytes() == 1 ? sdwa_ubyte : sdwa_udword);\n'.format(i) + res += 'instr->dst_sel = def0.bytes() == 2 ? sdwa_uword : (def0.bytes() == 1 ? sdwa_ubyte : sdwa_udword);\n' + res += 'instr->dst_preserve = true;' + return res + class Opcode(object): """Class that represents all the information we have about the opcode NOTE: this must be kept in sync with aco_op_info """ - def __init__(self, name, opcode_gfx9, opcode_gfx10, format, input_mod, output_mod): + def __init__(self, name, opcode_gfx7, opcode_gfx9, opcode_gfx10, format, input_mod, output_mod, is_atomic): """Parameters: - name is the name of the opcode (prepend nir_op_ for the enum name) @@ -167,6 +175,7 @@ class Opcode(object): constant value of the opcode given the constant values of its inputs. """ assert isinstance(name, str) + assert isinstance(opcode_gfx7, int) assert isinstance(opcode_gfx9, int) assert isinstance(opcode_gfx10, int) assert isinstance(format, Format) @@ -174,24 +183,60 @@ class Opcode(object): assert isinstance(output_mod, bool) self.name = name + self.opcode_gfx7 = opcode_gfx7 self.opcode_gfx9 = opcode_gfx9 self.opcode_gfx10 = opcode_gfx10 self.input_mod = "1" if input_mod else "0" self.output_mod = "1" if output_mod else "0" + self.is_atomic = "1" if is_atomic else "0" self.format = format + parts = name.replace('_e64', '').rsplit('_', 2) + op_dtype = parts[-1] + def_dtype = parts[-2] if len(parts) > 1 else parts[-1] + + def_dtype_sizes = {'{}{}'.format(prefix, size) : size for prefix in 'biuf' for size in [64, 32, 24, 16]} + op_dtype_sizes = {k:v for k, v in def_dtype_sizes.items()} + # inline constants are 32-bit for 16-bit integer/typeless instructions: https://reviews.llvm.org/D81841 + op_dtype_sizes['b16'] = 32 + op_dtype_sizes['i16'] = 32 + op_dtype_sizes['u16'] = 32 + + self.operand_size = op_dtype_sizes.get(op_dtype, 0) + self.definition_size = def_dtype_sizes.get(def_dtype, self.operand_size) + + # exceptions + if self.operand_size == 16 and op_dtype != 'f16': + self.operand_size = 16 + elif self.operand_size == 24: + self.operand_size = 32 + elif name in ['s_sext_i32_i8', 's_sext_i32_i16', 'v_msad_u8', 'v_cvt_pk_u16_u32', 'v_cvt_pk_i16_i32']: + self.operand_size = 32 + elif name in ['v_qsad_pk_u16_u8', 'v_mqsad_pk_u16_u8', 'v_mqsad_u32_u8']: + self.definition_size = 0 + self.operand_size = 0 + elif name in ['v_mad_u64_u32', 'v_mad_i64_i32']: + self.operand_size = 0 + elif '_pk_' in name or name in ['v_lerp_u8', 'v_sad_u8', 'v_sad_u16', + 'v_cvt_f32_ubyte0', 'v_cvt_f32_ubyte1', + 'v_cvt_f32_ubyte2', 'v_cvt_f32_ubyte3']: + self.operand_size = 32 + self.definition_size = 32 + elif '_pknorm_' in name: + self.definition_size = 32 + elif format == Format.PSEUDO_REDUCTION: + # 64-bit reductions can have a larger definition size, but get_subdword_definition_info() handles that + self.definition_size = 32 + # global dictionary of opcodes opcodes = {} -# VOPC to GFX6 opcode translation map -VOPC_GFX6 = [0] * 256 - -def opcode(name, opcode_gfx9 = -1, opcode_gfx10 = -1, format = Format.PSEUDO, input_mod = False, output_mod = False): +def opcode(name, opcode_gfx7 = -1, opcode_gfx9 = -1, opcode_gfx10 = -1, format = Format.PSEUDO, input_mod = False, output_mod = False, is_atomic = False): assert name not in opcodes - opcodes[name] = Opcode(name, opcode_gfx9, opcode_gfx10, format, input_mod, output_mod) + opcodes[name] = Opcode(name, opcode_gfx7, opcode_gfx9, opcode_gfx10, format, input_mod, output_mod, is_atomic) -opcode("exp", 0, 0, format = Format.EXP) +opcode("exp", 0, 0, 0, format = Format.EXP) opcode("p_parallelcopy") opcode("p_startpgm") opcode("p_phi") @@ -213,19 +258,19 @@ opcode("p_reduce", format=Format.PSEUDO_REDUCTION) opcode("p_inclusive_scan", format=Format.PSEUDO_REDUCTION) # e.g. subgroupExclusiveMin() opcode("p_exclusive_scan", format=Format.PSEUDO_REDUCTION) -# simulates proper bpermute behavior on GFX10 wave64 -opcode("p_wave64_bpermute", format=Format.PSEUDO_REDUCTION) opcode("p_branch", format=Format.PSEUDO_BRANCH) opcode("p_cbranch", format=Format.PSEUDO_BRANCH) opcode("p_cbranch_z", format=Format.PSEUDO_BRANCH) opcode("p_cbranch_nz", format=Format.PSEUDO_BRANCH) -opcode("p_memory_barrier_all", format=Format.PSEUDO_BARRIER) +opcode("p_memory_barrier_common", format=Format.PSEUDO_BARRIER) # atomic, buffer, image and shared opcode("p_memory_barrier_atomic", format=Format.PSEUDO_BARRIER) opcode("p_memory_barrier_buffer", format=Format.PSEUDO_BARRIER) opcode("p_memory_barrier_image", format=Format.PSEUDO_BARRIER) opcode("p_memory_barrier_shared", format=Format.PSEUDO_BARRIER) +opcode("p_memory_barrier_gs_data", format=Format.PSEUDO_BARRIER) +opcode("p_memory_barrier_gs_sendmsg", format=Format.PSEUDO_BARRIER) opcode("p_spill") opcode("p_reload") @@ -243,6 +288,8 @@ opcode("p_exit_early_if") opcode("p_fs_buffer_store_smem", format=Format.SMEM) +# simulates proper bpermute behavior when it's unsupported, eg. GFX10 wave64 +opcode("p_bpermute") # SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc) SOP2 = { @@ -302,7 +349,7 @@ SOP2 = { ( -1, -1, -1, 0x2d, 0x36, "s_mul_hi_i32"), } for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOP2: - opcode(name, gfx9, gfx10, Format.SOP2) + opcode(name, gfx7, gfx9, gfx10, Format.SOP2) # SOPK instructions: 0 input (+ imm), 1 output + optional scc @@ -338,7 +385,7 @@ SOPK = { ( -1, -1, -1, -1, 0x1c, "s_subvector_loop_end"), } for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOPK: - opcode(name, gfx9, gfx10, Format.SOPK) + opcode(name, gfx7, gfx9, gfx10, Format.SOPK) # SOP1 instructions: 1 input, 1 output (+optional SCC) @@ -416,7 +463,7 @@ SOP1 = { ( -1, -1, -1, -1, -1, "p_constaddr"), } for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOP1: - opcode(name, gfx9, gfx10, Format.SOP1) + opcode(name, gfx7, gfx9, gfx10, Format.SOP1) # SOPC instructions: 2 inputs and 0 outputs (+SCC) @@ -444,7 +491,7 @@ SOPC = { ( -1, -1, 0x13, 0x13, 0x13, "s_cmp_lg_u64"), } for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOPC: - opcode(name, gfx9, gfx10, Format.SOPC) + opcode(name, gfx7, gfx9, gfx10, Format.SOPC) # SOPP instructions: 0 inputs (+optional scc/vcc), 0 outputs @@ -491,7 +538,7 @@ SOPP = { ( -1, -1, -1, -1, 0x26, "s_ttracedata_imm"), } for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOPP: - opcode(name, gfx9, gfx10, Format.SOPP) + opcode(name, gfx7, gfx9, gfx10, Format.SOPP) # SMEM instructions: sbase input (2 sgpr), potentially 2 offset inputs, 1 sdata input/output @@ -585,14 +632,15 @@ SMEM = { ( -1, -1, -1, 0xac, 0xac, "s_atomic_dec_x2"), } for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SMEM: - opcode(name, gfx9, gfx10, Format.SMEM) + opcode(name, gfx7, gfx9, gfx10, Format.SMEM, is_atomic = "atomic" in name) # VOP2 instructions: 2 inputs, 1 output (+ optional vcc) # TODO: misses some GFX6_7 opcodes which were shifted to VOP3 in GFX8 VOP2 = { # GFX6, GFX7, GFX8, GFX9, GFX10, name, input/output modifiers - (0x00, 0x00, 0x00, 0x00, 0x01, "v_cndmask_b32", False), + (0x01, 0x01, -1, -1, -1, "v_readlane_b32", False), + (0x02, 0x02, -1, -1, -1, "v_writelane_b32", False), (0x03, 0x03, 0x01, 0x01, 0x03, "v_add_f32", True), (0x04, 0x04, 0x02, 0x02, 0x04, "v_sub_f32", True), (0x05, 0x05, 0x03, 0x03, 0x05, "v_subrev_f32", True), @@ -646,7 +694,7 @@ VOP2 = { ( -1, -1, 0x29, 0x29, -1, "v_mul_lo_u16", False), ( -1, -1, 0x2a, 0x2a, -1, "v_lshlrev_b16", False), ( -1, -1, 0x2b, 0x2b, -1, "v_lshrrev_b16", False), - ( -1, -1, 0x2c, 0x2c, -1, "v_ashrrev_b16", False), + ( -1, -1, 0x2c, 0x2c, -1, "v_ashrrev_i16", False), ( -1, -1, 0x2d, 0x2d, 0x39, "v_max_f16", True), ( -1, -1, 0x2e, 0x2e, 0x3a, "v_min_f16", True), ( -1, -1, 0x2f, 0x2f, -1, "v_max_u16", False), @@ -663,7 +711,12 @@ VOP2 = { ( -1, -1, -1, -1, 0x3c, "v_pk_fmac_f16", False), } for (gfx6, gfx7, gfx8, gfx9, gfx10, name, modifiers) in VOP2: - opcode(name, gfx9, gfx10, Format.VOP2, modifiers, modifiers) + opcode(name, gfx7, gfx9, gfx10, Format.VOP2, modifiers, modifiers) + +if True: + # v_cndmask_b32 can use input modifiers but not output modifiers + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x00, 0x00, 0x00, 0x00, 0x01, "v_cndmask_b32") + opcode(name, gfx7, gfx9, gfx10, Format.VOP2, True, False) # VOP1 instructions: instructions with 1 input and 1 output @@ -763,7 +816,7 @@ VOP1 = { ( -1, -1, -1, -1, 0x68, "v_swaprel_b32", False, False), } for (gfx6, gfx7, gfx8, gfx9, gfx10, name, in_mod, out_mod) in VOP1: - opcode(name, gfx9, gfx10, Format.VOP1, in_mod, out_mod) + opcode(name, gfx7, gfx9, gfx10, Format.VOP1, in_mod, out_mod) # VOPC instructions: @@ -777,29 +830,29 @@ VOPC_CLASS = { (0xb8, 0xb8, 0x13, 0x13, 0xb8, "v_cmpx_class_f64"), } for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in VOPC_CLASS: - opcode(name, gfx9, gfx10, Format.VOPC, True, False) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False) COMPF = ["f", "lt", "eq", "le", "gt", "lg", "ge", "o", "u", "nge", "nlg", "ngt", "nle", "neq", "nlt", "tru"] for i in range(8): (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x20+i, 0x20+i, 0xc8+i, "v_cmp_"+COMPF[i]+"_f16") - opcode(name, gfx9, gfx10, Format.VOPC, True, False) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False) (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x30+i, 0x30+i, 0xd8+i, "v_cmpx_"+COMPF[i]+"_f16") - opcode(name, gfx9, gfx10, Format.VOPC, True, False) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False) (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x28+i, 0x28+i, 0xe8+i, "v_cmp_"+COMPF[i+8]+"_f16") - opcode(name, gfx9, gfx10, Format.VOPC, True, False) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False) (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x38+i, 0x38+i, 0xf8+i, "v_cmpx_"+COMPF[i+8]+"_f16") - opcode(name, gfx9, gfx10, Format.VOPC, True, False) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False) for i in range(16): (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x00+i, 0x00+i, 0x40+i, 0x40+i, 0x00+i, "v_cmp_"+COMPF[i]+"_f32") - opcode(name, gfx9, gfx10, Format.VOPC, True, False) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False) (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x10+i, 0x10+i, 0x50+i, 0x50+i, 0x10+i, "v_cmpx_"+COMPF[i]+"_f32") - opcode(name, gfx9, gfx10, Format.VOPC, True, False) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False) (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x20+i, 0x20+i, 0x60+i, 0x60+i, 0x20+i, "v_cmp_"+COMPF[i]+"_f64") - opcode(name, gfx9, gfx10, Format.VOPC, True, False) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False) (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x30+i, 0x30+i, 0x70+i, 0x70+i, 0x30+i, "v_cmpx_"+COMPF[i]+"_f64") - opcode(name, gfx9, gfx10, Format.VOPC, True, False) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False) # GFX_6_7 (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x40+i, 0x40+i, -1, -1, -1, "v_cmps_"+COMPF[i]+"_f32") (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x50+i, 0x50+i, -1, -1, -1, "v_cmpsx_"+COMPF[i]+"_f32") @@ -811,41 +864,41 @@ COMPI = ["f", "lt", "eq", "le", "gt", "lg", "ge", "tru"] # GFX_8_9 for i in [0,7]: # only 0 and 7 (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa0+i, 0xa0+i, -1, "v_cmp_"+COMPI[i]+"_i16") - opcode(name, gfx9, gfx10, Format.VOPC) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb0+i, 0xb0+i, -1, "v_cmpx_"+COMPI[i]+"_i16") - opcode(name, gfx9, gfx10, Format.VOPC) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa8+i, 0xa8+i, -1, "v_cmp_"+COMPI[i]+"_u16") - opcode(name, gfx9, gfx10, Format.VOPC) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb8+i, 0xb8+i, -1, "v_cmpx_"+COMPI[i]+"_u16") - opcode(name, gfx9, gfx10, Format.VOPC) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) for i in range(1, 7): # [1..6] (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa0+i, 0xa0+i, 0x88+i, "v_cmp_"+COMPI[i]+"_i16") - opcode(name, gfx9, gfx10, Format.VOPC) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb0+i, 0xb0+i, 0x98+i, "v_cmpx_"+COMPI[i]+"_i16") - opcode(name, gfx9, gfx10, Format.VOPC) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa8+i, 0xa8+i, 0xa8+i, "v_cmp_"+COMPI[i]+"_u16") - opcode(name, gfx9, gfx10, Format.VOPC) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb8+i, 0xb8+i, 0xb8+i, "v_cmpx_"+COMPI[i]+"_u16") - opcode(name, gfx9, gfx10, Format.VOPC) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) for i in range(8): (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x80+i, 0x80+i, 0xc0+i, 0xc0+i, 0x80+i, "v_cmp_"+COMPI[i]+"_i32") - opcode(name, gfx9, gfx10, Format.VOPC) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x90+i, 0x90+i, 0xd0+i, 0xd0+i, 0x90+i, "v_cmpx_"+COMPI[i]+"_i32") - opcode(name, gfx9, gfx10, Format.VOPC) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xa0+i, 0xa0+i, 0xe0+i, 0xe0+i, 0xa0+i, "v_cmp_"+COMPI[i]+"_i64") - opcode(name, gfx9, gfx10, Format.VOPC) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xb0+i, 0xb0+i, 0xf0+i, 0xf0+i, 0xb0+i, "v_cmpx_"+COMPI[i]+"_i64") - opcode(name, gfx9, gfx10, Format.VOPC) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xc0+i, 0xc0+i, 0xc8+i, 0xc8+i, 0xc0+i, "v_cmp_"+COMPI[i]+"_u32") - opcode(name, gfx9, gfx10, Format.VOPC) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xd0+i, 0xd0+i, 0xd8+i, 0xd8+i, 0xd0+i, "v_cmpx_"+COMPI[i]+"_u32") - opcode(name, gfx9, gfx10, Format.VOPC) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xe0+i, 0xe0+i, 0xe8+i, 0xe8+i, 0xe0+i, "v_cmp_"+COMPI[i]+"_u64") - opcode(name, gfx9, gfx10, Format.VOPC) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xf0+i, 0xf0+i, 0xf8+i, 0xf8+i, 0xf0+i, "v_cmpx_"+COMPI[i]+"_u64") - opcode(name, gfx9, gfx10, Format.VOPC) + opcode(name, gfx7, gfx9, gfx10, Format.VOPC) # VOPP instructions: packed 16bit instructions - 1 or 2 inputs and 1 output @@ -876,7 +929,7 @@ VOPP = { # note that these are only supported on gfx9+ so we'll need to distinguish between gfx8 and gfx9 here # (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, -1, code, code, name) for (code, name) in VOPP: - opcode(name, code, code, Format.VOP3P) + opcode(name, -1, code, code, Format.VOP3P) # VINTERP instructions: @@ -887,7 +940,7 @@ VINTRP = { } # (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name) for (code, name) in VINTRP: - opcode(name, code, code, Format.VINTRP) + opcode(name, code, code, code, Format.VINTRP) # VOP3 instructions: 3 inputs, 1 output # VOP3b instructions: have a unique scalar output, e.g. VOP2 with vcc out @@ -985,8 +1038,8 @@ VOP3 = { ( -1, -1, 0x276, 0x276, -1, "v_interp_p2_legacy_f16", True, True), ( -1, -1, -1, 0x277, 0x35a, "v_interp_p2_f16", True, True), (0x12b, 0x12b, 0x288, 0x288, 0x362, "v_ldexp_f32", False, True), - (0x101, 0x101, 0x289, 0x289, 0x360, "v_readlane_b32", False, False), - (0x102, 0x102, 0x28a, 0x28a, 0x361, "v_writelane_b32", False, False), + ( -1, -1, 0x289, 0x289, 0x360, "v_readlane_b32_e64", False, False), + ( -1, -1, 0x28a, 0x28a, 0x361, "v_writelane_b32_e64", False, False), (0x122, 0x122, 0x28b, 0x28b, 0x364, "v_bcnt_u32_b32", False, False), (0x123, 0x123, 0x28c, 0x28c, 0x365, "v_mbcnt_lo_u32_b32", False, False), (0x124, 0x124, 0x28d, 0x28d, 0x366, "v_mbcnt_hi_u32_b32", False, False), @@ -1012,10 +1065,19 @@ VOP3 = { ( -1, -1, -1, -1, 0x30f, "v_add_co_u32_e64", False, False), ( -1, -1, -1, -1, 0x310, "v_sub_co_u32_e64", False, False), ( -1, -1, -1, -1, 0x319, "v_subrev_co_u32_e64", False, False), -# TODO: many 16bit instructions moved from VOP2 to VOP3 on GFX10 + ( -1, -1, -1, -1, 0x303, "v_add_u16_e64", False, False), + ( -1, -1, -1, -1, 0x304, "v_sub_u16_e64", False, False), + ( -1, -1, -1, -1, 0x305, "v_mul_lo_u16_e64", False, False), + ( -1, -1, -1, -1, 0x309, "v_max_u16_e64", False, False), + ( -1, -1, -1, -1, 0x30a, "v_max_i16_e64", False, False), + ( -1, -1, -1, -1, 0x30b, "v_min_u16_e64", False, False), + ( -1, -1, -1, -1, 0x30c, "v_min_i16_e64", False, False), + ( -1, -1, -1, -1, 0x307, "v_lshrrev_b16_e64", False, False), + ( -1, -1, -1, -1, 0x308, "v_ashrrev_i16_e64", False, False), + ( -1, -1, -1, -1, 0x314, "v_lshlrev_b16_e64", False, False), } for (gfx6, gfx7, gfx8, gfx9, gfx10, name, in_mod, out_mod) in VOP3: - opcode(name, gfx9, gfx10, Format.VOP3A, in_mod, out_mod) + opcode(name, gfx7, gfx9, gfx10, Format.VOP3A, in_mod, out_mod) # DS instructions: 3 inputs (1 addr, 2 data), 1 output @@ -1177,7 +1239,7 @@ DS = { ( -1, 0xff, 0xff, 0xff, 0xff, "ds_read_b128"), } for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in DS: - opcode(name, gfx9, gfx10, Format.DS) + opcode(name, gfx7, gfx9, gfx10, Format.DS) # MUBUF instructions: MUBUF = { @@ -1262,7 +1324,7 @@ MUBUF = { ( -1, -1, -1, -1, 0x72, "buffer_gl1_inv"), } for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in MUBUF: - opcode(name, gfx9, gfx10, Format.MUBUF) + opcode(name, gfx7, gfx9, gfx10, Format.MUBUF, is_atomic = "atomic" in name) MTBUF = { (0x00, 0x00, 0x00, 0x00, 0x00, "tbuffer_load_format_x"), @@ -1283,7 +1345,7 @@ MTBUF = { ( -1, -1, 0x0f, 0x0f, 0x0f, "tbuffer_store_format_d16_xyzw"), } for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in MTBUF: - opcode(name, gfx9, gfx10, Format.MTBUF) + opcode(name, gfx7, gfx9, gfx10, Format.MTBUF) IMAGE = { @@ -1302,7 +1364,7 @@ IMAGE = { } # (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name) for (code, name) in IMAGE: - opcode(name, code, code, Format.MIMG) + opcode(name, code, code, code, Format.MIMG) IMAGE_ATOMIC = { (0x0f, 0x0f, 0x10, "image_atomic_swap"), @@ -1326,7 +1388,7 @@ IMAGE_ATOMIC = { # (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (gfx6, gfx7, gfx89, gfx89, ???, name) # gfx7 and gfx10 opcodes are the same here for (gfx6, gfx7, gfx89, name) in IMAGE_ATOMIC: - opcode(name, gfx89, gfx7, Format.MIMG) + opcode(name, gfx7, gfx89, gfx7, Format.MIMG, is_atomic = True) IMAGE_SAMPLE = { (0x20, "image_sample"), @@ -1372,7 +1434,7 @@ IMAGE_SAMPLE = { } # (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name) for (code, name) in IMAGE_SAMPLE: - opcode(name, code, code, Format.MIMG) + opcode(name, code, code, code, Format.MIMG) IMAGE_GATHER4 = { (0x40, "image_gather4"), @@ -1405,7 +1467,7 @@ IMAGE_GATHER4 = { } # (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name) for (code, name) in IMAGE_GATHER4: - opcode(name, code, code, Format.MIMG) + opcode(name, code, code, code, Format.MIMG) FLAT = { @@ -1466,7 +1528,7 @@ FLAT = { (0x60, -1, 0x60, "flat_atomic_fmax_x2"), } for (gfx7, gfx8, gfx10, name) in FLAT: - opcode(name, gfx8, gfx10, Format.FLAT) + opcode(name, gfx7, gfx8, gfx10, Format.FLAT, is_atomic = "atomic" in name) GLOBAL = { #GFX8_9, GFX10 @@ -1526,7 +1588,7 @@ GLOBAL = { ( -1, 0x60, "global_atomic_fmax_x2"), } for (gfx8, gfx10, name) in GLOBAL: - opcode(name, gfx8, gfx10, Format.GLOBAL) + opcode(name, -1, gfx8, gfx10, Format.GLOBAL, is_atomic = "atomic" in name) SCRATCH = { #GFX8_9, GFX10 @@ -1554,7 +1616,7 @@ SCRATCH = { (0x25, 0x25, "scratch_load_short_d16_hi"), } for (gfx8, gfx10, name) in SCRATCH: - opcode(name, gfx8, gfx10, Format.SCRATCH) + opcode(name, -1, gfx8, gfx10, Format.SCRATCH) # check for duplicate opcode numbers for ver in ['gfx9', 'gfx10']: