X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=blobdiff_plain;f=src%2Famd%2Fcompiler%2Faco_opcodes.py;h=89e30d734f63870ff5d9848b778000d0e950133a;hp=65e739b06443de829784ca0cfd4f343d7b0069b6;hb=51bc11abc206ae5ea0946f5a79c68527701c24e0;hpb=6a586a60067ccc7337a3bb047e21ecc2384cc56a diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index 65e739b0644..89e30d734f6 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -49,15 +49,15 @@ class Format(Enum): PSEUDO_BRANCH = 16 PSEUDO_BARRIER = 17 PSEUDO_REDUCTION = 18 + VOP3P = 19 VOP1 = 1 << 8 VOP2 = 1 << 9 VOPC = 1 << 10 VOP3A = 1 << 11 VOP3B = 1 << 11 - VOP3P = 1 << 12 - VINTRP = 1 << 13 - DPP = 1 << 14 - SDWA = 1 << 15 + VINTRP = 1 << 12 + DPP = 1 << 13 + SDWA = 1 << 14 def get_builder_fields(self): if self == Format.SOPK: @@ -77,7 +77,6 @@ class Format(Enum): elif self == Format.MTBUF: return [('unsigned', 'dfmt', None), ('unsigned', 'nfmt', None), - ('unsigned', 'img_format', None), ('unsigned', 'offset', None), ('bool', 'offen', None), ('bool', 'idxen', 'false'), @@ -85,12 +84,12 @@ class Format(Enum): ('bool', 'glc', 'false'), ('bool', 'dlc', 'false'), ('bool', 'slc', 'false'), - ('bool', 'tfe', 'false'), - ('bool', 'lds', 'false')] + ('bool', 'tfe', 'false')] elif self == Format.MUBUF: return [('unsigned', 'offset', None), ('bool', 'offen', None), ('bool', 'idxen', 'false'), + ('bool', 'addr64', 'false'), ('bool', 'disable_wqm', 'false'), ('bool', 'glc', 'false'), ('bool', 'dlc', 'false'), @@ -130,7 +129,7 @@ class Format(Enum): return [('uint16_t', 'dpp_ctrl', None), ('uint8_t', 'row_mask', '0xF'), ('uint8_t', 'bank_mask', '0xF'), - ('bool', 'bound_ctrl', 'false')] + ('bool', 'bound_ctrl', 'true')] elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]: return [('uint16_t', 'offset', 0), ('bool', 'can_reorder', 'true'), @@ -150,12 +149,21 @@ class Format(Enum): def get_builder_field_decls(self): return [('%s %s=%s' % (f[0], f[1], f[2]) if f[2] != None else '%s %s' % (f[0], f[1])) for f in self.get_builder_fields()] + def get_builder_initialization(self, num_operands): + res = '' + if self == Format.SDWA: + for i in range(min(num_operands, 2)): + res += 'instr->sel[{0}] = op{0}.op.bytes() == 2 ? sdwa_uword : (op{0}.op.bytes() == 1 ? sdwa_ubyte : sdwa_udword);\n'.format(i) + res += 'instr->dst_sel = def0.bytes() == 2 ? sdwa_uword : (def0.bytes() == 1 ? sdwa_ubyte : sdwa_udword);\n' + res += 'instr->dst_preserve = true;' + return res + class Opcode(object): """Class that represents all the information we have about the opcode NOTE: this must be kept in sync with aco_op_info """ - def __init__(self, name, opcode_gfx7, opcode_gfx9, opcode_gfx10, format, input_mod, output_mod): + def __init__(self, name, opcode_gfx7, opcode_gfx9, opcode_gfx10, format, input_mod, output_mod, is_atomic): """Parameters: - name is the name of the opcode (prepend nir_op_ for the enum name) @@ -180,15 +188,53 @@ class Opcode(object): self.opcode_gfx10 = opcode_gfx10 self.input_mod = "1" if input_mod else "0" self.output_mod = "1" if output_mod else "0" + self.is_atomic = "1" if is_atomic else "0" self.format = format + parts = name.replace('_e64', '').rsplit('_', 2) + op_dtype = parts[-1] + def_dtype = parts[-2] if len(parts) > 1 else parts[-1] + + def_dtype_sizes = {'{}{}'.format(prefix, size) : size for prefix in 'biuf' for size in [64, 32, 24, 16]} + op_dtype_sizes = {k:v for k, v in def_dtype_sizes.items()} + # inline constants are 32-bit for 16-bit integer/typeless instructions: https://reviews.llvm.org/D81841 + op_dtype_sizes['b16'] = 32 + op_dtype_sizes['i16'] = 32 + op_dtype_sizes['u16'] = 32 + + self.operand_size = op_dtype_sizes.get(op_dtype, 0) + self.definition_size = def_dtype_sizes.get(def_dtype, self.operand_size) + + # exceptions + if self.operand_size == 16 and op_dtype != 'f16': + self.operand_size = 16 + elif self.operand_size == 24: + self.operand_size = 32 + elif name in ['s_sext_i32_i8', 's_sext_i32_i16', 'v_msad_u8', 'v_cvt_pk_u16_u32', 'v_cvt_pk_i16_i32']: + self.operand_size = 32 + elif name in ['v_qsad_pk_u16_u8', 'v_mqsad_pk_u16_u8', 'v_mqsad_u32_u8']: + self.definition_size = 0 + self.operand_size = 0 + elif name in ['v_mad_u64_u32', 'v_mad_i64_i32']: + self.operand_size = 0 + elif '_pk_' in name or name in ['v_lerp_u8', 'v_sad_u8', 'v_sad_u16', + 'v_cvt_f32_ubyte0', 'v_cvt_f32_ubyte1', + 'v_cvt_f32_ubyte2', 'v_cvt_f32_ubyte3']: + self.operand_size = 32 + self.definition_size = 32 + elif '_pknorm_' in name: + self.definition_size = 32 + elif format == Format.PSEUDO_REDUCTION: + # 64-bit reductions can have a larger definition size, but get_subdword_definition_info() handles that + self.definition_size = 32 + # global dictionary of opcodes opcodes = {} -def opcode(name, opcode_gfx7 = -1, opcode_gfx9 = -1, opcode_gfx10 = -1, format = Format.PSEUDO, input_mod = False, output_mod = False): +def opcode(name, opcode_gfx7 = -1, opcode_gfx9 = -1, opcode_gfx10 = -1, format = Format.PSEUDO, input_mod = False, output_mod = False, is_atomic = False): assert name not in opcodes - opcodes[name] = Opcode(name, opcode_gfx7, opcode_gfx9, opcode_gfx10, format, input_mod, output_mod) + opcodes[name] = Opcode(name, opcode_gfx7, opcode_gfx9, opcode_gfx10, format, input_mod, output_mod, is_atomic) opcode("exp", 0, 0, 0, format = Format.EXP) opcode("p_parallelcopy") @@ -212,19 +258,19 @@ opcode("p_reduce", format=Format.PSEUDO_REDUCTION) opcode("p_inclusive_scan", format=Format.PSEUDO_REDUCTION) # e.g. subgroupExclusiveMin() opcode("p_exclusive_scan", format=Format.PSEUDO_REDUCTION) -# simulates proper bpermute behavior on GFX10 wave64 -opcode("p_wave64_bpermute", format=Format.PSEUDO_REDUCTION) opcode("p_branch", format=Format.PSEUDO_BRANCH) opcode("p_cbranch", format=Format.PSEUDO_BRANCH) opcode("p_cbranch_z", format=Format.PSEUDO_BRANCH) opcode("p_cbranch_nz", format=Format.PSEUDO_BRANCH) -opcode("p_memory_barrier_all", format=Format.PSEUDO_BARRIER) +opcode("p_memory_barrier_common", format=Format.PSEUDO_BARRIER) # atomic, buffer, image and shared opcode("p_memory_barrier_atomic", format=Format.PSEUDO_BARRIER) opcode("p_memory_barrier_buffer", format=Format.PSEUDO_BARRIER) opcode("p_memory_barrier_image", format=Format.PSEUDO_BARRIER) opcode("p_memory_barrier_shared", format=Format.PSEUDO_BARRIER) +opcode("p_memory_barrier_gs_data", format=Format.PSEUDO_BARRIER) +opcode("p_memory_barrier_gs_sendmsg", format=Format.PSEUDO_BARRIER) opcode("p_spill") opcode("p_reload") @@ -242,6 +288,8 @@ opcode("p_exit_early_if") opcode("p_fs_buffer_store_smem", format=Format.SMEM) +# simulates proper bpermute behavior when it's unsupported, eg. GFX10 wave64 +opcode("p_bpermute") # SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc) SOP2 = { @@ -584,14 +632,13 @@ SMEM = { ( -1, -1, -1, 0xac, 0xac, "s_atomic_dec_x2"), } for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SMEM: - opcode(name, gfx7, gfx9, gfx10, Format.SMEM) + opcode(name, gfx7, gfx9, gfx10, Format.SMEM, is_atomic = "atomic" in name) # VOP2 instructions: 2 inputs, 1 output (+ optional vcc) # TODO: misses some GFX6_7 opcodes which were shifted to VOP3 in GFX8 VOP2 = { # GFX6, GFX7, GFX8, GFX9, GFX10, name, input/output modifiers - (0x00, 0x00, 0x00, 0x00, 0x01, "v_cndmask_b32", False), (0x01, 0x01, -1, -1, -1, "v_readlane_b32", False), (0x02, 0x02, -1, -1, -1, "v_writelane_b32", False), (0x03, 0x03, 0x01, 0x01, 0x03, "v_add_f32", True), @@ -625,6 +672,7 @@ VOP2 = { (0x1f, 0x1f, 0x16, 0x16, 0x1f, "v_mac_f32", True), (0x20, 0x20, 0x17, 0x17, 0x20, "v_madmk_f32", False), (0x21, 0x21, 0x18, 0x18, 0x21, "v_madak_f32", False), + (0x24, 0x24, -1, -1, -1, "v_mbcnt_hi_u32_b32", False), (0x25, 0x25, 0x19, 0x19, -1, "v_add_co_u32", False), # VOP3B only in RDNA (0x26, 0x26, 0x1a, 0x1a, -1, "v_sub_co_u32", False), # VOP3B only in RDNA (0x27, 0x27, 0x1b, 0x1b, -1, "v_subrev_co_u32", False), # VOP3B only in RDNA @@ -647,7 +695,7 @@ VOP2 = { ( -1, -1, 0x29, 0x29, -1, "v_mul_lo_u16", False), ( -1, -1, 0x2a, 0x2a, -1, "v_lshlrev_b16", False), ( -1, -1, 0x2b, 0x2b, -1, "v_lshrrev_b16", False), - ( -1, -1, 0x2c, 0x2c, -1, "v_ashrrev_b16", False), + ( -1, -1, 0x2c, 0x2c, -1, "v_ashrrev_i16", False), ( -1, -1, 0x2d, 0x2d, 0x39, "v_max_f16", True), ( -1, -1, 0x2e, 0x2e, 0x3a, "v_min_f16", True), ( -1, -1, 0x2f, 0x2f, -1, "v_max_u16", False), @@ -666,6 +714,11 @@ VOP2 = { for (gfx6, gfx7, gfx8, gfx9, gfx10, name, modifiers) in VOP2: opcode(name, gfx7, gfx9, gfx10, Format.VOP2, modifiers, modifiers) +if True: + # v_cndmask_b32 can use input modifiers but not output modifiers + (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x00, 0x00, 0x00, 0x00, 0x01, "v_cndmask_b32") + opcode(name, gfx7, gfx9, gfx10, Format.VOP2, True, False) + # VOP1 instructions: instructions with 1 input and 1 output VOP1 = { @@ -681,6 +734,7 @@ VOP1 = { (0x08, 0x08, 0x08, 0x08, 0x08, "v_cvt_i32_f32", True, False), (0x09, 0x09, -1, -1, 0x09, "v_mov_fed_b32", True, False), # LLVM mentions it for GFX8_9 (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "v_cvt_f16_f32", True, True), + ( -1, -1, -1, -1, -1, "p_cvt_f16_f32_rtne", True, True), (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "v_cvt_f32_f16", True, True), (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "v_cvt_rpi_i32_f32", True, False), (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, "v_cvt_flr_i32_f32", True, False), @@ -990,7 +1044,7 @@ VOP3 = { ( -1, -1, 0x28a, 0x28a, 0x361, "v_writelane_b32_e64", False, False), (0x122, 0x122, 0x28b, 0x28b, 0x364, "v_bcnt_u32_b32", False, False), (0x123, 0x123, 0x28c, 0x28c, 0x365, "v_mbcnt_lo_u32_b32", False, False), - (0x124, 0x124, 0x28d, 0x28d, 0x366, "v_mbcnt_hi_u32_b32", False, False), + ( -1, -1, 0x28d, 0x28d, 0x366, "v_mbcnt_hi_u32_b32_e64", False, False), ( -1, -1, 0x28f, 0x28f, 0x2ff, "v_lshlrev_b64", False, False), ( -1, -1, 0x290, 0x290, 0x300, "v_lshrrev_b64", False, False), ( -1, -1, 0x291, 0x291, 0x301, "v_ashrrev_i64", False, False), @@ -1013,7 +1067,16 @@ VOP3 = { ( -1, -1, -1, -1, 0x30f, "v_add_co_u32_e64", False, False), ( -1, -1, -1, -1, 0x310, "v_sub_co_u32_e64", False, False), ( -1, -1, -1, -1, 0x319, "v_subrev_co_u32_e64", False, False), -# TODO: many 16bit instructions moved from VOP2 to VOP3 on GFX10 + ( -1, -1, -1, -1, 0x303, "v_add_u16_e64", False, False), + ( -1, -1, -1, -1, 0x304, "v_sub_u16_e64", False, False), + ( -1, -1, -1, -1, 0x305, "v_mul_lo_u16_e64", False, False), + ( -1, -1, -1, -1, 0x309, "v_max_u16_e64", False, False), + ( -1, -1, -1, -1, 0x30a, "v_max_i16_e64", False, False), + ( -1, -1, -1, -1, 0x30b, "v_min_u16_e64", False, False), + ( -1, -1, -1, -1, 0x30c, "v_min_i16_e64", False, False), + ( -1, -1, -1, -1, 0x307, "v_lshrrev_b16_e64", False, False), + ( -1, -1, -1, -1, 0x308, "v_ashrrev_i16_e64", False, False), + ( -1, -1, -1, -1, 0x314, "v_lshlrev_b16_e64", False, False), } for (gfx6, gfx7, gfx8, gfx9, gfx10, name, in_mod, out_mod) in VOP3: opcode(name, gfx7, gfx9, gfx10, Format.VOP3A, in_mod, out_mod) @@ -1263,7 +1326,7 @@ MUBUF = { ( -1, -1, -1, -1, 0x72, "buffer_gl1_inv"), } for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in MUBUF: - opcode(name, gfx7, gfx9, gfx10, Format.MUBUF) + opcode(name, gfx7, gfx9, gfx10, Format.MUBUF, is_atomic = "atomic" in name) MTBUF = { (0x00, 0x00, 0x00, 0x00, 0x00, "tbuffer_load_format_x"), @@ -1327,7 +1390,7 @@ IMAGE_ATOMIC = { # (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (gfx6, gfx7, gfx89, gfx89, ???, name) # gfx7 and gfx10 opcodes are the same here for (gfx6, gfx7, gfx89, name) in IMAGE_ATOMIC: - opcode(name, gfx7, gfx89, gfx7, Format.MIMG) + opcode(name, gfx7, gfx89, gfx7, Format.MIMG, is_atomic = True) IMAGE_SAMPLE = { (0x20, "image_sample"), @@ -1467,7 +1530,7 @@ FLAT = { (0x60, -1, 0x60, "flat_atomic_fmax_x2"), } for (gfx7, gfx8, gfx10, name) in FLAT: - opcode(name, gfx7, gfx8, gfx10, Format.FLAT) + opcode(name, gfx7, gfx8, gfx10, Format.FLAT, is_atomic = "atomic" in name) GLOBAL = { #GFX8_9, GFX10 @@ -1527,7 +1590,7 @@ GLOBAL = { ( -1, 0x60, "global_atomic_fmax_x2"), } for (gfx8, gfx10, name) in GLOBAL: - opcode(name, -1, gfx8, gfx10, Format.GLOBAL) + opcode(name, -1, gfx8, gfx10, Format.GLOBAL, is_atomic = "atomic" in name) SCRATCH = { #GFX8_9, GFX10