# Instruction-format IDs (apparently members of a Format enum whose header is
# above this excerpt — confirm).  Pseudo formats are small integers; hardware
# encodings each occupy a distinct high bit (VOP3A and VOP3B deliberately
# share 1 << 11).  VOP3P now lives in the pseudo range, so VINTRP/DPP/SDWA
# each moved down one bit; the stale duplicate definitions and leftover
# '+'/'-' diff markers (which made the module unparseable) are removed.
PSEUDO_BRANCH = 16
PSEUDO_BARRIER = 17
PSEUDO_REDUCTION = 18
VOP3P = 19
VOP1 = 1 << 8
VOP2 = 1 << 9
VOPC = 1 << 10
VOP3A = 1 << 11
VOP3B = 1 << 11
VINTRP = 1 << 12
DPP = 1 << 13
SDWA = 1 << 14
# Returns a list of (c_type, field_name, default_or_None) tuples describing the
# extra per-format fields a generated builder takes; consumed by
# get_builder_field_decls() below to emit 'type name=default' declarations.
# NOTE(review): this excerpt is a partial diff view — the SOPK branch body and
# several other format branches are elided, indentation is stripped, and
# '-'/'+' hunk markers remain; everything below is kept verbatim.
def get_builder_fields(self):
if self == Format.SOPK:
elif self == Format.MTBUF:
# MTBUF: per the hunk, 'img_format' was dropped in favor of separate
# dfmt/nfmt fields, and the 'lds' flag was removed — TODO confirm upstream.
return [('unsigned', 'dfmt', None),
('unsigned', 'nfmt', None),
- ('unsigned', 'img_format', None),
('unsigned', 'offset', None),
('bool', 'offen', None),
('bool', 'idxen', 'false'),
('bool', 'glc', 'false'),
('bool', 'dlc', 'false'),
('bool', 'slc', 'false'),
- ('bool', 'tfe', 'false'),
- ('bool', 'lds', 'false')]
+ ('bool', 'tfe', 'false')]
elif self == Format.MUBUF:
# MUBUF: hunk adds an 'addr64' flag; the branch's tail is elided here.
return [('unsigned', 'offset', None),
('bool', 'offen', None),
('bool', 'idxen', 'false'),
+ ('bool', 'addr64', 'false'),
('bool', 'disable_wqm', 'false'),
('bool', 'glc', 'false'),
('bool', 'dlc', 'false'),
def get_builder_field_decls(self):
    """Render this format's builder fields as declaration strings:
    'type name=default' when a default exists, else 'type name'."""
    decls = []
    for field_type, field_name, default in self.get_builder_fields():
        if default is None:
            decls.append('%s %s' % (field_type, field_name))
        else:
            decls.append('%s %s=%s' % (field_type, field_name, default))
    return decls
def get_builder_initialization(self, num_operands):
    """Return C++ statements (as a string) that initialize format-specific
    instruction fields in the generated builder.

    Only SDWA needs any today: each of the first two operands gets a 'sel'
    matching its size in bytes (1 -> sdwa_ubyte, 2 -> sdwa_uword, else
    sdwa_udword), and the destination sel/preserve are set the same way.
    Other formats return the empty string.  (Leftover '+' diff markers that
    broke the syntax of this method have been removed.)
    """
    res = ''
    if self == Format.SDWA:
        # Only the first two operands carry sel fields, hence min(..., 2).
        for i in range(min(num_operands, 2)):
            res += 'instr->sel[{0}] = op{0}.op.bytes() == 2 ? sdwa_uword : (op{0}.op.bytes() == 1 ? sdwa_ubyte : sdwa_udword);\n'.format(i)
        res += 'instr->dst_sel = def0.bytes() == 2 ? sdwa_uword : (def0.bytes() == 1 ? sdwa_ubyte : sdwa_udword);\n'
        res += 'instr->dst_preserve = true;'
    return res
# NOTE(review): truncated excerpt — the docstring opened below is unterminated
# here and the __init__ signature is not visible; all code lines (including
# the '+' diff markers) are kept verbatim.
class Opcode(object):
"""Class that represents all the information we have about the opcode
self.is_atomic = "1" if is_atomic else "0"
self.format = format
# The lines below derive operand/definition bit sizes from the opcode name's
# dtype suffixes (e.g. 'v_cvt_f32_u32' -> def 'f32', operand 'u32'), with a
# table of named exceptions afterwards.
+ parts = name.replace('_e64', '').rsplit('_', 2)
+ op_dtype = parts[-1]
+ def_dtype = parts[-2] if len(parts) > 1 else parts[-1]
+
+ def_dtype_sizes = {'{}{}'.format(prefix, size) : size for prefix in 'biuf' for size in [64, 32, 24, 16]}
+ op_dtype_sizes = {k:v for k, v in def_dtype_sizes.items()}
+ # inline constants are 32-bit for 16-bit integer/typeless instructions: https://reviews.llvm.org/D81841
+ op_dtype_sizes['b16'] = 32
+ op_dtype_sizes['i16'] = 32
+ op_dtype_sizes['u16'] = 32
+
+ self.operand_size = op_dtype_sizes.get(op_dtype, 0)
+ self.definition_size = def_dtype_sizes.get(def_dtype, self.operand_size)
+
+ # exceptions
# NOTE(review): the first branch assigns 16 to a value already known to be
# 16 — looks like a no-op; confirm whether 32 (or another size) was intended.
+ if self.operand_size == 16 and op_dtype != 'f16':
+ self.operand_size = 16
+ elif self.operand_size == 24:
+ self.operand_size = 32
+ elif name in ['s_sext_i32_i8', 's_sext_i32_i16', 'v_msad_u8', 'v_cvt_pk_u16_u32', 'v_cvt_pk_i16_i32']:
+ self.operand_size = 32
+ elif name in ['v_qsad_pk_u16_u8', 'v_mqsad_pk_u16_u8', 'v_mqsad_u32_u8']:
+ self.definition_size = 0
+ self.operand_size = 0
+ elif name in ['v_mad_u64_u32', 'v_mad_i64_i32']:
+ self.operand_size = 0
+ elif '_pk_' in name or name in ['v_lerp_u8', 'v_sad_u8', 'v_sad_u16',
+ 'v_cvt_f32_ubyte0', 'v_cvt_f32_ubyte1',
+ 'v_cvt_f32_ubyte2', 'v_cvt_f32_ubyte3']:
+ self.operand_size = 32
+ self.definition_size = 32
+ elif '_pknorm_' in name:
+ self.definition_size = 32
+ elif format == Format.PSEUDO_REDUCTION:
+ # 64-bit reductions can have a larger definition size, but get_subdword_definition_info() handles that
+ self.definition_size = 32
+
# global dictionary of opcodes
opcodes = {}

def opcode(name, opcode_gfx7 = -1, opcode_gfx9 = -1, opcode_gfx10 = -1, format = Format.PSEUDO, input_mod = False, output_mod = False, is_atomic = False):
    """Register an Opcode in the global 'opcodes' table.

    Each name may be registered only once (asserted).  Per the resolved diff
    hunk, is_atomic now defaults to False — the old True default wrongly
    flagged every opcode atomic; callers mark atomics explicitly, e.g.
    is_atomic = "atomic" in name.
    """
    assert name not in opcodes
    opcodes[name] = Opcode(name, opcode_gfx7, opcode_gfx9, opcode_gfx10, format, input_mod, output_mod, is_atomic)
# Pseudo-opcode registrations, with the diff hunks resolved: the leftover
# '-' lines (p_wave64_bpermute, p_memory_barrier_all) are dropped and the
# '+' replacements kept.
opcode("p_inclusive_scan", format=Format.PSEUDO_REDUCTION)
# e.g. subgroupExclusiveMin()
opcode("p_exclusive_scan", format=Format.PSEUDO_REDUCTION)

opcode("p_branch", format=Format.PSEUDO_BRANCH)
opcode("p_cbranch", format=Format.PSEUDO_BRANCH)
opcode("p_cbranch_z", format=Format.PSEUDO_BRANCH)
opcode("p_cbranch_nz", format=Format.PSEUDO_BRANCH)

opcode("p_memory_barrier_common", format=Format.PSEUDO_BARRIER) # atomic, buffer, image and shared
opcode("p_memory_barrier_atomic", format=Format.PSEUDO_BARRIER)
opcode("p_memory_barrier_buffer", format=Format.PSEUDO_BARRIER)
opcode("p_memory_barrier_image", format=Format.PSEUDO_BARRIER)
opcode("p_memory_barrier_shared", format=Format.PSEUDO_BARRIER)
opcode("p_memory_barrier_gs_data", format=Format.PSEUDO_BARRIER)
opcode("p_memory_barrier_gs_sendmsg", format=Format.PSEUDO_BARRIER)

opcode("p_spill")
opcode("p_reload")

opcode("p_fs_buffer_store_smem", format=Format.SMEM)

# simulates proper bpermute behavior when it's unsupported, eg. GFX10 wave64
opcode("p_bpermute")
# SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc)
SOP2 = {
# NOTE(review): excerpt elides most table entries — the s_atomic_* tuple and
# the closing brace below almost certainly belong to a different (SMEM-style)
# table than the 'SOP2 = {' opened above; kept verbatim.
( -1, -1, -1, 0xac, 0xac, "s_atomic_dec_x2"),
}
# Register every SMEM table entry.  Bug fix per the resolved hunk: the old
# predicate '"atomic" not in name' was inverted, marking every *non*-atomic
# SMEM op as atomic and the real s_atomic_* ops as non-atomic.
for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SMEM:
    opcode(name, gfx7, gfx9, gfx10, Format.SMEM, is_atomic = "atomic" in name)
# VOP2 instructions: 2 inputs, 1 output (+ optional vcc)
# NOTE(review): partial diff view — the table header assignment and most
# entries are elided, and the 7-column VOP2 rows below are followed by
# 8-column rows from a different (VOP3) table; all lines kept verbatim.
# Hunk at v_ashrrev: renamed _b16 -> _i16 (arithmetic shift right is signed,
# matching the ISA mnemonic — TODO confirm against the GCN/RDNA ISA docs).
( -1, -1, 0x29, 0x29, -1, "v_mul_lo_u16", False),
( -1, -1, 0x2a, 0x2a, -1, "v_lshlrev_b16", False),
( -1, -1, 0x2b, 0x2b, -1, "v_lshrrev_b16", False),
- ( -1, -1, 0x2c, 0x2c, -1, "v_ashrrev_b16", False),
+ ( -1, -1, 0x2c, 0x2c, -1, "v_ashrrev_i16", False),
( -1, -1, 0x2d, 0x2d, 0x39, "v_max_f16", True),
( -1, -1, 0x2e, 0x2e, 0x3a, "v_min_f16", True),
( -1, -1, 0x2f, 0x2f, -1, "v_max_u16", False),
( -1, -1, -1, -1, 0x30f, "v_add_co_u32_e64", False, False),
( -1, -1, -1, -1, 0x310, "v_sub_co_u32_e64", False, False),
( -1, -1, -1, -1, 0x319, "v_subrev_co_u32_e64", False, False),
# Hunk below: the TODO placeholder is replaced with explicit entries for the
# 16-bit instructions that exist only as VOP3-encoded (_e64) ops on GFX10.
-# TODO: many 16bit instructions moved from VOP2 to VOP3 on GFX10
+ ( -1, -1, -1, -1, 0x303, "v_add_u16_e64", False, False),
+ ( -1, -1, -1, -1, 0x304, "v_sub_u16_e64", False, False),
+ ( -1, -1, -1, -1, 0x305, "v_mul_lo_u16_e64", False, False),
+ ( -1, -1, -1, -1, 0x309, "v_max_u16_e64", False, False),
+ ( -1, -1, -1, -1, 0x30a, "v_max_i16_e64", False, False),
+ ( -1, -1, -1, -1, 0x30b, "v_min_u16_e64", False, False),
+ ( -1, -1, -1, -1, 0x30c, "v_min_i16_e64", False, False),
+ ( -1, -1, -1, -1, 0x307, "v_lshrrev_b16_e64", False, False),
+ ( -1, -1, -1, -1, 0x308, "v_ashrrev_i16_e64", False, False),
+ ( -1, -1, -1, -1, 0x314, "v_lshlrev_b16_e64", False, False),
}
# Register every entry of the VOP3 table under the VOP3A encoding,
# forwarding its input/output-modifier flags.
for entry in VOP3:
    (gfx6, gfx7, gfx8, gfx9, gfx10, name, in_mod, out_mod) = entry
    opcode(name, gfx7, gfx9, gfx10, Format.VOP3A, in_mod, out_mod)
( -1, -1, -1, -1, 0x72, "buffer_gl1_inv"),
}
# Register every MUBUF table entry.  Bug fix per the resolved hunk: the
# is_atomic predicate was inverted ('"atomic" not in name'), flagging the
# buffer_atomic_* ops as non-atomic and everything else as atomic.
for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in MUBUF:
    opcode(name, gfx7, gfx9, gfx10, Format.MUBUF, is_atomic = "atomic" in name)
MTBUF = {
(0x00, 0x00, 0x00, 0x00, 0x00, "tbuffer_load_format_x"),
# (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (gfx6, gfx7, gfx89, gfx89, ???, name)
# gfx7 and gfx10 opcodes are the same here
# Bug fix per the resolved hunk: these are the image_atomic_* ops, so
# is_atomic must be True — the old 'is_atomic = False' was wrong.
for (gfx6, gfx7, gfx89, name) in IMAGE_ATOMIC:
    opcode(name, gfx7, gfx89, gfx7, Format.MIMG, is_atomic = True)
IMAGE_SAMPLE = {
(0x20, "image_sample"),
(0x60, -1, 0x60, "flat_atomic_fmax_x2"),
}
# Register every FLAT table entry.  Bug fix per the resolved hunk: the
# inverted predicate '"atomic" not in name' mislabeled flat_atomic_* ops
# as non-atomic.
for (gfx7, gfx8, gfx10, name) in FLAT:
    opcode(name, gfx7, gfx8, gfx10, Format.FLAT, is_atomic = "atomic" in name)
GLOBAL = {
#GFX8_9, GFX10
( -1, 0x60, "global_atomic_fmax_x2"),
}
# Register every GLOBAL table entry (no GFX7 encoding, hence -1).  Bug fix
# per the resolved hunk: the inverted predicate '"atomic" not in name'
# mislabeled global_atomic_* ops as non-atomic.
for (gfx8, gfx10, name) in GLOBAL:
    opcode(name, -1, gfx8, gfx10, Format.GLOBAL, is_atomic = "atomic" in name)
SCRATCH = {
#GFX8_9, GFX10