aco: set tcs_in_out_eq=false if float controls of VS and TCS stages differ

[mesa.git] / src / amd / compiler / aco_opcodes.py
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py

index 1b9c3c7a155cbad7aa8470ba5a5dcf764704518d..89e30d734f63870ff5d9848b778000d0e950133a 100644 (file)
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -27,6 +27,7 @@
  # Class that represents all the information we have about the opcode
  # NOTE: this must be kept in sync with aco_op_info
  
  # Class that represents all the information we have about the opcode
  # NOTE: this must be kept in sync with aco_op_info
  
+import sys
  from enum import Enum
  
  class Format(Enum):
  from enum import Enum
  
  class Format(Enum):
@@ -48,15 +49,15 @@ class Format(Enum):
     PSEUDO_BRANCH = 16
     PSEUDO_BARRIER = 17
     PSEUDO_REDUCTION = 18
     PSEUDO_BRANCH = 16
     PSEUDO_BARRIER = 17
     PSEUDO_REDUCTION = 18
+   VOP3P = 19
     VOP1 = 1 << 8
     VOP2 = 1 << 9
     VOPC = 1 << 10
     VOP3A = 1 << 11
     VOP3B = 1 << 11
     VOP1 = 1 << 8
     VOP2 = 1 << 9
     VOPC = 1 << 10
     VOP3A = 1 << 11
     VOP3B = 1 << 11
-   VOP3P = 1 << 12
-   VINTRP = 1 << 13
-   DPP = 1 << 14
-   SDWA = 1 << 15
+   VINTRP = 1 << 12
+   DPP = 1 << 13
+   SDWA = 1 << 14
  
     def get_builder_fields(self):
        if self == Format.SOPK:
  
     def get_builder_fields(self):
        if self == Format.SOPK:
@@ -76,7 +77,6 @@ class Format(Enum):
        elif self == Format.MTBUF:
           return [('unsigned', 'dfmt', None),
                   ('unsigned', 'nfmt', None),
        elif self == Format.MTBUF:
           return [('unsigned', 'dfmt', None),
                   ('unsigned', 'nfmt', None),
-                 ('unsigned', 'img_format', None),
                   ('unsigned', 'offset', None),
                   ('bool', 'offen', None),
                   ('bool', 'idxen', 'false'),
                   ('unsigned', 'offset', None),
                   ('bool', 'offen', None),
                   ('bool', 'idxen', 'false'),
@@ -84,12 +84,12 @@ class Format(Enum):
                   ('bool', 'glc', 'false'),
                   ('bool', 'dlc', 'false'),
                   ('bool', 'slc', 'false'),
                   ('bool', 'glc', 'false'),
                   ('bool', 'dlc', 'false'),
                   ('bool', 'slc', 'false'),
-                 ('bool', 'tfe', 'false'),
-                 ('bool', 'lds', 'false')]
+                 ('bool', 'tfe', 'false')]
        elif self == Format.MUBUF:
           return [('unsigned', 'offset', None),
                   ('bool', 'offen', None),
                   ('bool', 'idxen', 'false'),
        elif self == Format.MUBUF:
           return [('unsigned', 'offset', None),
                   ('bool', 'offen', None),
                   ('bool', 'idxen', 'false'),
+                 ('bool', 'addr64', 'false'),
                   ('bool', 'disable_wqm', 'false'),
                   ('bool', 'glc', 'false'),
                   ('bool', 'dlc', 'false'),
                   ('bool', 'disable_wqm', 'false'),
                   ('bool', 'glc', 'false'),
                   ('bool', 'dlc', 'false'),
@@ -129,9 +129,10 @@ class Format(Enum):
           return [('uint16_t', 'dpp_ctrl', None),
                   ('uint8_t', 'row_mask', '0xF'),
                   ('uint8_t', 'bank_mask', '0xF'),
           return [('uint16_t', 'dpp_ctrl', None),
                   ('uint8_t', 'row_mask', '0xF'),
                   ('uint8_t', 'bank_mask', '0xF'),
-                 ('bool', 'bound_ctrl', 'false')]
+                 ('bool', 'bound_ctrl', 'true')]
        elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]:
           return [('uint16_t', 'offset', 0),
        elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]:
           return [('uint16_t', 'offset', 0),
+                 ('bool', 'can_reorder', 'true'),
                   ('bool', 'glc', 'false'),
                   ('bool', 'slc', 'false'),
                   ('bool', 'lds', 'false'),
                   ('bool', 'glc', 'false'),
                   ('bool', 'slc', 'false'),
                   ('bool', 'lds', 'false'),
@@ -148,12 +149,21 @@ class Format(Enum):
     def get_builder_field_decls(self):
        return [('%s %s=%s' % (f[0], f[1], f[2]) if f[2] != None else '%s %s' % (f[0], f[1])) for f in self.get_builder_fields()]
  
     def get_builder_field_decls(self):
        return [('%s %s=%s' % (f[0], f[1], f[2]) if f[2] != None else '%s %s' % (f[0], f[1])) for f in self.get_builder_fields()]
  
+   def get_builder_initialization(self, num_operands):
+      res = ''
+      if self == Format.SDWA:
+         for i in range(min(num_operands, 2)):
+            res += 'instr->sel[{0}] = op{0}.op.bytes() == 2 ? sdwa_uword : (op{0}.op.bytes() == 1 ? sdwa_ubyte : sdwa_udword);\n'.format(i)
+         res += 'instr->dst_sel = def0.bytes() == 2 ? sdwa_uword : (def0.bytes() == 1 ? sdwa_ubyte : sdwa_udword);\n'
+         res += 'instr->dst_preserve = true;'
+      return res
+
  
  class Opcode(object):
     """Class that represents all the information we have about the opcode
     NOTE: this must be kept in sync with aco_op_info
     """
  
  class Opcode(object):
     """Class that represents all the information we have about the opcode
     NOTE: this must be kept in sync with aco_op_info
     """
-   def __init__(self, name, opcode_gfx9, opcode_gfx10, format, input_mod, output_mod):
+   def __init__(self, name, opcode_gfx7, opcode_gfx9, opcode_gfx10, format, input_mod, output_mod, is_atomic):
        """Parameters:
  
        - name is the name of the opcode (prepend nir_op_ for the enum name)
        """Parameters:
  
        - name is the name of the opcode (prepend nir_op_ for the enum name)
@@ -165,6 +175,7 @@ class Opcode(object):
          constant value of the opcode given the constant values of its inputs.
        """
        assert isinstance(name, str)
          constant value of the opcode given the constant values of its inputs.
        """
        assert isinstance(name, str)
+      assert isinstance(opcode_gfx7, int)
        assert isinstance(opcode_gfx9, int)
        assert isinstance(opcode_gfx10, int)
        assert isinstance(format, Format)
        assert isinstance(opcode_gfx9, int)
        assert isinstance(opcode_gfx10, int)
        assert isinstance(format, Format)
@@ -172,24 +183,60 @@ class Opcode(object):
        assert isinstance(output_mod, bool)
  
        self.name = name
        assert isinstance(output_mod, bool)
  
        self.name = name
+      self.opcode_gfx7 = opcode_gfx7
        self.opcode_gfx9 = opcode_gfx9
        self.opcode_gfx10 = opcode_gfx10
        self.input_mod = "1" if input_mod else "0"
        self.output_mod = "1" if output_mod else "0"
        self.opcode_gfx9 = opcode_gfx9
        self.opcode_gfx10 = opcode_gfx10
        self.input_mod = "1" if input_mod else "0"
        self.output_mod = "1" if output_mod else "0"
+      self.is_atomic = "1" if is_atomic else "0"
        self.format = format
  
        self.format = format
  
+      parts = name.replace('_e64', '').rsplit('_', 2)
+      op_dtype = parts[-1]
+      def_dtype = parts[-2] if len(parts) > 1 else parts[-1]
+
+      def_dtype_sizes = {'{}{}'.format(prefix, size) : size for prefix in 'biuf' for size in [64, 32, 24, 16]}
+      op_dtype_sizes = {k:v for k, v in def_dtype_sizes.items()}
+      # inline constants are 32-bit for 16-bit integer/typeless instructions: https://reviews.llvm.org/D81841
+      op_dtype_sizes['b16'] = 32
+      op_dtype_sizes['i16'] = 32
+      op_dtype_sizes['u16'] = 32
+
+      self.operand_size = op_dtype_sizes.get(op_dtype, 0)
+      self.definition_size = def_dtype_sizes.get(def_dtype, self.operand_size)
+
+      # exceptions
+      if self.operand_size == 16 and op_dtype != 'f16':
+         self.operand_size = 16
+      elif self.operand_size == 24:
+        self.operand_size = 32
+      elif name in ['s_sext_i32_i8', 's_sext_i32_i16', 'v_msad_u8', 'v_cvt_pk_u16_u32', 'v_cvt_pk_i16_i32']:
+         self.operand_size = 32
+      elif name in ['v_qsad_pk_u16_u8', 'v_mqsad_pk_u16_u8', 'v_mqsad_u32_u8']:
+         self.definition_size = 0
+         self.operand_size = 0
+      elif name in ['v_mad_u64_u32', 'v_mad_i64_i32']:
+         self.operand_size = 0
+      elif '_pk_' in name or name in ['v_lerp_u8', 'v_sad_u8', 'v_sad_u16',
+                                      'v_cvt_f32_ubyte0', 'v_cvt_f32_ubyte1',
+                                      'v_cvt_f32_ubyte2', 'v_cvt_f32_ubyte3']:
+         self.operand_size = 32
+         self.definition_size = 32
+      elif '_pknorm_' in name:
+         self.definition_size = 32
+      elif format == Format.PSEUDO_REDUCTION:
+         # 64-bit reductions can have a larger definition size, but get_subdword_definition_info() handles that
+         self.definition_size = 32
+
  
  # global dictionary of opcodes
  opcodes = {}
  
  
  # global dictionary of opcodes
  opcodes = {}
  
-# VOPC to GFX6 opcode translation map
-VOPC_GFX6 = [0] * 256
-
-def opcode(name, opcode_gfx9 = -1, opcode_gfx10 = -1, format = Format.PSEUDO, input_mod = False, output_mod = False):
+def opcode(name, opcode_gfx7 = -1, opcode_gfx9 = -1, opcode_gfx10 = -1, format = Format.PSEUDO, input_mod = False, output_mod = False, is_atomic = False):
     assert name not in opcodes
     assert name not in opcodes
-   opcodes[name] = Opcode(name, opcode_gfx9, opcode_gfx10, format, input_mod, output_mod)
+   opcodes[name] = Opcode(name, opcode_gfx7, opcode_gfx9, opcode_gfx10, format, input_mod, output_mod, is_atomic)
  
  
-opcode("exp", 0, 0, format = Format.EXP)
+opcode("exp", 0, 0, 0, format = Format.EXP)
  opcode("p_parallelcopy")
  opcode("p_startpgm")
  opcode("p_phi")
  opcode("p_parallelcopy")
  opcode("p_startpgm")
  opcode("p_phi")
@@ -217,11 +264,13 @@ opcode("p_cbranch", format=Format.PSEUDO_BRANCH)
  opcode("p_cbranch_z", format=Format.PSEUDO_BRANCH)
  opcode("p_cbranch_nz", format=Format.PSEUDO_BRANCH)
  
  opcode("p_cbranch_z", format=Format.PSEUDO_BRANCH)
  opcode("p_cbranch_nz", format=Format.PSEUDO_BRANCH)
  
-opcode("p_memory_barrier_all", format=Format.PSEUDO_BARRIER)
+opcode("p_memory_barrier_common", format=Format.PSEUDO_BARRIER) # atomic, buffer, image and shared
  opcode("p_memory_barrier_atomic", format=Format.PSEUDO_BARRIER)
  opcode("p_memory_barrier_buffer", format=Format.PSEUDO_BARRIER)
  opcode("p_memory_barrier_image", format=Format.PSEUDO_BARRIER)
  opcode("p_memory_barrier_shared", format=Format.PSEUDO_BARRIER)
  opcode("p_memory_barrier_atomic", format=Format.PSEUDO_BARRIER)
  opcode("p_memory_barrier_buffer", format=Format.PSEUDO_BARRIER)
  opcode("p_memory_barrier_image", format=Format.PSEUDO_BARRIER)
  opcode("p_memory_barrier_shared", format=Format.PSEUDO_BARRIER)
+opcode("p_memory_barrier_gs_data", format=Format.PSEUDO_BARRIER)
+opcode("p_memory_barrier_gs_sendmsg", format=Format.PSEUDO_BARRIER)
  
  opcode("p_spill")
  opcode("p_reload")
  
  opcode("p_spill")
  opcode("p_reload")
@@ -235,9 +284,12 @@ opcode("p_discard_if")
  opcode("p_load_helper")
  opcode("p_demote_to_helper")
  opcode("p_is_helper")
  opcode("p_load_helper")
  opcode("p_demote_to_helper")
  opcode("p_is_helper")
+opcode("p_exit_early_if")
  
  opcode("p_fs_buffer_store_smem", format=Format.SMEM)
  
  
  opcode("p_fs_buffer_store_smem", format=Format.SMEM)
  
+# simulates proper bpermute behavior when it's unsupported, e.g. GFX10 wave64
+opcode("p_bpermute")
  
  # SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc)
  SOP2 = {
  
  # SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc)
  SOP2 = {
@@ -297,7 +349,7 @@ SOP2 = {
     (  -1,   -1,   -1, 0x2d, 0x36, "s_mul_hi_i32"),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOP2:
     (  -1,   -1,   -1, 0x2d, 0x36, "s_mul_hi_i32"),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOP2:
-    opcode(name, gfx9, gfx10, Format.SOP2)
+    opcode(name, gfx7, gfx9, gfx10, Format.SOP2)
  
  
  # SOPK instructions: 0 input (+ imm), 1 output + optional scc
  
  
  # SOPK instructions: 0 input (+ imm), 1 output + optional scc
@@ -333,7 +385,7 @@ SOPK = {
     (  -1,   -1,   -1,   -1, 0x1c, "s_subvector_loop_end"),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOPK:
     (  -1,   -1,   -1,   -1, 0x1c, "s_subvector_loop_end"),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOPK:
-   opcode(name, gfx9, gfx10, Format.SOPK)
+   opcode(name, gfx7, gfx9, gfx10, Format.SOPK)
  
  
  # SOP1 instructions: 1 input, 1 output (+optional SCC)
  
  
  # SOP1 instructions: 1 input, 1 output (+optional SCC)
@@ -411,7 +463,7 @@ SOP1 = {
     (  -1,   -1,   -1,   -1,   -1, "p_constaddr"),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOP1:
     (  -1,   -1,   -1,   -1,   -1, "p_constaddr"),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOP1:
-   opcode(name, gfx9, gfx10, Format.SOP1)
+   opcode(name, gfx7, gfx9, gfx10, Format.SOP1)
  
  
  # SOPC instructions: 2 inputs and 0 outputs (+SCC)
  
  
  # SOPC instructions: 2 inputs and 0 outputs (+SCC)
@@ -439,7 +491,7 @@ SOPC = {
     (  -1,   -1, 0x13, 0x13, 0x13, "s_cmp_lg_u64"),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOPC:
     (  -1,   -1, 0x13, 0x13, 0x13, "s_cmp_lg_u64"),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOPC:
-   opcode(name, gfx9, gfx10, Format.SOPC)
+   opcode(name, gfx7, gfx9, gfx10, Format.SOPC)
  
  
  # SOPP instructions: 0 inputs (+optional scc/vcc), 0 outputs
  
  
  # SOPP instructions: 0 inputs (+optional scc/vcc), 0 outputs
@@ -486,7 +538,7 @@ SOPP = {
     (  -1,   -1,   -1,   -1, 0x26, "s_ttracedata_imm"),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOPP:
     (  -1,   -1,   -1,   -1, 0x26, "s_ttracedata_imm"),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOPP:
-   opcode(name, gfx9, gfx10, Format.SOPP)
+   opcode(name, gfx7, gfx9, gfx10, Format.SOPP)
  
  
  # SMEM instructions: sbase input (2 sgpr), potentially 2 offset inputs, 1 sdata input/output
  
  
  # SMEM instructions: sbase input (2 sgpr), potentially 2 offset inputs, 1 sdata input/output
@@ -580,14 +632,15 @@ SMEM = {
     (  -1,   -1,   -1, 0xac, 0xac, "s_atomic_dec_x2"),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SMEM:
     (  -1,   -1,   -1, 0xac, 0xac, "s_atomic_dec_x2"),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SMEM:
-   opcode(name, gfx9, gfx10, Format.SMEM)
+   opcode(name, gfx7, gfx9, gfx10, Format.SMEM, is_atomic = "atomic" in name)
  
  
  # VOP2 instructions: 2 inputs, 1 output (+ optional vcc)
  # TODO: this table is missing some GFX6_7 opcodes which were shifted to VOP3 in GFX8
  VOP2 = {
    # GFX6, GFX7, GFX8, GFX9, GFX10, name, input/output modifiers
  
  
  # VOP2 instructions: 2 inputs, 1 output (+ optional vcc)
  # TODO: this table is missing some GFX6_7 opcodes which were shifted to VOP3 in GFX8
  VOP2 = {
    # GFX6, GFX7, GFX8, GFX9, GFX10, name, input/output modifiers
-   (0x00, 0x00, 0x00, 0x00, 0x01, "v_cndmask_b32", False),
+   (0x01, 0x01,   -1,   -1,   -1, "v_readlane_b32", False),
+   (0x02, 0x02,   -1,   -1,   -1, "v_writelane_b32", False),
     (0x03, 0x03, 0x01, 0x01, 0x03, "v_add_f32", True),
     (0x04, 0x04, 0x02, 0x02, 0x04, "v_sub_f32", True),
     (0x05, 0x05, 0x03, 0x03, 0x05, "v_subrev_f32", True),
     (0x03, 0x03, 0x01, 0x01, 0x03, "v_add_f32", True),
     (0x04, 0x04, 0x02, 0x02, 0x04, "v_sub_f32", True),
     (0x05, 0x05, 0x03, 0x03, 0x05, "v_subrev_f32", True),
@@ -619,6 +672,7 @@ VOP2 = {
     (0x1f, 0x1f, 0x16, 0x16, 0x1f, "v_mac_f32", True),
     (0x20, 0x20, 0x17, 0x17, 0x20, "v_madmk_f32", False),
     (0x21, 0x21, 0x18, 0x18, 0x21, "v_madak_f32", False),
     (0x1f, 0x1f, 0x16, 0x16, 0x1f, "v_mac_f32", True),
     (0x20, 0x20, 0x17, 0x17, 0x20, "v_madmk_f32", False),
     (0x21, 0x21, 0x18, 0x18, 0x21, "v_madak_f32", False),
+   (0x24, 0x24,   -1,   -1,   -1, "v_mbcnt_hi_u32_b32", False),
     (0x25, 0x25, 0x19, 0x19,   -1, "v_add_co_u32", False), # VOP3B only in RDNA
     (0x26, 0x26, 0x1a, 0x1a,   -1, "v_sub_co_u32", False), # VOP3B only in RDNA
     (0x27, 0x27, 0x1b, 0x1b,   -1, "v_subrev_co_u32", False), # VOP3B only in RDNA
     (0x25, 0x25, 0x19, 0x19,   -1, "v_add_co_u32", False), # VOP3B only in RDNA
     (0x26, 0x26, 0x1a, 0x1a,   -1, "v_sub_co_u32", False), # VOP3B only in RDNA
     (0x27, 0x27, 0x1b, 0x1b,   -1, "v_subrev_co_u32", False), # VOP3B only in RDNA
@@ -641,7 +695,7 @@ VOP2 = {
     (  -1,   -1, 0x29, 0x29,   -1, "v_mul_lo_u16", False),
     (  -1,   -1, 0x2a, 0x2a,   -1, "v_lshlrev_b16", False),
     (  -1,   -1, 0x2b, 0x2b,   -1, "v_lshrrev_b16", False),
     (  -1,   -1, 0x29, 0x29,   -1, "v_mul_lo_u16", False),
     (  -1,   -1, 0x2a, 0x2a,   -1, "v_lshlrev_b16", False),
     (  -1,   -1, 0x2b, 0x2b,   -1, "v_lshrrev_b16", False),
-   (  -1,   -1, 0x2c, 0x2c,   -1, "v_ashrrev_b16", False),
+   (  -1,   -1, 0x2c, 0x2c,   -1, "v_ashrrev_i16", False),
     (  -1,   -1, 0x2d, 0x2d, 0x39, "v_max_f16", True),
     (  -1,   -1, 0x2e, 0x2e, 0x3a, "v_min_f16", True),
     (  -1,   -1, 0x2f, 0x2f,   -1, "v_max_u16", False),
     (  -1,   -1, 0x2d, 0x2d, 0x39, "v_max_f16", True),
     (  -1,   -1, 0x2e, 0x2e, 0x3a, "v_min_f16", True),
     (  -1,   -1, 0x2f, 0x2f,   -1, "v_max_u16", False),
@@ -658,7 +712,12 @@ VOP2 = {
     (  -1,   -1,   -1,   -1, 0x3c, "v_pk_fmac_f16", False),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name, modifiers) in VOP2:
     (  -1,   -1,   -1,   -1, 0x3c, "v_pk_fmac_f16", False),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name, modifiers) in VOP2:
-   opcode(name, gfx9, gfx10, Format.VOP2, modifiers, modifiers)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOP2, modifiers, modifiers)
+
+if True:
+    # v_cndmask_b32 can use input modifiers but not output modifiers
+    (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x00, 0x00, 0x00, 0x00, 0x01, "v_cndmask_b32")
+    opcode(name, gfx7, gfx9, gfx10, Format.VOP2, True, False)
  
  
  # VOP1 instructions: instructions with 1 input and 1 output
  
  
  # VOP1 instructions: instructions with 1 input and 1 output
@@ -675,6 +734,7 @@ VOP1 = {
     (0x08, 0x08, 0x08, 0x08, 0x08, "v_cvt_i32_f32", True, False),
     (0x09, 0x09,   -1,   -1, 0x09, "v_mov_fed_b32", True, False), # LLVM mentions it for GFX8_9
     (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "v_cvt_f16_f32", True, True),
     (0x08, 0x08, 0x08, 0x08, 0x08, "v_cvt_i32_f32", True, False),
     (0x09, 0x09,   -1,   -1, 0x09, "v_mov_fed_b32", True, False), # LLVM mentions it for GFX8_9
     (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "v_cvt_f16_f32", True, True),
+   (  -1,   -1,   -1,   -1,   -1, "p_cvt_f16_f32_rtne", True, True),
     (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "v_cvt_f32_f16", True, True),
     (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "v_cvt_rpi_i32_f32", True, False),
     (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, "v_cvt_flr_i32_f32", True, False),
     (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "v_cvt_f32_f16", True, True),
     (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "v_cvt_rpi_i32_f32", True, False),
     (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, "v_cvt_flr_i32_f32", True, False),
@@ -758,7 +818,7 @@ VOP1 = {
     (  -1,   -1,   -1,   -1, 0x68, "v_swaprel_b32", False, False),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name, in_mod, out_mod) in VOP1:
     (  -1,   -1,   -1,   -1, 0x68, "v_swaprel_b32", False, False),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name, in_mod, out_mod) in VOP1:
-   opcode(name, gfx9, gfx10, Format.VOP1, in_mod, out_mod)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOP1, in_mod, out_mod)
  
  
  # VOPC instructions:
  
  
  # VOPC instructions:
@@ -772,29 +832,29 @@ VOPC_CLASS = {
     (0xb8, 0xb8, 0x13, 0x13, 0xb8, "v_cmpx_class_f64"),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in VOPC_CLASS:
     (0xb8, 0xb8, 0x13, 0x13, 0xb8, "v_cmpx_class_f64"),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in VOPC_CLASS:
-    opcode(name, gfx9, gfx10, Format.VOPC, True, False)
+    opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False)
  
  COMPF = ["f", "lt", "eq", "le", "gt", "lg", "ge", "o", "u", "nge", "nlg", "ngt", "nle", "neq", "nlt", "tru"]
  
  for i in range(8):
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x20+i, 0x20+i, 0xc8+i, "v_cmp_"+COMPF[i]+"_f16")
  
  COMPF = ["f", "lt", "eq", "le", "gt", "lg", "ge", "o", "u", "nge", "nlg", "ngt", "nle", "neq", "nlt", "tru"]
  
  for i in range(8):
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x20+i, 0x20+i, 0xc8+i, "v_cmp_"+COMPF[i]+"_f16")
-   opcode(name, gfx9, gfx10, Format.VOPC, True, False)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False)
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x30+i, 0x30+i, 0xd8+i, "v_cmpx_"+COMPF[i]+"_f16")
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x30+i, 0x30+i, 0xd8+i, "v_cmpx_"+COMPF[i]+"_f16")
-   opcode(name, gfx9, gfx10, Format.VOPC, True, False)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False)
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x28+i, 0x28+i, 0xe8+i, "v_cmp_"+COMPF[i+8]+"_f16")
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x28+i, 0x28+i, 0xe8+i, "v_cmp_"+COMPF[i+8]+"_f16")
-   opcode(name, gfx9, gfx10, Format.VOPC, True, False)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False)
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x38+i, 0x38+i, 0xf8+i, "v_cmpx_"+COMPF[i+8]+"_f16")
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0x38+i, 0x38+i, 0xf8+i, "v_cmpx_"+COMPF[i+8]+"_f16")
-   opcode(name, gfx9, gfx10, Format.VOPC, True, False)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False)
  
  for i in range(16):
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x00+i, 0x00+i, 0x40+i, 0x40+i, 0x00+i, "v_cmp_"+COMPF[i]+"_f32")
  
  for i in range(16):
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x00+i, 0x00+i, 0x40+i, 0x40+i, 0x00+i, "v_cmp_"+COMPF[i]+"_f32")
-   opcode(name, gfx9, gfx10, Format.VOPC, True, False)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False)
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x10+i, 0x10+i, 0x50+i, 0x50+i, 0x10+i, "v_cmpx_"+COMPF[i]+"_f32")
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x10+i, 0x10+i, 0x50+i, 0x50+i, 0x10+i, "v_cmpx_"+COMPF[i]+"_f32")
-   opcode(name, gfx9, gfx10, Format.VOPC, True, False)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False)
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x20+i, 0x20+i, 0x60+i, 0x60+i, 0x20+i, "v_cmp_"+COMPF[i]+"_f64")
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x20+i, 0x20+i, 0x60+i, 0x60+i, 0x20+i, "v_cmp_"+COMPF[i]+"_f64")
-   opcode(name, gfx9, gfx10, Format.VOPC, True, False)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False)
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x30+i, 0x30+i, 0x70+i, 0x70+i, 0x30+i, "v_cmpx_"+COMPF[i]+"_f64")
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x30+i, 0x30+i, 0x70+i, 0x70+i, 0x30+i, "v_cmpx_"+COMPF[i]+"_f64")
-   opcode(name, gfx9, gfx10, Format.VOPC, True, False)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False)
     # GFX_6_7
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x40+i, 0x40+i, -1, -1, -1, "v_cmps_"+COMPF[i]+"_f32")
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x50+i, 0x50+i, -1, -1, -1, "v_cmpsx_"+COMPF[i]+"_f32")
     # GFX_6_7
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x40+i, 0x40+i, -1, -1, -1, "v_cmps_"+COMPF[i]+"_f32")
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x50+i, 0x50+i, -1, -1, -1, "v_cmpsx_"+COMPF[i]+"_f32")
@@ -806,41 +866,41 @@ COMPI = ["f", "lt", "eq", "le", "gt", "lg", "ge", "tru"]
  # GFX_8_9
  for i in [0,7]: # only 0 and 7
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa0+i, 0xa0+i, -1, "v_cmp_"+COMPI[i]+"_i16")
  # GFX_8_9
  for i in [0,7]: # only 0 and 7
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa0+i, 0xa0+i, -1, "v_cmp_"+COMPI[i]+"_i16")
-   opcode(name, gfx9, gfx10, Format.VOPC)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC)
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb0+i, 0xb0+i, -1, "v_cmpx_"+COMPI[i]+"_i16")
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb0+i, 0xb0+i, -1, "v_cmpx_"+COMPI[i]+"_i16")
-   opcode(name, gfx9, gfx10, Format.VOPC)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC)
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa8+i, 0xa8+i, -1, "v_cmp_"+COMPI[i]+"_u16")
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa8+i, 0xa8+i, -1, "v_cmp_"+COMPI[i]+"_u16")
-   opcode(name, gfx9, gfx10, Format.VOPC)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC)
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb8+i, 0xb8+i, -1, "v_cmpx_"+COMPI[i]+"_u16")
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb8+i, 0xb8+i, -1, "v_cmpx_"+COMPI[i]+"_u16")
-   opcode(name, gfx9, gfx10, Format.VOPC)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC)
  
  for i in range(1, 7): # [1..6]
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa0+i, 0xa0+i, 0x88+i, "v_cmp_"+COMPI[i]+"_i16")
  
  for i in range(1, 7): # [1..6]
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa0+i, 0xa0+i, 0x88+i, "v_cmp_"+COMPI[i]+"_i16")
-   opcode(name, gfx9, gfx10, Format.VOPC)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC)
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb0+i, 0xb0+i, 0x98+i, "v_cmpx_"+COMPI[i]+"_i16")
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb0+i, 0xb0+i, 0x98+i, "v_cmpx_"+COMPI[i]+"_i16")
-   opcode(name, gfx9, gfx10, Format.VOPC)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC)
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa8+i, 0xa8+i, 0xa8+i, "v_cmp_"+COMPI[i]+"_u16")
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xa8+i, 0xa8+i, 0xa8+i, "v_cmp_"+COMPI[i]+"_u16")
-   opcode(name, gfx9, gfx10, Format.VOPC)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC)
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb8+i, 0xb8+i, 0xb8+i, "v_cmpx_"+COMPI[i]+"_u16")
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, 0xb8+i, 0xb8+i, 0xb8+i, "v_cmpx_"+COMPI[i]+"_u16")
-   opcode(name, gfx9, gfx10, Format.VOPC)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC)
  
  for i in range(8):
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x80+i, 0x80+i, 0xc0+i, 0xc0+i, 0x80+i, "v_cmp_"+COMPI[i]+"_i32")
  
  for i in range(8):
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x80+i, 0x80+i, 0xc0+i, 0xc0+i, 0x80+i, "v_cmp_"+COMPI[i]+"_i32")
-   opcode(name, gfx9, gfx10, Format.VOPC)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC)
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x90+i, 0x90+i, 0xd0+i, 0xd0+i, 0x90+i, "v_cmpx_"+COMPI[i]+"_i32")
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0x90+i, 0x90+i, 0xd0+i, 0xd0+i, 0x90+i, "v_cmpx_"+COMPI[i]+"_i32")
-   opcode(name, gfx9, gfx10, Format.VOPC)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC)
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xa0+i, 0xa0+i, 0xe0+i, 0xe0+i, 0xa0+i, "v_cmp_"+COMPI[i]+"_i64")
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xa0+i, 0xa0+i, 0xe0+i, 0xe0+i, 0xa0+i, "v_cmp_"+COMPI[i]+"_i64")
-   opcode(name, gfx9, gfx10, Format.VOPC)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC)
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xb0+i, 0xb0+i, 0xf0+i, 0xf0+i, 0xb0+i, "v_cmpx_"+COMPI[i]+"_i64")
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xb0+i, 0xb0+i, 0xf0+i, 0xf0+i, 0xb0+i, "v_cmpx_"+COMPI[i]+"_i64")
-   opcode(name, gfx9, gfx10, Format.VOPC)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC)
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xc0+i, 0xc0+i, 0xc8+i, 0xc8+i, 0xc0+i, "v_cmp_"+COMPI[i]+"_u32")
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xc0+i, 0xc0+i, 0xc8+i, 0xc8+i, 0xc0+i, "v_cmp_"+COMPI[i]+"_u32")
-   opcode(name, gfx9, gfx10, Format.VOPC)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC)
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xd0+i, 0xd0+i, 0xd8+i, 0xd8+i, 0xd0+i, "v_cmpx_"+COMPI[i]+"_u32")
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xd0+i, 0xd0+i, 0xd8+i, 0xd8+i, 0xd0+i, "v_cmpx_"+COMPI[i]+"_u32")
-   opcode(name, gfx9, gfx10, Format.VOPC)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC)
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xe0+i, 0xe0+i, 0xe8+i, 0xe8+i, 0xe0+i, "v_cmp_"+COMPI[i]+"_u64")
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xe0+i, 0xe0+i, 0xe8+i, 0xe8+i, 0xe0+i, "v_cmp_"+COMPI[i]+"_u64")
-   opcode(name, gfx9, gfx10, Format.VOPC)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC)
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xf0+i, 0xf0+i, 0xf8+i, 0xf8+i, 0xf0+i, "v_cmpx_"+COMPI[i]+"_u64")
     (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (0xf0+i, 0xf0+i, 0xf8+i, 0xf8+i, 0xf0+i, "v_cmpx_"+COMPI[i]+"_u64")
-   opcode(name, gfx9, gfx10, Format.VOPC)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOPC)
  
  
  # VOPP instructions: packed 16bit instructions - 1 or 2 inputs and 1 output
  
  
  # VOPP instructions: packed 16bit instructions - 1 or 2 inputs and 1 output
@@ -871,7 +931,7 @@ VOPP = {
  # note that these are only supported on gfx9+ so we'll need to distinguish between gfx8 and gfx9 here
  # (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, -1, code, code, name)
  for (code, name) in VOPP:
  # note that these are only supported on gfx9+ so we'll need to distinguish between gfx8 and gfx9 here
  # (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, -1, code, code, name)
  for (code, name) in VOPP:
-   opcode(name, code, code, Format.VOP3P)
+   opcode(name, -1, code, code, Format.VOP3P)
  
  
  # VINTERP instructions: 
  
  
  # VINTERP instructions: 
@@ -882,7 +942,7 @@ VINTRP = {
  }
  # (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name)
  for (code, name) in VINTRP:
  }
  # (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name)
  for (code, name) in VINTRP:
-   opcode(name, code, code, Format.VINTRP)
+   opcode(name, code, code, code, Format.VINTRP)
  
  # VOP3 instructions: 3 inputs, 1 output
  # VOP3b instructions: have a unique scalar output, e.g. VOP2 with vcc out
  
  # VOP3 instructions: 3 inputs, 1 output
  # VOP3b instructions: have a unique scalar output, e.g. VOP2 with vcc out
@@ -980,11 +1040,11 @@ VOP3 = {
     (   -1,    -1, 0x276, 0x276,    -1, "v_interp_p2_legacy_f16", True, True),
     (   -1,    -1,    -1, 0x277, 0x35a, "v_interp_p2_f16", True, True),
     (0x12b, 0x12b, 0x288, 0x288, 0x362, "v_ldexp_f32", False, True),
     (   -1,    -1, 0x276, 0x276,    -1, "v_interp_p2_legacy_f16", True, True),
     (   -1,    -1,    -1, 0x277, 0x35a, "v_interp_p2_f16", True, True),
     (0x12b, 0x12b, 0x288, 0x288, 0x362, "v_ldexp_f32", False, True),
-   (0x101, 0x101, 0x289, 0x289, 0x360, "v_readlane_b32", False, False),
-   (0x102, 0x102, 0x28a, 0x28a, 0x361, "v_writelane_b32", False, False),
+   (   -1,    -1, 0x289, 0x289, 0x360, "v_readlane_b32_e64", False, False),
+   (   -1,    -1, 0x28a, 0x28a, 0x361, "v_writelane_b32_e64", False, False),
     (0x122, 0x122, 0x28b, 0x28b, 0x364, "v_bcnt_u32_b32", False, False),
     (0x123, 0x123, 0x28c, 0x28c, 0x365, "v_mbcnt_lo_u32_b32", False, False),
     (0x122, 0x122, 0x28b, 0x28b, 0x364, "v_bcnt_u32_b32", False, False),
     (0x123, 0x123, 0x28c, 0x28c, 0x365, "v_mbcnt_lo_u32_b32", False, False),
-   (0x124, 0x124, 0x28d, 0x28d, 0x366, "v_mbcnt_hi_u32_b32", False, False),
+   (   -1,    -1, 0x28d, 0x28d, 0x366, "v_mbcnt_hi_u32_b32_e64", False, False),
     (   -1,    -1, 0x28f, 0x28f, 0x2ff, "v_lshlrev_b64", False, False),
     (   -1,    -1, 0x290, 0x290, 0x300, "v_lshrrev_b64", False, False),
     (   -1,    -1, 0x291, 0x291, 0x301, "v_ashrrev_i64", False, False),
     (   -1,    -1, 0x28f, 0x28f, 0x2ff, "v_lshlrev_b64", False, False),
     (   -1,    -1, 0x290, 0x290, 0x300, "v_lshrrev_b64", False, False),
     (   -1,    -1, 0x291, 0x291, 0x301, "v_ashrrev_i64", False, False),
@@ -1007,10 +1067,19 @@ VOP3 = {
     (   -1,    -1,    -1,    -1, 0x30f, "v_add_co_u32_e64", False, False),
     (   -1,    -1,    -1,    -1, 0x310, "v_sub_co_u32_e64", False, False),
     (   -1,    -1,    -1,    -1, 0x319, "v_subrev_co_u32_e64", False, False),
     (   -1,    -1,    -1,    -1, 0x30f, "v_add_co_u32_e64", False, False),
     (   -1,    -1,    -1,    -1, 0x310, "v_sub_co_u32_e64", False, False),
     (   -1,    -1,    -1,    -1, 0x319, "v_subrev_co_u32_e64", False, False),
-# TODO: many 16bit instructions moved from VOP2 to VOP3 on GFX10
+   (   -1,    -1,    -1,    -1, 0x303, "v_add_u16_e64", False, False),
+   (   -1,    -1,    -1,    -1, 0x304, "v_sub_u16_e64", False, False),
+   (   -1,    -1,    -1,    -1, 0x305, "v_mul_lo_u16_e64", False, False),
+   (   -1,    -1,    -1,    -1, 0x309, "v_max_u16_e64", False, False),
+   (   -1,    -1,    -1,    -1, 0x30a, "v_max_i16_e64", False, False),
+   (   -1,    -1,    -1,    -1, 0x30b, "v_min_u16_e64", False, False),
+   (   -1,    -1,    -1,    -1, 0x30c, "v_min_i16_e64", False, False),
+   (   -1,    -1,    -1,    -1, 0x307, "v_lshrrev_b16_e64", False, False),
+   (   -1,    -1,    -1,    -1, 0x308, "v_ashrrev_i16_e64", False, False),
+   (   -1,    -1,    -1,    -1, 0x314, "v_lshlrev_b16_e64", False, False),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name, in_mod, out_mod) in VOP3:
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name, in_mod, out_mod) in VOP3:
-   opcode(name, gfx9, gfx10, Format.VOP3A, in_mod, out_mod)
+   opcode(name, gfx7, gfx9, gfx10, Format.VOP3A, in_mod, out_mod)
  
  
  # DS instructions: 3 inputs (1 addr, 2 data), 1 output
  
  
  # DS instructions: 3 inputs (1 addr, 2 data), 1 output
@@ -1172,7 +1241,7 @@ DS = {
     (  -1, 0xff, 0xff, 0xff, 0xff, "ds_read_b128"),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in DS:
     (  -1, 0xff, 0xff, 0xff, 0xff, "ds_read_b128"),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in DS:
-    opcode(name, gfx9, gfx10, Format.DS)
+    opcode(name, gfx7, gfx9, gfx10, Format.DS)
  
  # MUBUF instructions:
  MUBUF = {
  
  # MUBUF instructions:
  MUBUF = {
@@ -1257,7 +1326,7 @@ MUBUF = {
     (  -1,   -1,   -1,   -1, 0x72, "buffer_gl1_inv"),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in MUBUF:
     (  -1,   -1,   -1,   -1, 0x72, "buffer_gl1_inv"),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in MUBUF:
-    opcode(name, gfx9, gfx10, Format.MUBUF)
+    opcode(name, gfx7, gfx9, gfx10, Format.MUBUF, is_atomic = "atomic" in name)
  
  MTBUF = {
     (0x00, 0x00, 0x00, 0x00, 0x00, "tbuffer_load_format_x"),
  
  MTBUF = {
     (0x00, 0x00, 0x00, 0x00, 0x00, "tbuffer_load_format_x"),
@@ -1278,7 +1347,7 @@ MTBUF = {
     (  -1,   -1, 0x0f, 0x0f, 0x0f, "tbuffer_store_format_d16_xyzw"),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in MTBUF:
     (  -1,   -1, 0x0f, 0x0f, 0x0f, "tbuffer_store_format_d16_xyzw"),
  }
  for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in MTBUF:
-    opcode(name, gfx9, gfx10, Format.MTBUF)
+    opcode(name, gfx7, gfx9, gfx10, Format.MTBUF)
  
  
  IMAGE = {
  
  
  IMAGE = {
@@ -1297,7 +1366,7 @@ IMAGE = {
  }
  # (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name)
  for (code, name) in IMAGE:
  }
  # (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name)
  for (code, name) in IMAGE:
-   opcode(name, code, code, Format.MIMG)
+   opcode(name, code, code, code, Format.MIMG)
  
  IMAGE_ATOMIC = {
     (0x0f, 0x0f, 0x10, "image_atomic_swap"),
  
  IMAGE_ATOMIC = {
     (0x0f, 0x0f, 0x10, "image_atomic_swap"),
@@ -1321,7 +1390,7 @@ IMAGE_ATOMIC = {
  # (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (gfx6, gfx7, gfx89, gfx89, ???, name)
  # gfx7 and gfx10 opcodes are the same here
  for (gfx6, gfx7, gfx89, name) in IMAGE_ATOMIC:
  # (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (gfx6, gfx7, gfx89, gfx89, ???, name)
  # gfx7 and gfx10 opcodes are the same here
  for (gfx6, gfx7, gfx89, name) in IMAGE_ATOMIC:
-   opcode(name, gfx89, gfx7, Format.MIMG)
+   opcode(name, gfx7, gfx89, gfx7, Format.MIMG, is_atomic = True)
  
  IMAGE_SAMPLE = {
     (0x20, "image_sample"),
  
  IMAGE_SAMPLE = {
     (0x20, "image_sample"),
@@ -1367,7 +1436,7 @@ IMAGE_SAMPLE = {
  }
  # (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name)
  for (code, name) in IMAGE_SAMPLE:
  }
  # (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name)
  for (code, name) in IMAGE_SAMPLE:
-   opcode(name, code, code, Format.MIMG)
+   opcode(name, code, code, code, Format.MIMG)
  
  IMAGE_GATHER4 = {
     (0x40, "image_gather4"),
  
  IMAGE_GATHER4 = {
     (0x40, "image_gather4"),
@@ -1400,7 +1469,7 @@ IMAGE_GATHER4 = {
  }
  # (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name)
  for (code, name) in IMAGE_GATHER4:
  }
  # (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name)
  for (code, name) in IMAGE_GATHER4:
-   opcode(name, code, code, Format.MIMG)
+   opcode(name, code, code, code, Format.MIMG)
  
  
  FLAT = {
  
  
  FLAT = {
@@ -1447,9 +1516,9 @@ FLAT = {
     (0x51, 0x61, 0x51, "flat_atomic_cmpswap_x2"),
     (0x52, 0x62, 0x52, "flat_atomic_add_x2"),
     (0x53, 0x63, 0x53, "flat_atomic_sub_x2"),
     (0x51, 0x61, 0x51, "flat_atomic_cmpswap_x2"),
     (0x52, 0x62, 0x52, "flat_atomic_add_x2"),
     (0x53, 0x63, 0x53, "flat_atomic_sub_x2"),
-   (0x55, 0x64, 0x54, "flat_atomic_smin_x2"),
-   (0x56, 0x65, 0x55, "flat_atomic_umin_x2"),
-   (0x57, 0x66, 0x56, "flat_atomic_smax_x2"),
+   (0x55, 0x64, 0x55, "flat_atomic_smin_x2"),
+   (0x56, 0x65, 0x56, "flat_atomic_umin_x2"),
+   (0x57, 0x66, 0x57, "flat_atomic_smax_x2"),
     (0x58, 0x67, 0x58, "flat_atomic_umax_x2"),
     (0x59, 0x68, 0x59, "flat_atomic_and_x2"),
     (0x5a, 0x69, 0x5a, "flat_atomic_or_x2"),
     (0x58, 0x67, 0x58, "flat_atomic_umax_x2"),
     (0x59, 0x68, 0x59, "flat_atomic_and_x2"),
     (0x5a, 0x69, 0x5a, "flat_atomic_or_x2"),
@@ -1461,7 +1530,7 @@ FLAT = {
     (0x60,   -1, 0x60, "flat_atomic_fmax_x2"),
  }
  for (gfx7, gfx8, gfx10, name) in FLAT:
     (0x60,   -1, 0x60, "flat_atomic_fmax_x2"),
  }
  for (gfx7, gfx8, gfx10, name) in FLAT:
-    opcode(name, gfx8, gfx10, Format.FLAT)
+    opcode(name, gfx7, gfx8, gfx10, Format.FLAT, is_atomic = "atomic" in name)
  
  GLOBAL = {
     #GFX8_9, GFX10
  
  GLOBAL = {
     #GFX8_9, GFX10
@@ -1507,9 +1576,9 @@ GLOBAL = {
     (0x61, 0x51, "global_atomic_cmpswap_x2"),
     (0x62, 0x52, "global_atomic_add_x2"),
     (0x63, 0x53, "global_atomic_sub_x2"),
     (0x61, 0x51, "global_atomic_cmpswap_x2"),
     (0x62, 0x52, "global_atomic_add_x2"),
     (0x63, 0x53, "global_atomic_sub_x2"),
-   (0x64, 0x54, "global_atomic_smin_x2"),
-   (0x65, 0x55, "global_atomic_umin_x2"),
-   (0x66, 0x56, "global_atomic_smax_x2"),
+   (0x64, 0x55, "global_atomic_smin_x2"),
+   (0x65, 0x56, "global_atomic_umin_x2"),
+   (0x66, 0x57, "global_atomic_smax_x2"),
     (0x67, 0x58, "global_atomic_umax_x2"),
     (0x68, 0x59, "global_atomic_and_x2"),
     (0x69, 0x5a, "global_atomic_or_x2"),
     (0x67, 0x58, "global_atomic_umax_x2"),
     (0x68, 0x59, "global_atomic_and_x2"),
     (0x69, 0x5a, "global_atomic_or_x2"),
@@ -1521,7 +1590,7 @@ GLOBAL = {
     (  -1, 0x60, "global_atomic_fmax_x2"),
  }
  for (gfx8, gfx10, name) in GLOBAL:
     (  -1, 0x60, "global_atomic_fmax_x2"),
  }
  for (gfx8, gfx10, name) in GLOBAL:
-    opcode(name, gfx8, gfx10, Format.GLOBAL)
+    opcode(name, -1, gfx8, gfx10, Format.GLOBAL, is_atomic = "atomic" in name)
  
  SCRATCH = {
     #GFX8_9, GFX10
  
  SCRATCH = {
     #GFX8_9, GFX10
@@ -1549,4 +1618,28 @@ SCRATCH = {
     (0x25, 0x25, "scratch_load_short_d16_hi"),
  }
  for (gfx8, gfx10, name) in SCRATCH:
     (0x25, 0x25, "scratch_load_short_d16_hi"),
  }
  for (gfx8, gfx10, name) in SCRATCH:
-    opcode(name, gfx8, gfx10, Format.SCRATCH)
+    opcode(name, -1, gfx8, gfx10, Format.SCRATCH)
+
+# check for duplicate opcode numbers
+#
+# For each hardware generation listed below, walk every entry of `opcodes`
+# and verify that no two opcodes with the same Format share an opcode number.
+# On the first clash that is not an allowed exception, a message naming both
+# opcodes is printed and the script terminates with exit status 1.
+for ver in ['gfx9', 'gfx10']:
+    # (format, opcode number) -> name of the first opcode seen with that pair;
+    # rebuilt from scratch for every generation
+    op_to_name = {}
+    for op in opcodes.values():
+        # opcodes in the pseudo formats are left out of the check
+        if op.format in [Format.PSEUDO, Format.PSEUDO_BRANCH, Format.PSEUDO_BARRIER, Format.PSEUDO_REDUCTION]:
+            continue
+
+        # the per-generation number is read from the attribute 'opcode_<ver>'
+        num = getattr(op, 'opcode_' + ver)
+        # NOTE(review): -1 presumably marks "no encoding on this generation",
+        # as in the opcode tables -- confirm against the opcode() helper
+        if num == -1:
+            continue
+
+        # numbers only have to be unique within one format
+        key = (op.format, num)
+
+        if key in op_to_name:
+            # exceptions
+            # v_mul_lo_i32 and v_mul_lo_u32 are allowed to share a number.
+            # NOTE(review): 'gfx8' is never produced by the outer loop above,
+            # so only the 'gfx9' case can match here.
+            names = set([op_to_name[key], op.name])
+            if ver in ['gfx8', 'gfx9'] and names == set(['v_mul_lo_i32', 'v_mul_lo_u32']):
+                continue
+
+            print('%s and %s share the same opcode number (%s)' % (op_to_name[key], op.name, ver))
+            sys.exit(1)
+        else:
+            # first opcode seen with this (format, number) pair
+            op_to_name[key] = op.name