aco: fix nir_op_f2f16_rtne with non-default rounding modes

[mesa.git] / src / amd / compiler / aco_opcodes.py
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py

index 1396b8c3af4e756000bfe65566c42b14a785bb5c..89e30d734f63870ff5d9848b778000d0e950133a 100644 (file)
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -129,7 +129,7 @@ class Format(Enum):
           return [('uint16_t', 'dpp_ctrl', None),
                   ('uint8_t', 'row_mask', '0xF'),
                   ('uint8_t', 'bank_mask', '0xF'),
-                 ('bool', 'bound_ctrl', 'false')]
+                 ('bool', 'bound_ctrl', 'true')]
        elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]:
           return [('uint16_t', 'offset', 0),
                   ('bool', 'can_reorder', 'true'),
@@ -194,12 +194,21 @@ class Opcode(object):
        parts = name.replace('_e64', '').rsplit('_', 2)
        op_dtype = parts[-1]
        def_dtype = parts[-2] if len(parts) > 1 else parts[-1]
-      dtype_sizes = {'{}{}'.format(prefix, size) : size for prefix in 'biuf' for size in [64, 32, 24, 16]}
-      self.operand_size = dtype_sizes.get(op_dtype, 0)
-      self.definition_size = dtype_sizes.get(def_dtype, self.operand_size)
+
+      def_dtype_sizes = {'{}{}'.format(prefix, size) : size for prefix in 'biuf' for size in [64, 32, 24, 16]}
+      op_dtype_sizes = {k:v for k, v in def_dtype_sizes.items()}
+      # inline constants are 32-bit for 16-bit integer/typeless instructions: https://reviews.llvm.org/D81841
+      op_dtype_sizes['b16'] = 32
+      op_dtype_sizes['i16'] = 32
+      op_dtype_sizes['u16'] = 32
+
+      self.operand_size = op_dtype_sizes.get(op_dtype, 0)
+      self.definition_size = def_dtype_sizes.get(def_dtype, self.operand_size)
  
        # exceptions
-      if self.operand_size == 24:
+      if self.operand_size == 16 and op_dtype != 'f16':
+         self.operand_size = 16
+      elif self.operand_size == 24:
          self.operand_size = 32
        elif name in ['s_sext_i32_i8', 's_sext_i32_i16', 'v_msad_u8', 'v_cvt_pk_u16_u32', 'v_cvt_pk_i16_i32']:
           self.operand_size = 32
@@ -208,9 +217,6 @@ class Opcode(object):
           self.operand_size = 0
        elif name in ['v_mad_u64_u32', 'v_mad_i64_i32']:
           self.operand_size = 0
-      elif name.replace('_e64', '') in ['v_lshrrev_b16', 'v_ashrrev_i16', 'v_lshlrev_b16']:
-         # v_lshlrev_b16 tested on GFX10 with 1/2 PI inline constant
-         self.operand_size = 32
        elif '_pk_' in name or name in ['v_lerp_u8', 'v_sad_u8', 'v_sad_u16',
                                        'v_cvt_f32_ubyte0', 'v_cvt_f32_ubyte1',
                                        'v_cvt_f32_ubyte2', 'v_cvt_f32_ubyte3']:
@@ -666,6 +672,7 @@ VOP2 = {
     (0x1f, 0x1f, 0x16, 0x16, 0x1f, "v_mac_f32", True),
     (0x20, 0x20, 0x17, 0x17, 0x20, "v_madmk_f32", False),
     (0x21, 0x21, 0x18, 0x18, 0x21, "v_madak_f32", False),
+   (0x24, 0x24,   -1,   -1,   -1, "v_mbcnt_hi_u32_b32", False),
     (0x25, 0x25, 0x19, 0x19,   -1, "v_add_co_u32", False), # VOP3B only in RDNA
     (0x26, 0x26, 0x1a, 0x1a,   -1, "v_sub_co_u32", False), # VOP3B only in RDNA
     (0x27, 0x27, 0x1b, 0x1b,   -1, "v_subrev_co_u32", False), # VOP3B only in RDNA
@@ -727,6 +734,7 @@ VOP1 = {
     (0x08, 0x08, 0x08, 0x08, 0x08, "v_cvt_i32_f32", True, False),
     (0x09, 0x09,   -1,   -1, 0x09, "v_mov_fed_b32", True, False), # LLVM mentions it for GFX8_9
     (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "v_cvt_f16_f32", True, True),
+   (  -1,   -1,   -1,   -1,   -1, "p_cvt_f16_f32_rtne", True, True),
     (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "v_cvt_f32_f16", True, True),
     (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "v_cvt_rpi_i32_f32", True, False),
     (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, "v_cvt_flr_i32_f32", True, False),
@@ -1036,7 +1044,7 @@ VOP3 = {
     (   -1,    -1, 0x28a, 0x28a, 0x361, "v_writelane_b32_e64", False, False),
     (0x122, 0x122, 0x28b, 0x28b, 0x364, "v_bcnt_u32_b32", False, False),
     (0x123, 0x123, 0x28c, 0x28c, 0x365, "v_mbcnt_lo_u32_b32", False, False),
-   (0x124, 0x124, 0x28d, 0x28d, 0x366, "v_mbcnt_hi_u32_b32", False, False),
+   (   -1,    -1, 0x28d, 0x28d, 0x366, "v_mbcnt_hi_u32_b32_e64", False, False),
     (   -1,    -1, 0x28f, 0x28f, 0x2ff, "v_lshlrev_b64", False, False),
     (   -1,    -1, 0x290, 0x290, 0x300, "v_lshrrev_b64", False, False),
     (   -1,    -1, 0x291, 0x291, 0x301, "v_ashrrev_i64", False, False),