nir: Use b2b opcodes for shared and constant memory

[mesa.git] / src / compiler / nir / nir_opt_algebraic.py
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py

index fd0007bb54d1bf8bad15cb9a6b2fee32136c7316..dbfd4917503321cfb441d2655127e4b564bd4459 100644 (file)
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -271,6 +271,25 @@ for log2 in range(1, 7): # powers of two from 2 to 64
         (('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)),
     ])
  
+# To save space in the state tables, reduce to the set that is known to help.
+# Previously, this was range(1, 32).  In addition, a couple rules inside the
+# loop are commented out.  Revisit someday, probably after mesa/#2635 has some
+# resolution.
+for i in [1, 2, 16, 24]:
+    lo_mask = 0xffffffff >> i
+    hi_mask = (0xffffffff << i) & 0xffffffff
+
+    optimizations.extend([
+        # This pattern seems to only help in the soft-fp64 code.
+        (('ishl@32', ('iand', 'a@32', lo_mask), i), ('ishl', a, i)),
+#        (('ushr@32', ('iand', 'a@32', hi_mask), i), ('ushr', a, i)),
+#        (('ishr@32', ('iand', 'a@32', hi_mask), i), ('ishr', a, i)),
+
+        (('iand', ('ishl', 'a@32', i), hi_mask), ('ishl', a, i)),
+        (('iand', ('ushr', 'a@32', i), lo_mask), ('ushr', a, i)),
+#        (('iand', ('ishr', 'a@32', i), lo_mask), ('ushr', a, i)), # Yes, ushr is correct
+    ])
+
  optimizations.extend([
     # This is common for address calculations.  Reassociating may enable the
     # 'a<<c' to be CSE'd.  It also helps architectures that have an ISHLADD
@@ -497,8 +516,21 @@ optimizations.extend([
     (('umin', ('umin', a, b), b), ('umin', a, b)),
     (('imin', ('imin', a, b), b), ('imin', a, b)),
     (('iand@32', a, ('inot', ('ishr', a, 31))), ('imax', a, 0)),
-   (('fmax', a, ('fneg', a)), ('fabs', a)),
-   (('imax', a, ('ineg', a)), ('iabs', a)),
+
+   # Simplify logic to detect sign of an integer.
+   (('ieq', ('iand', a, 0x80000000), 0x00000000), ('ige', a, 0)),
+   (('ine', ('iand', a, 0x80000000), 0x80000000), ('ige', a, 0)),
+   (('ine', ('iand', a, 0x80000000), 0x00000000), ('ilt', a, 0)),
+   (('ieq', ('iand', a, 0x80000000), 0x80000000), ('ilt', a, 0)),
+   (('ine', ('ushr', 'a@32', 31), 0), ('ilt', a, 0)),
+   (('ieq', ('ushr', 'a@32', 31), 0), ('ige', a, 0)),
+   (('ieq', ('ushr', 'a@32', 31), 1), ('ilt', a, 0)),
+   (('ine', ('ushr', 'a@32', 31), 1), ('ige', a, 0)),
+   (('ine', ('ishr', 'a@32', 31), 0), ('ilt', a, 0)),
+   (('ieq', ('ishr', 'a@32', 31), 0), ('ige', a, 0)),
+   (('ieq', ('ishr', 'a@32', 31), -1), ('ilt', a, 0)),
+   (('ine', ('ishr', 'a@32', 31), -1), ('ige', a, 0)),
+
     (('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))),
     (('imin', a, ('ineg', a)), ('ineg', ('iabs', a))),
     (('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))),
@@ -594,7 +626,19 @@ optimizations.extend([
     (('ine', ('ineg', ('b2i32', 'a@1')), ('ineg', ('b2i32', 'b@1'))), ('ine', a, b)),
     (('b2i32', ('ine', 'a@1', 'b@1')), ('b2i32', ('ixor', a, b))),
  
-   (('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('ior', 'a@32', 'b@32'), 0), '!options->lower_bitops'),
+   (('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('ior', a, b), 0), '!options->lower_bitops'),
+   (('ior',  ('ine', 'a@32', 0), ('ine', 'b@32', 0)), ('ine', ('ior', a, b), 0), '!options->lower_bitops'),
+
+   # This pattern occurs coutresy of __flt64_nonnan in the soft-fp64 code.
+   # The first part of the iand comes from the !__feq64_nonnan.
+   #
+   # The second pattern is a reformulation of the first based on the relation
+   # (a == 0 || y == 0) <=> umin(a, y) == 0, where b in the first equation
+   # happens to be y == 0.
+   (('iand', ('inot', ('iand', ('ior', ('ieq', a, 0),  b), c)), ('ilt', a, 0)),
+    ('iand', ('inot', ('iand',                         b , c)), ('ilt', a, 0))),
+   (('iand', ('inot', ('iand', ('ieq', ('umin', a, b), 0), c)), ('ilt', a, 0)),
+    ('iand', ('inot', ('iand', ('ieq',             b , 0), c)), ('ilt', a, 0))),
  
     # These patterns can result when (a < b || a < c) => (a < min(b, c))
     # transformations occur before constant propagation and loop-unrolling.
@@ -724,8 +768,6 @@ optimizations.extend([
     (('ishr', a, 0), a),
     (('ushr', 0, a), 0),
     (('ushr', a, 0), a),
-   (('iand', 0xff, ('ushr@32', a, 24)), ('ushr', a, 24)),
-   (('iand', 0xffff, ('ushr@32', a, 16)), ('ushr', a, 16)),
     (('ior', ('ishl@16', a, b), ('ushr@16', a, ('iadd', 16, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'),
     (('ior', ('ishl@16', a, b), ('ushr@16', a, ('isub', 16, b))), ('urol', a, b), '!options->lower_rotate'),
     (('ior', ('ishl@32', a, b), ('ushr@32', a, ('iadd', 32, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'),
@@ -745,6 +787,7 @@ optimizations.extend([
     (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b
     (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),
      ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) + d) = a^b * c^d
+   (('~fexp2', ('fmul', ('flog2', a), 0.5)), ('fsqrt', a)),
     (('~fexp2', ('fmul', ('flog2', a), 2.0)), ('fmul', a, a)),
     (('~fexp2', ('fmul', ('flog2', a), 4.0)), ('fmul', ('fmul', a, a), ('fmul', a, a))),
     (('~fpow', a, 1.0), a),
@@ -833,6 +876,13 @@ optimizations.extend([
     (('~f2u32', ('i2f', 'a@32')), a),
     (('~f2u32', ('u2f', 'a@32')), a),
  
+   # Conversions from float16 to float32 and back can always be removed
+   (('f2f16', ('f2f32', 'a@16')), a),
+   (('f2fmp', ('f2f32', 'a@16')), a),
+   # Conversions to float16 would be lossy so they should only be removed if
+   # the instruction was generated by the precision lowering pass.
+   (('f2f32', ('f2fmp', 'a@32')), a),
+
     (('ffloor', 'a(is_integral)'), a),
     (('fceil', 'a(is_integral)'), a),
     (('ftrunc', 'a(is_integral)'), a),
@@ -904,6 +954,7 @@ optimizations.extend([
     (('ilt', 'a(is_lt_zero)',      'b(is_not_negative)'), True),
  
     (('ult', 0, 'a(is_gt_zero)'), True),
+   (('ult', a, 0), False),
  
     # Packing and then unpacking does nothing
     (('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a),
@@ -935,6 +986,15 @@ optimizations.extend([
     (('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)),
     (('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)),
     (('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)),
+
+   # Optimize half packing
+   (('ishl', ('pack_half_2x16', ('vec2', a, 0)), 16), ('pack_half_2x16', ('vec2', 0, a))),
+   (('ishr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))),
+
+   (('iadd', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
+    ('pack_half_2x16', ('vec2', a, b))),
+   (('ior', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
+    ('pack_half_2x16', ('vec2', a, b))),
  ])
  
  # After the ('extract_u8', a, 0) pattern, above, triggers, there will be
@@ -989,6 +1049,9 @@ optimizations.extend([
     (('~fadd', '#a',          ('fadd', 'b(is_not_const)', '#c')),  ('fadd', ('fadd', a,          c),           b)),
     (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))),
     (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)),
+   (('iand', '#a', ('iand', 'b(is_not_const)', '#c')), ('iand', ('iand', a, c), b)),
+   (('ior',  '#a', ('ior',  'b(is_not_const)', '#c')), ('ior',  ('ior',  a, c), b)),
+   (('ixor', '#a', ('ixor', 'b(is_not_const)', '#c')), ('ixor', ('ixor', a, c), b)),
  
     # Drop mul-div by the same value when there's no wrapping.
     (('idiv', ('imul(no_signed_wrap)', a, b), b), a),
@@ -1005,8 +1068,7 @@ optimizations.extend([
     (('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)),
  
     # Misc. lowering
-   (('fmod@16', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
-   (('fmod@32', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
+   (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
     (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),
     (('uadd_carry@32', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),
     (('usub_borrow@32', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'),
@@ -1270,7 +1332,7 @@ for bit_size in [8, 16, 32, 64]:
                                  ('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_add_sat'),
     ]
  
-invert = OrderedDict([('feq', 'fne'), ('fne', 'feq'), ('fge', 'flt'), ('flt', 'fge')])
+invert = OrderedDict([('feq', 'fne'), ('fne', 'feq')])
  
  for left, right in itertools.combinations_with_replacement(invert.keys(), 2):
     optimizations.append((('inot', ('ior(is_used_once)', (left, a, b), (right, c, d))),
@@ -1297,7 +1359,7 @@ for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']):
     optimizations.append(((x2yN, (b2x, a)), (b2y, a)))
  
  # Optimize away x2xN(a@N)
-for t in ['int', 'uint', 'float']:
+for t in ['int', 'uint', 'float', 'bool']:
     for N in type_sizes(t):
        x2xN = '{0}2{0}{1}'.format(t[0], N)
        aN = 'a@{0}'.format(N)
@@ -1406,7 +1468,16 @@ for t in ['int', 'uint', 'float']:
              ]
  
  def fexp2i(exp, bits):
-   # We assume that exp is already in the right range.
+   # Generate an expression which constructs value 2.0^exp or 0.0.
+   #
+   # We assume that exp is already in a valid range:
+   #
+   #   * [-15, 15] for 16-bit float
+   #   * [-127, 127] for 32-bit float
+   #   * [-1023, 1023] for 16-bit float
+   #
+   # If exp is the lowest value in the valid range, a value of 0.0 is
+   # constructed.  Otherwise, the value 2.0^exp is constructed.
     if bits == 16:
        return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
     elif bits == 32:
@@ -1417,22 +1488,40 @@ def fexp2i(exp, bits):
        assert False
  
  def ldexp(f, exp, bits):
-   # First, we clamp exp to a reasonable range.  The maximum possible range
-   # for a normal exponent is [-126, 127] and, throwing in denormals, you get
-   # a maximum range of [-149, 127].  This means that we can potentially have
-   # a swing of +-276.  If you start with FLT_MAX, you actually have to do
-   # ldexp(FLT_MAX, -278) to get it to flush all the way to zero.  The GLSL
-   # spec, on the other hand, only requires that we handle an exponent value
-   # in the range [-126, 128].  This implementation is *mostly* correct; it
-   # handles a range on exp of [-252, 254] which allows you to create any
-   # value (including denorms if the hardware supports it) and to adjust the
-   # exponent of any normal value to anything you want.
+   # The maximum possible range for a normal exponent is [-126, 127] and,
+   # throwing in denormals, you get a maximum range of [-149, 127].  This
+   # means that we can potentially have a swing of +-276.  If you start with
+   # FLT_MAX, you actually have to do ldexp(FLT_MAX, -278) to get it to flush
+   # all the way to zero.  The GLSL spec only requires that we handle a subset
+   # of this range.  From version 4.60 of the spec:
+   #
+   #    "If exp is greater than +128 (single-precision) or +1024
+   #    (double-precision), the value returned is undefined. If exp is less
+   #    than -126 (single-precision) or -1022 (double-precision), the value
+   #    returned may be flushed to zero. Additionally, splitting the value
+   #    into a significand and exponent using frexp() and then reconstructing
+   #    a floating-point value using ldexp() should yield the original input
+   #    for zero and all finite non-denormalized values."
+   #
+   # The SPIR-V spec has similar language.
+   #
+   # In order to handle the maximum value +128 using the fexp2i() helper
+   # above, we have to split the exponent in half and do two multiply
+   # operations.
+   #
+   # First, we clamp exp to a reasonable range.  Specifically, we clamp to
+   # twice the full range that is valid for the fexp2i() function above.  If
+   # exp/2 is the bottom value of that range, the fexp2i() expression will
+   # yield 0.0f which, when multiplied by f, will flush it to zero which is
+   # allowed by the GLSL and SPIR-V specs for low exponent values.  If the
+   # value is clamped from above, then it must have been above the supported
+   # range of the GLSL built-in and therefore any return value is acceptable.
     if bits == 16:
-      exp = ('imin', ('imax', exp, -28), 30)
+      exp = ('imin', ('imax', exp, -30), 30)
     elif bits == 32:
-      exp = ('imin', ('imax', exp, -252), 254)
+      exp = ('imin', ('imax', exp, -254), 254)
     elif bits == 64:
-      exp = ('imin', ('imax', exp, -2044), 2046)
+      exp = ('imin', ('imax', exp, -2046), 2046)
     else:
        assert False
  
@@ -1650,6 +1739,9 @@ late_optimizations = [
     (('ior', a, a), a),
     (('iand', a, a), a),
  
+   (('iand', ('ine(is_used_once)', 'a@32', 0), ('ine', 'b@32', 0)), ('ine', ('umin', a, b), 0)),
+   (('ior',  ('ieq(is_used_once)', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('umin', a, b), 0)),
+
     (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),
  
     (('fdot2', a, b), ('fdot_replicated2', a, b), 'options->fdot_replicates'),
@@ -1730,6 +1822,11 @@ late_optimizations = [
      ('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
     (('~fadd', ('ffma(is_used_once)', a, b, ('fmul', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),
      ('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
+
+   # Convert f2fmp instructions to concrete f2f16 instructions. At this point
+   # any conversions that could have been removed will have been removed in
+   # nir_opt_algebraic so any remaining ones are required.
+   (('f2fmp', a), ('f2f16', a)),
  ]
  
  for op in ['fadd']: