(('~fneg', ('fneg', a)), a),
(('ineg', ('ineg', a)), a),
- (('fabs', ('fabs', a)), ('fabs', a)),
(('fabs', ('fneg', a)), ('fabs', a)),
(('fabs', ('u2f', a)), ('u2f', a)),
(('iabs', ('iabs', a)), ('iabs', a)),
(ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))),
])
+# Optimize a pattern of address calculation created by DXVK where the offset is
+# divided by 4 and then multiplied by 4. This can be turned into an iand, and
+# the preceding additions can be reassociated so that the iand can be CSE'd.
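+# For example, with a divisor of 4: '(a >> 2) << 2' becomes 'a & 0xfffffffc',
+# and '(a + #b) & 0xfffffffc' becomes '(a & 0xfffffffc) + #b' when the
+# constant #b is known to be a multiple of 4.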
+for log2 in range(1, 7): # powers of two from 2 to 64
+ v = 1 << log2
+ mask = 0xffffffff & ~(v - 1)
+ b_is_multiple = '#b(is_unsigned_multiple_of_{})'.format(v)
+
+ optimizations.extend([
+ # 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)'
+ (('ishl@32', ('ushr@32', a, log2), log2), ('iand', a, mask)),
+
+ # Reassociate for improved CSE
+ (('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)),
+ ])
+
+# To save space in the state tables, reduce to the set that is known to help.
+# Previously, this was range(1, 32). In addition, a couple of rules inside the
+# loop are commented out. Revisit someday, probably after mesa/#2635 has some
+# resolution.
+for i in [1, 2, 16, 24]:
+ lo_mask = 0xffffffff >> i
+ hi_mask = (0xffffffff << i) & 0xffffffff
+
+ optimizations.extend([
+ # This pattern seems to only help in the soft-fp64 code.
+ (('ishl@32', ('iand', 'a@32', lo_mask), i), ('ishl', a, i)),
+# (('ushr@32', ('iand', 'a@32', hi_mask), i), ('ushr', a, i)),
+# (('ishr@32', ('iand', 'a@32', hi_mask), i), ('ishr', a, i)),
+
+ (('iand', ('ishl', 'a@32', i), hi_mask), ('ishl', a, i)),
+ (('iand', ('ushr', 'a@32', i), lo_mask), ('ushr', a, i)),
+# (('iand', ('ishr', 'a@32', i), lo_mask), ('ushr', a, i)), # Yes, ushr is correct
+ ])
+
optimizations.extend([
# This is common for address calculations. Reassociating may enable the
# 'a<<c' to be CSE'd. It also helps architectures that have an ISHLADD
(('umin', ('umin', a, b), b), ('umin', a, b)),
(('imin', ('imin', a, b), b), ('imin', a, b)),
(('iand@32', a, ('inot', ('ishr', a, 31))), ('imax', a, 0)),
- (('fmax', a, ('fneg', a)), ('fabs', a)),
- (('imax', a, ('ineg', a)), ('iabs', a)),
+
+ # Simplify logic that detects the sign of an integer.
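+ # Bit 31 is the sign bit of a 32-bit two's-complement integer, so testing it
+ # (by masking it or shifting it down) is equivalent to comparing against 0.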
+ (('ieq', ('iand', a, 0x80000000), 0x00000000), ('ige', a, 0)),
+ (('ine', ('iand', a, 0x80000000), 0x80000000), ('ige', a, 0)),
+ (('ine', ('iand', a, 0x80000000), 0x00000000), ('ilt', a, 0)),
+ (('ieq', ('iand', a, 0x80000000), 0x80000000), ('ilt', a, 0)),
+ (('ine', ('ushr', 'a@32', 31), 0), ('ilt', a, 0)),
+ (('ieq', ('ushr', 'a@32', 31), 0), ('ige', a, 0)),
+ (('ieq', ('ushr', 'a@32', 31), 1), ('ilt', a, 0)),
+ (('ine', ('ushr', 'a@32', 31), 1), ('ige', a, 0)),
+ (('ine', ('ishr', 'a@32', 31), 0), ('ilt', a, 0)),
+ (('ieq', ('ishr', 'a@32', 31), 0), ('ige', a, 0)),
+ (('ieq', ('ishr', 'a@32', 31), -1), ('ilt', a, 0)),
+ (('ine', ('ishr', 'a@32', 31), -1), ('ige', a, 0)),
+
(('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))),
(('imin', a, ('ineg', a)), ('ineg', ('iabs', a))),
(('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))),
(('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)),
(('fmax', ('fsat', a), '#b@32(is_zero_to_one)'), ('fsat', ('fmax', a, b))),
(('fmin', ('fsat', a), '#b@32(is_zero_to_one)'), ('fsat', ('fmin', a, b))),
+
+ # If a is in [0,b] then b-a is also in [0,b]. Since b is in [0,1], max(b-a, 0) =
+ # fsat(b-a).
+ #
+ # If a > b, then b-a < 0 and max(b-a, 0) = fsat(b-a) = 0
+ #
+ # This should be NaN safe since max(NaN, 0) = fsat(NaN) = 0.
+ (('fmax', ('fadd(is_used_once)', ('fneg', 'a(is_not_negative)'), '#b@32(is_zero_to_one)'), 0.0),
+ ('fsat', ('fadd', ('fneg', a), b)), '!options->lower_fsat'),
+
(('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)),
(('~ior', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmax', b, c))),
(('~ior', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmin', a, b), c)),
(('ine', ('ineg', ('b2i32', 'a@1')), ('ineg', ('b2i32', 'b@1'))), ('ine', a, b)),
(('b2i32', ('ine', 'a@1', 'b@1')), ('b2i32', ('ixor', a, b))),
- (('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('ior', 'a@32', 'b@32'), 0), '!options->lower_bitops'),
+ (('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('ior', a, b), 0), '!options->lower_bitops'),
+ (('ior', ('ine', 'a@32', 0), ('ine', 'b@32', 0)), ('ine', ('ior', a, b), 0), '!options->lower_bitops'),
+
+ # This pattern occurs courtesy of __flt64_nonnan in the soft-fp64 code.
+ # The first part of the iand comes from the !__feq64_nonnan.
+ #
+ # The second pattern is a reformulation of the first based on the relation
+ # (a == 0 || y == 0) <=> umin(a, y) == 0, where the b in the first pattern
+ # happens to be the comparison y == 0.
+ (('iand', ('inot', ('iand', ('ior', ('ieq', a, 0), b), c)), ('ilt', a, 0)),
+ ('iand', ('inot', ('iand', b , c)), ('ilt', a, 0))),
+ (('iand', ('inot', ('iand', ('ieq', ('umin', a, b), 0), c)), ('ilt', a, 0)),
+ ('iand', ('inot', ('iand', ('ieq', b , 0), c)), ('ilt', a, 0))),
# These patterns can result when (a < b || a < c) => (a < min(b, c))
# transformations occur before constant propagation and loop-unrolling.
(('ishr', a, 0), a),
(('ushr', 0, a), 0),
(('ushr', a, 0), a),
- (('iand', 0xff, ('ushr@32', a, 24)), ('ushr', a, 24)),
- (('iand', 0xffff, ('ushr@32', a, 16)), ('ushr', a, 16)),
(('ior', ('ishl@16', a, b), ('ushr@16', a, ('iadd', 16, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'),
(('ior', ('ishl@16', a, b), ('ushr@16', a, ('isub', 16, b))), ('urol', a, b), '!options->lower_rotate'),
(('ior', ('ishl@32', a, b), ('ushr@32', a, ('iadd', 32, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'),
(('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b
(('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),
('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) * d) = a^b * c^d
+ (('~fexp2', ('fmul', ('flog2', a), 0.5)), ('fsqrt', a)),
(('~fexp2', ('fmul', ('flog2', a), 2.0)), ('fmul', a, a)),
(('~fexp2', ('fmul', ('flog2', a), 4.0)), ('fmul', ('fmul', a, a), ('fmul', a, a))),
(('~fpow', a, 1.0), a),
(('~f2u32', ('i2f', 'a@32')), a),
(('~f2u32', ('u2f', 'a@32')), a),
+ # Conversions from float16 to float32 and back can always be removed
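+ # (every 16-bit float is exactly representable as a 32-bit float, so the
+ # round trip is lossless)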
+ (('f2f16', ('f2f32', 'a@16')), a),
+ (('f2fmp', ('f2f32', 'a@16')), a),
+ # Conversions to float16 would be lossy so they should only be removed if
+ # the instruction was generated by the precision lowering pass.
+ (('f2f32', ('f2fmp', 'a@32')), a),
+
(('ffloor', 'a(is_integral)'), a),
(('fceil', 'a(is_integral)'), a),
(('ftrunc', 'a(is_integral)'), a),
(('ilt', 'a(is_lt_zero)', 'b(is_not_negative)'), True),
(('ult', 0, 'a(is_gt_zero)'), True),
+ (('ult', a, 0), False),
# Packing and then unpacking does nothing
(('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a),
(('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)),
(('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)),
(('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)),
+
+ # Optimize half packing
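+ # pack_half_2x16 places the first vec2 component in the low 16 bits and the
+ # second in the high 16 bits, so shifting by 16 simply moves the packed value
+ # to the other half.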
+ (('ishl', ('pack_half_2x16', ('vec2', a, 0)), 16), ('pack_half_2x16', ('vec2', 0, a))),
+ (('ishr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))),
+
+ (('iadd', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
+ ('pack_half_2x16', ('vec2', a, b))),
+ (('ior', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
+ ('pack_half_2x16', ('vec2', a, b))),
])
# After the ('extract_u8', a, 0) pattern, above, triggers, there will be
(('~fadd', '#a', ('fadd', 'b(is_not_const)', '#c')), ('fadd', ('fadd', a, c), b)),
(('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))),
(('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)),
+ (('iand', '#a', ('iand', 'b(is_not_const)', '#c')), ('iand', ('iand', a, c), b)),
+ (('ior', '#a', ('ior', 'b(is_not_const)', '#c')), ('ior', ('ior', a, c), b)),
+ (('ixor', '#a', ('ixor', 'b(is_not_const)', '#c')), ('ixor', ('ixor', a, c), b)),
# Drop mul-div by the same value when there's no wrapping.
(('idiv', ('imul(no_signed_wrap)', a, b), b), a),
(('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)),
+ (('fmin3@64', a, b, c), ('fmin@64', a, ('fmin@64', b, c))),
+ (('fmax3@64', a, b, c), ('fmax@64', a, ('fmax@64', b, c))),
+ (('fmed3@64', a, b, c), ('fmax@64', ('fmin@64', ('fmax@64', a, b), c), ('fmin@64', a, b))),
+
# Misc. lowering
- (('fmod@16', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
- (('fmod@32', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
+ (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
(('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),
(('uadd_carry@32', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),
(('usub_borrow@32', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'),
(('irhadd@64', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
(('urhadd@64', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
+ (('uadd_sat@64', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_add_sat || (options->lower_int64_options & nir_lower_iadd64) != 0'),
(('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_add_sat'),
(('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_add_sat'),
+ (('usub_sat@64', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_usub_sat64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
+
+ # int64_t sum = a + b;
+ #
+ # if (a < 0 && b < 0 && a < sum) {
+ # sum = INT64_MIN;
+ # } else if (a >= 0 && b >= 0 && sum < a) {
+ # sum = INT64_MAX;
+ # }
+ #
+ # A couple optimizations are applied.
+ #
+ # 1. a < sum => sum >= 0. This replacement works because it is known that
+ # a < 0 and b < 0, so sum should also be < 0 unless there was
+ # underflow.
+ #
+ # 2. sum < a => sum < 0. This replacement works because it is known that
+ # a >= 0 and b >= 0, so sum should also be >= 0 unless there was
+ # overflow.
+ #
+ # 3. Invert the second if-condition and swap the order of parameters for
+ # the bcsel. !(a >= 0 && b >= 0 && sum < 0) becomes !(a >= 0) || !(b >=
+ # 0) || !(sum < 0), and that becomes (a < 0) || (b < 0) || (sum >= 0)
+ #
+ # On Intel Gen11, this saves ~11 instructions.
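+ #
+ # For example, with a = b = INT64_MAX the wrapped sum is -2: the first
+ # condition is false, and the inner condition (a < 0) || (b < 0) || (sum >= 0)
+ # is also false, so the result is INT64_MAX as expected.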
+ (('iadd_sat@64', a, b), ('bcsel',
+ ('iand', ('iand', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),
+ 0x8000000000000000,
+ ('bcsel',
+ ('ior', ('ior', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),
+ ('iadd', a, b),
+ 0x7fffffffffffffff)),
+ '(options->lower_int64_options & nir_lower_iadd64) != 0'),
+
+ # int64_t sum = a - b;
+ #
+ # if (a < 0 && b >= 0 && a < sum) {
+ # sum = INT64_MIN;
+ # } else if (a >= 0 && b < 0 && a >= sum) {
+ # sum = INT64_MAX;
+ # }
+ #
+ # Optimizations similar to the iadd_sat case are applied here.
+ (('isub_sat@64', a, b), ('bcsel',
+ ('iand', ('iand', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),
+ 0x8000000000000000,
+ ('bcsel',
+ ('ior', ('ior', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),
+ ('isub', a, b),
+ 0x7fffffffffffffff)),
+ '(options->lower_int64_options & nir_lower_iadd64) != 0'),
+
+ # These are done here instead of in the backend because the int64 lowering
+ # pass will make a mess of the patterns. The first patterns are
+ # conditioned on nir_lower_minmax64 because it was not clear that it was
+ # always an improvement on platforms that have real int64 support. No
+ # shaders in shader-db hit this, so it was hard to say one way or the
+ # other.
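+ # These comparisons only consume the sign of the 64-bit min/max, and that
+ # sign depends only on the signs of the operands, which are carried entirely
+ # in the high 32 bits of each value.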
+ (('ilt', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
+ (('ilt', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
+ (('ige', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
+ (('ige', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
+ (('ilt', 'a@64', 0), ('ilt', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
+ (('ige', 'a@64', 0), ('ige', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
+
+ (('ine', 'a@64', 0), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
+ (('ieq', 'a@64', 0), ('ieq', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
+ # 0u < uint(a) <=> uint(a) != 0u
+ (('ult', 0, 'a@64'), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
# Alternative lowering that doesn't rely on bfi.
(('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
(('pack_half_2x16_split', 'a@32', 'b@32'),
('ior', ('ishl', ('u2u32', ('f2f16', b)), 16), ('u2u32', ('f2f16', a))),
- 'options->lower_pack_half_2x16_split'),
+ 'options->lower_pack_split'),
(('unpack_half_2x16_split_x', 'a@32'),
('f2f32', ('u2u16', a)),
- 'options->lower_unpack_half_2x16_split'),
+ 'options->lower_pack_split'),
(('unpack_half_2x16_split_y', 'a@32'),
('f2f32', ('u2u16', ('ushr', a, 16))),
- 'options->lower_unpack_half_2x16_split'),
+ 'options->lower_pack_split'),
+
+ (('pack_32_2x16_split', 'a@16', 'b@16'),
+ ('ior', ('ishl', ('u2u32', b), 16), ('u2u32', a)),
+ 'options->lower_pack_split'),
+
+ (('unpack_32_2x16_split_x', 'a@32'),
+ ('u2u16', a),
+ 'options->lower_pack_split'),
+
+ (('unpack_32_2x16_split_y', 'a@32'),
+ ('u2u16', ('ushr', 'a', 16)),
+ 'options->lower_pack_split'),
(('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
(('fsign', a), ('fsub', ('b2f', ('flt', 0.0, a)), ('b2f', ('flt', a, 0.0))), 'options->lower_fsign'),
# rule converts everyone else to imul:
(('amul', a, b), ('imul', a, b), '!options->has_imul24'),
+ (('umul24', a, b),
+ ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)),
+ '!options->has_umul24'),
+ (('umad24', a, b, c),
+ ('iadd', ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)), c),
+ '!options->has_umad24'),
+
(('imad24_ir3', a, b, 0), ('imul24', a, b)),
(('imad24_ir3', a, 0, c), (c)),
(('imad24_ir3', a, 1, c), ('iadd', a, c)),
('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_add_sat'),
]
-invert = OrderedDict([('feq', 'fne'), ('fne', 'feq'), ('fge', 'flt'), ('flt', 'fge')])
+invert = OrderedDict([('feq', 'fne'), ('fne', 'feq')])
for left, right in itertools.combinations_with_replacement(invert.keys(), 2):
optimizations.append((('inot', ('ior(is_used_once)', (left, a, b), (right, c, d))),
optimizations.append(((x2yN, (b2x, a)), (b2y, a)))
# Optimize away x2xN(a@N)
-for t in ['int', 'uint', 'float']:
+for t in ['int', 'uint', 'float', 'bool']:
for N in type_sizes(t):
x2xN = '{0}2{0}{1}'.format(t[0], N)
aN = 'a@{0}'.format(N)
# The N == M case is handled by other optimizations
pass
+# Downcast operations should be able to see through pack
+for t in ['i', 'u']:
+ for N in [8, 16, 32]:
+ x2xN = '{0}2{0}{1}'.format(t, N)
+ optimizations += [
+ ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)),
+ ]
+
# Optimize comparisons with up-casts
for t in ['int', 'uint', 'float']:
for N, M in itertools.product(type_sizes(t), repeat=2):
if N == 1 or N >= M:
continue
+ cond = 'true'
+ if N == 8:
+ cond = 'options->support_8bit_alu'
+ elif N == 16:
+ cond = 'options->support_16bit_alu'
x2xM = '{0}2{0}{1}'.format(t[0], M)
x2xN = '{0}2{0}{1}'.format(t[0], N)
aN = 'a@' + str(N)
bP = 'b@' + str(P)
optimizations += [
- ((xeq, (x2xM, aN), (x2xM, bP)), (xeq, a, (x2xN, b))),
- ((xne, (x2xM, aN), (x2xM, bP)), (xne, a, (x2xN, b))),
- ((xge, (x2xM, aN), (x2xM, bP)), (xge, a, (x2xN, b))),
- ((xlt, (x2xM, aN), (x2xM, bP)), (xlt, a, (x2xN, b))),
- ((xge, (x2xM, bP), (x2xM, aN)), (xge, (x2xN, b), a)),
- ((xlt, (x2xM, bP), (x2xM, aN)), (xlt, (x2xN, b), a)),
+ ((xeq, (x2xM, aN), (x2xM, bP)), (xeq, a, (x2xN, b)), cond),
+ ((xne, (x2xM, aN), (x2xM, bP)), (xne, a, (x2xN, b)), cond),
+ ((xge, (x2xM, aN), (x2xM, bP)), (xge, a, (x2xN, b)), cond),
+ ((xlt, (x2xM, aN), (x2xM, bP)), (xlt, a, (x2xN, b)), cond),
+ ((xge, (x2xM, bP), (x2xM, aN)), (xge, (x2xN, b), a), cond),
+ ((xlt, (x2xM, bP), (x2xM, aN)), (xlt, (x2xN, b), a), cond),
]
# The next bit doesn't work on floats because the range checks would
# and a check that the constant fits in the smaller bit size.
optimizations += [
((xeq, (x2xM, aN), '#b'),
- ('iand', (xeq, a, (x2xN, b)), (xeq, (x2xM, (x2xN, b)), b))),
+ ('iand', (xeq, a, (x2xN, b)), (xeq, (x2xM, (x2xN, b)), b)), cond),
((xne, (x2xM, aN), '#b'),
- ('ior', (xne, a, (x2xN, b)), (xne, (x2xM, (x2xN, b)), b))),
+ ('ior', (xne, a, (x2xN, b)), (xne, (x2xM, (x2xN, b)), b)), cond),
((xlt, (x2xM, aN), '#b'),
('iand', (xlt, xN_min, b),
- ('ior', (xlt, xN_max, b), (xlt, a, (x2xN, b))))),
+ ('ior', (xlt, xN_max, b), (xlt, a, (x2xN, b)))), cond),
((xlt, '#a', (x2xM, bN)),
('iand', (xlt, a, xN_max),
- ('ior', (xlt, a, xN_min), (xlt, (x2xN, a), b)))),
+ ('ior', (xlt, a, xN_min), (xlt, (x2xN, a), b))), cond),
((xge, (x2xM, aN), '#b'),
('iand', (xge, xN_max, b),
- ('ior', (xge, xN_min, b), (xge, a, (x2xN, b))))),
+ ('ior', (xge, xN_min, b), (xge, a, (x2xN, b)))), cond),
((xge, '#a', (x2xM, bN)),
('iand', (xge, a, xN_min),
- ('ior', (xge, a, xN_max), (xge, (x2xN, a), b)))),
+ ('ior', (xge, a, xN_max), (xge, (x2xN, a), b))), cond),
]
def fexp2i(exp, bits):
- # We assume that exp is already in the right range.
+ # Generate an expression that constructs the value 2.0^exp or 0.0.
+ #
+ # We assume that exp is already in a valid range:
+ #
+ # * [-15, 15] for 16-bit float
+ # * [-127, 127] for 32-bit float
+ # * [-1023, 1023] for 64-bit float
+ #
+ # If exp is the lowest value in the valid range, a value of 0.0 is
+ # constructed. Otherwise, the value 2.0^exp is constructed.
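+ #
+ # The construction simply places the biased exponent in the exponent field
+ # with a zero mantissa: for a 16-bit float the bias is 15 and the exponent
+ # field starts at bit 10, so exp == -15 yields the all-zero encoding, 0.0.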
if bits == 16:
return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
elif bits == 32:
assert False
def ldexp(f, exp, bits):
- # First, we clamp exp to a reasonable range. The maximum possible range
- # for a normal exponent is [-126, 127] and, throwing in denormals, you get
- # a maximum range of [-149, 127]. This means that we can potentially have
- # a swing of +-276. If you start with FLT_MAX, you actually have to do
- # ldexp(FLT_MAX, -278) to get it to flush all the way to zero. The GLSL
- # spec, on the other hand, only requires that we handle an exponent value
- # in the range [-126, 128]. This implementation is *mostly* correct; it
- # handles a range on exp of [-252, 254] which allows you to create any
- # value (including denorms if the hardware supports it) and to adjust the
- # exponent of any normal value to anything you want.
+ # The maximum possible range for a normal exponent is [-126, 127] and,
+ # throwing in denormals, you get a maximum range of [-149, 127]. This
+ # means that we can potentially have a swing of +-276. If you start with
+ # FLT_MAX, you actually have to do ldexp(FLT_MAX, -278) to get it to flush
+ # all the way to zero. The GLSL spec only requires that we handle a subset
+ # of this range. From version 4.60 of the spec:
+ #
+ # "If exp is greater than +128 (single-precision) or +1024
+ # (double-precision), the value returned is undefined. If exp is less
+ # than -126 (single-precision) or -1022 (double-precision), the value
+ # returned may be flushed to zero. Additionally, splitting the value
+ # into a significand and exponent using frexp() and then reconstructing
+ # a floating-point value using ldexp() should yield the original input
+ # for zero and all finite non-denormalized values."
+ #
+ # The SPIR-V spec has similar language.
+ #
+ # In order to handle the maximum value +128 using the fexp2i() helper
+ # above, we have to split the exponent in half and do two multiply
+ # operations.
+ #
+ # First, we clamp exp to a reasonable range. Specifically, we clamp to
+ # twice the full range that is valid for the fexp2i() function above. If
+ # exp/2 is the bottom value of that range, the fexp2i() expression will
+ # yield 0.0f which, when multiplied by f, will flush it to zero which is
+ # allowed by the GLSL and SPIR-V specs for low exponent values. If the
+ # value is clamped from above, then it must have been above the supported
+ # range of the GLSL built-in and therefore any return value is acceptable.
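+ #
+ # For example, a 32-bit ldexp(f, 254) can then be evaluated as
+ # (f * 2.0^127) * 2.0^127, where each factor is a representable finite float
+ # even though 2.0^254 itself is not.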
if bits == 16:
- exp = ('imin', ('imax', exp, -28), 30)
+ exp = ('imin', ('imax', exp, -30), 30)
elif bits == 32:
- exp = ('imin', ('imax', exp, -252), 254)
+ exp = ('imin', ('imax', exp, -254), 254)
elif bits == 64:
- exp = ('imin', ('imax', exp, -2044), 2046)
+ exp = ('imin', ('imax', exp, -2046), 2046)
else:
assert False
(('ior', a, a), a),
(('iand', a, a), a),
+ (('iand', ('ine(is_used_once)', 'a@32', 0), ('ine', 'b@32', 0)), ('ine', ('umin', a, b), 0)),
+ (('ior', ('ieq(is_used_once)', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('umin', a, b), 0)),
+
(('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),
(('fdot2', a, b), ('fdot_replicated2', a, b), 'options->fdot_replicates'),
('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
(('~fadd', ('ffma(is_used_once)', a, b, ('fmul', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),
('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
+
+ # Convert f2fmp instructions to concrete f2f16 instructions. At this point
+ # any conversions that could have been removed will have been removed in
+ # nir_opt_algebraic so any remaining ones are required.
+ (('f2fmp', a), ('f2f16', a)),
]
for op in ['fadd']:
(('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
]
+distribute_src_mods = [
+ # Try to remove some spurious negations rather than pushing them down.
+ (('fmul', ('fneg', a), ('fneg', b)), ('fmul', a, b)),
+ (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),
+ (('fdot_replicated2', ('fneg', a), ('fneg', b)), ('fdot_replicated2', a, b)),
+ (('fdot_replicated3', ('fneg', a), ('fneg', b)), ('fdot_replicated3', a, b)),
+ (('fdot_replicated4', ('fneg', a), ('fneg', b)), ('fdot_replicated4', a, b)),
+ (('fneg', ('fneg', a)), a),
+
+ (('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),
+ (('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)),
+ (('fneg', ('fadd(is_used_once)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),
+
+ # Note that distributing fneg swaps fmin <-> fmax. I don't think there is a
+ # way to distribute fabs() into fmin or fmax.
+ (('fneg', ('fmin(is_used_once)', a, b)), ('fmax', ('fneg', a), ('fneg', b))),
+ (('fneg', ('fmax(is_used_once)', a, b)), ('fmin', ('fneg', a), ('fneg', b))),
+
+ # fdph works mostly like fdot, but to get the correct result, the negation
+ # must be applied to the second source.
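+ # fdph(a, b) is effectively dot(vec4(a.xyz, 1.0), b): negating b negates
+ # every term of the sum, while negating a would leave the b.w term unchanged.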
+ (('fneg', ('fdph_replicated(is_used_once)', a, b)), ('fdph_replicated', a, ('fneg', b))),
+ (('fabs', ('fdph_replicated(is_used_once)', a, b)), ('fdph_replicated', ('fabs', a), ('fabs', b))),
+
+ (('fneg', ('fsign(is_used_once)', a)), ('fsign', ('fneg', a))),
+ (('fabs', ('fsign(is_used_once)', a)), ('fsign', ('fabs', a))),
+]
+
+for op in ['fmul', 'fdot_replicated2', 'fdot_replicated3', 'fdot_replicated4']:
+ distribute_src_mods.extend([
+ (('fneg', (op + '(is_used_once)', a, b)), (op, ('fneg', a), b)),
+ (('fabs', (op + '(is_used_once)', a, b)), (op, ('fabs', a), ('fabs', b))),
+ ])
+
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma",
before_ffma_optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late",
late_optimizations).render())
+print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_distribute_src_mods",
+ distribute_src_mods).render())