import nir_algebraic
from nir_opcodes import type_sizes
import itertools
+import struct
+from math import pi
# Convenience variables
a = 'a'
# however, be used for backend-requested lowering operations as those need to
# happen regardless of precision.
#
-# Variable names are specified as "[#]name[@type][(cond)]" where "#" inicates
-# that the given variable will only match constants and the type indicates that
-# the given variable will only match values from ALU instructions with the
-# given output type, and (cond) specifies an additional condition function
-# (see nir_search_helpers.h).
+# Variable names are specified as "[#]name[@type][(cond)][.swiz]" where:
+# "#" indicates that the given variable will only match constants,
+# type indicates that the given variable will only match values from ALU
+# instructions with the given output type,
+# (cond) specifies an additional condition function (see nir_search_helpers.h),
+# swiz is a swizzle applied to the variable (only in the <replace> expression)
#
# For constants, you have to be careful to make sure that it is the right
# type because python is unaware of the source and destination types of the
# should only match that particular bit-size. In the replace half of the
# expression this indicates that the constructed value should have that
# bit-size.
+#
+# If the opcode in a replacement expression is prefixed by a '!' character,
+# this indicated that the new expression will be marked exact.
+#
+# A special condition "many-comm-expr" can be used with expressions to note
+# that the expression and its subexpressions have more commutative expressions
+# than nir_replace_instr can handle. If this special condition is needed with
+# another condition, the two can be separated by a comma (e.g.,
+# "(many-comm-expr,is_used_once)").
+
+# based on https://web.archive.org/web/20180105155939/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
+def lowered_sincos(c):
+ x = ('fsub', ('fmul', 2.0, ('ffract', ('fadd', ('fmul', 0.5 / pi, a), c))), 1.0)
+ x = ('fmul', ('fsub', x, ('fmul', x, ('fabs', x))), 4.0)
+ return ('ffma', ('ffma', x, ('fabs', x), ('fneg', x)), 0.225, x)
+
+def intBitsToFloat(i):
+ return struct.unpack('!f', struct.pack('!I', i))[0]
optimizations = [
- (('imul', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b))),
- (('imul', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b))))),
+ (('imul', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
+ (('imul', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
+ (('ishl', a, '#b@32'), ('imul', a, ('ishl', 1, b)), 'options->lower_bitops'),
+
(('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
(('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
(('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'),
(('idiv', a, 1), a),
(('umod', a, 1), 0),
(('imod', a, 1), 0),
- (('udiv', a, '#b@32(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b))),
+ (('udiv', a, '#b@32(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b)), '!options->lower_bitops'),
(('idiv', a, '#b@32(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), 'options->lower_idiv'),
(('idiv', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), 'options->lower_idiv'),
(('umod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1))),
- (('fneg', ('fneg', a)), a),
+ (('~fneg', ('fneg', a)), a),
(('ineg', ('ineg', a)), a),
- (('fabs', ('fabs', a)), ('fabs', a)),
(('fabs', ('fneg', a)), ('fabs', a)),
(('fabs', ('u2f', a)), ('u2f', a)),
(('iabs', ('iabs', a)), ('iabs', a)),
(('imul', a, 0), 0),
(('umul_unorm_4x8', a, 0), 0),
(('umul_unorm_4x8', a, ~0), a),
- (('fmul', a, 1.0), a),
+ (('~fmul', a, 1.0), a),
(('imul', a, 1), a),
(('fmul', a, -1.0), ('fneg', a)),
(('imul', a, -1), ('ineg', a)),
(('~fadd@32', ('fmul', a, ('fadd', 1.0, ('fneg', c ) )), ('fmul', b, c )), ('flrp', a, b, c), '!options->lower_flrp32'),
(('~fadd@64', ('fmul', a, ('fadd', 1.0, ('fneg', c ) )), ('fmul', b, c )), ('flrp', a, b, c), '!options->lower_flrp64'),
# These are the same as the previous three rules, but it depends on
- # 1-fsat(x) <=> fsat(1-x):
- #
- # If x >= 0 and x <= 1: fsat(1 - x) == 1 - fsat(x) trivially
- # If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1
- # If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0
+ # 1-fsat(x) <=> fsat(1-x). See below.
(('~fadd@32', ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg', c )))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp32'),
(('~fadd@64', ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg', c )))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp64'),
(('~fmul', ('fadd', ('iand', ('ineg', ('b2i32', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),
- (('fdot4', ('vec4', a, b, c, 1.0), d), ('fdph', ('vec3', a, b, c), d)),
+ (('fdph', a, b), ('fdot4', ('vec4', 'a.x', 'a.y', 'a.z', 1.0), b), 'options->lower_fdph'),
+
+ (('fdot4', ('vec4', a, b, c, 1.0), d), ('fdph', ('vec3', a, b, c), d), '!options->lower_fdph'),
(('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),
(('fdot4', ('vec4', a, b, 0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),
(('fdot4', ('vec4', a, b, c, 0.0), d), ('fdot3', ('vec3', a, b, c), d)),
(('fdot3', ('vec3', a, 0.0, 0.0), b), ('fmul', a, b)),
(('fdot3', ('vec3', a, b, 0.0), c), ('fdot2', ('vec2', a, b), c)),
+ (('fdot2', ('vec2', a, 0.0), b), ('fmul', a, b)),
+ (('fdot2', a, 1.0), ('fadd', 'a.x', 'a.y')),
+
+ # Lower fdot to fsum when it is available
+ (('fdot2', a, b), ('fsum2', ('fmul', a, b)), 'options->lower_fdot'),
+ (('fdot3', a, b), ('fsum3', ('fmul', a, b)), 'options->lower_fdot'),
+ (('fdot4', a, b), ('fsum4', ('fmul', a, b)), 'options->lower_fdot'),
+ (('fsum2', a), ('fadd', 'a.x', 'a.y'), 'options->lower_fdot'),
+
+ # If x >= 0 and x <= 1: fsat(1 - x) == 1 - fsat(x) trivially
+ # If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1
+ # If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0
+ (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),
+
+ # 1 - ((1 - a) * (1 - b))
+ # 1 - (1 - a - b + a*b)
+ # 1 - 1 + a + b - a*b
+ # a + b - a*b
+ # a + b*(1 - a)
+ # b*(1 - a) + 1*a
+ # flrp(b, 1, a)
+ (('~fadd@32', 1.0, ('fneg', ('fmul', ('fadd', 1.0, ('fneg', a)), ('fadd', 1.0, ('fneg', b))))),
+ ('flrp', b, 1.0, a), '!options->lower_flrp32'),
+
# (a * #b + #c) << #d
# ((a * #b) << #d) + (#c << #d)
# (a * (#b << #d)) + (#c << #d)
# (a * #b) << #c
# a * (#b << #c)
(('ishl', ('imul', a, '#b'), '#c'), ('imul', a, ('ishl', b, c))),
+]
+
+# Care must be taken here. Shifts in NIR uses only the lower log2(bitsize)
+# bits of the second source. These replacements must correctly handle the
+# case where (b % bitsize) + (c % bitsize) >= bitsize.
+for s in [8, 16, 32, 64]:
+ mask = (1 << s) - 1
+
+ ishl = "ishl@{}".format(s)
+ ishr = "ishr@{}".format(s)
+ ushr = "ushr@{}".format(s)
+
+ in_bounds = ('ult', ('iadd', ('iand', b, mask), ('iand', c, mask)), s)
+
+ optimizations.extend([
+ ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds, (ishl, a, ('iadd', b, c)), 0)),
+ ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds, (ushr, a, ('iadd', b, c)), 0)),
+
+ # To get get -1 for large shifts of negative values, ishr must instead
+ # clamp the shift count to the maximum value.
+ ((ishr, (ishr, a, '#b'), '#c'),
+ (ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))),
+ ])
+
+# Optimize a pattern of address calculation created by DXVK where the offset is
+# divided by 4 and then multipled by 4. This can be turned into an iand and the
+# additions before can be reassociated to CSE the iand instruction.
+for log2 in range(1, 7): # powers of two from 2 to 64
+ v = 1 << log2
+ mask = 0xffffffff & ~(v - 1)
+ b_is_multiple = '#b(is_unsigned_multiple_of_{})'.format(v)
+
+ optimizations.extend([
+ # 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)'
+ (('ishl@32', ('ushr@32', a, log2), log2), ('iand', a, mask)),
+
+ # Reassociate for improved CSE
+ (('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)),
+ ])
+
+# To save space in the state tables, reduce to the set that is known to help.
+# Previously, this was range(1, 32). In addition, a couple rules inside the
+# loop are commented out. Revisit someday, probably after mesa/#2635 has some
+# resolution.
+for i in [1, 2, 16, 24]:
+ lo_mask = 0xffffffff >> i
+ hi_mask = (0xffffffff << i) & 0xffffffff
+
+ optimizations.extend([
+ # This pattern seems to only help in the soft-fp64 code.
+ (('ishl@32', ('iand', 'a@32', lo_mask), i), ('ishl', a, i)),
+# (('ushr@32', ('iand', 'a@32', hi_mask), i), ('ushr', a, i)),
+# (('ishr@32', ('iand', 'a@32', hi_mask), i), ('ishr', a, i)),
+
+ (('iand', ('ishl', 'a@32', i), hi_mask), ('ishl', a, i)),
+ (('iand', ('ushr', 'a@32', i), lo_mask), ('ushr', a, i)),
+# (('iand', ('ishr', 'a@32', i), lo_mask), ('ushr', a, i)), # Yes, ushr is correct
+ ])
+
+optimizations.extend([
+ # This is common for address calculations. Reassociating may enable the
+ # 'a<<c' to be CSE'd. It also helps architectures that have an ISHLADD
+ # instruction or a constant offset field for in load / store instructions.
+ (('ishl', ('iadd', a, '#b'), '#c'), ('iadd', ('ishl', a, c), ('ishl', b, c))),
# Comparison simplifications
(('~inot', ('flt', a, b)), ('fge', a, b)),
(('~inot', ('fge', a, b)), ('flt', a, b)),
- (('~inot', ('feq', a, b)), ('fne', a, b)),
- (('~inot', ('fne', a, b)), ('feq', a, b)),
+ (('inot', ('feq', a, b)), ('fne', a, b)),
+ (('inot', ('fne', a, b)), ('feq', a, b)),
(('inot', ('ilt', a, b)), ('ige', a, b)),
(('inot', ('ult', a, b)), ('uge', a, b)),
(('inot', ('ige', a, b)), ('ilt', a, b)),
(('inot', ('ieq', a, b)), ('ine', a, b)),
(('inot', ('ine', a, b)), ('ieq', a, b)),
+ (('iand', ('feq', a, b), ('fne', a, b)), False),
+ (('iand', ('flt', a, b), ('flt', b, a)), False),
+ (('iand', ('ieq', a, b), ('ine', a, b)), False),
+ (('iand', ('ilt', a, b), ('ilt', b, a)), False),
+ (('iand', ('ult', a, b), ('ult', b, a)), False),
+
# This helps some shaders because, after some optimizations, they end up
# with patterns like (-a < -b) || (b < a). In an ideal world, this sort of
# matching would be handled by CSE.
(('fne', ('fneg', a), -1.0), ('fne', 1.0, a)),
(('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),
+ (('flt', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('flt', a, b)),
+ (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),
+ (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),
+ (('fge', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('fge', b, a)),
+ (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),
+ (('fne', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fne', a, b)),
+
+ (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),
+ (('flt', ('fsat(is_used_once)', a), 1.0), ('flt', a, 1.0)),
+ (('fge', 0.0, ('fsat(is_used_once)', a)), ('fge', 0.0, a)),
+ (('flt', 0.0, ('fsat(is_used_once)', a)), ('flt', 0.0, a)),
+
# 0.0 >= b2f(a)
# b2f(a) <= 0.0
# b2f(a) == 0.0 because b2f(a) can only be 0 or 1
(('~fge', ('fadd', a, b), a), ('fge', b, 0.0)),
(('~feq', ('fadd', a, b), a), ('feq', b, 0.0)),
(('~fne', ('fadd', a, b), a), ('fne', b, 0.0)),
+ (('~flt', ('fadd(is_used_once)', a, '#b'), '#c'), ('flt', a, ('fadd', c, ('fneg', b)))),
+ (('~flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('flt', ('fneg', ('fadd', c, b)), a)),
+ (('~fge', ('fadd(is_used_once)', a, '#b'), '#c'), ('fge', a, ('fadd', c, ('fneg', b)))),
+ (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fge', ('fneg', ('fadd', c, b)), a)),
+ (('~feq', ('fadd(is_used_once)', a, '#b'), '#c'), ('feq', a, ('fadd', c, ('fneg', b)))),
+ (('~feq', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('feq', ('fneg', ('fadd', c, b)), a)),
+ (('~fne', ('fadd(is_used_once)', a, '#b'), '#c'), ('fne', a, ('fadd', c, ('fneg', b)))),
+ (('~fne', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fne', ('fneg', ('fadd', c, b)), a)),
# Cannot remove the addition from ilt or ige due to overflow.
(('ieq', ('iadd', a, b), a), ('ieq', b, 0)),
(('feq', ('fmin', ('fneg', ('b2f', 'a@1')), 'b@1'), 0.0), ('iand', ('inot', a), ('fge', b, 0.0))),
(('feq', ('b2f', 'a@1'), 0.0), ('inot', a)),
- (('fne', ('b2f', 'a@1'), 0.0), a),
+ (('~fne', ('b2f', 'a@1'), 0.0), a),
(('ieq', ('b2i', 'a@1'), 0), ('inot', a)),
(('ine', ('b2i', 'a@1'), 0), a),
# 0.0 >= fabs(a)
(('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
+ # (a >= 0.0) && (a <= 1.0) -> fsat(a) == a
+ (('iand', ('fge', a, 0.0), ('fge', 1.0, a)), ('feq', a, ('fsat', a)), '!options->lower_fsat'),
+
+ # (a < 0.0) || (a > 1.0)
+ # !(!(a < 0.0) && !(a > 1.0))
+ # !((a >= 0.0) && (a <= 1.0))
+ # !(a == fsat(a))
+ # a != fsat(a)
+ (('ior', ('flt', a, 0.0), ('flt', 1.0, a)), ('fne', a, ('fsat', a)), '!options->lower_fsat'),
+
(('fmax', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('ior', a, b))),
(('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('ior', a, b)))),
(('fmin', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))),
(('bcsel', a, a, b), ('ior', a, b)),
(('bcsel', a, b, False), ('iand', a, b)),
(('bcsel', a, b, a), ('iand', a, b)),
- (('fmin', a, a), a),
- (('fmax', a, a), a),
+ (('~fmin', a, a), a),
+ (('~fmax', a, a), a),
(('imin', a, a), a),
(('imax', a, a), a),
(('umin', a, a), a),
(('fmin', ('fmin', a, b), b), ('fmin', a, b)),
(('umin', ('umin', a, b), b), ('umin', a, b)),
(('imin', ('imin', a, b), b), ('imin', a, b)),
- (('fmax', a, ('fneg', a)), ('fabs', a)),
- (('imax', a, ('ineg', a)), ('iabs', a)),
+ (('iand@32', a, ('inot', ('ishr', a, 31))), ('imax', a, 0)),
+
+ # Simplify logic to detect sign of an integer.
+ (('ieq', ('iand', a, 0x80000000), 0x00000000), ('ige', a, 0)),
+ (('ine', ('iand', a, 0x80000000), 0x80000000), ('ige', a, 0)),
+ (('ine', ('iand', a, 0x80000000), 0x00000000), ('ilt', a, 0)),
+ (('ieq', ('iand', a, 0x80000000), 0x80000000), ('ilt', a, 0)),
+ (('ine', ('ushr', 'a@32', 31), 0), ('ilt', a, 0)),
+ (('ieq', ('ushr', 'a@32', 31), 0), ('ige', a, 0)),
+ (('ieq', ('ushr', 'a@32', 31), 1), ('ilt', a, 0)),
+ (('ine', ('ushr', 'a@32', 31), 1), ('ige', a, 0)),
+ (('ine', ('ishr', 'a@32', 31), 0), ('ilt', a, 0)),
+ (('ieq', ('ishr', 'a@32', 31), 0), ('ige', a, 0)),
+ (('ieq', ('ishr', 'a@32', 31), -1), ('ilt', a, 0)),
+ (('ine', ('ishr', 'a@32', 31), -1), ('ige', a, 0)),
+
(('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))),
(('imin', a, ('ineg', a)), ('ineg', ('iabs', a))),
(('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))),
(('imin', a, ('ineg', ('iabs', a))), ('ineg', ('iabs', a))),
- (('fmin', a, ('fabs', a)), a),
+ (('~fmin', a, ('fabs', a)), a),
(('imin', a, ('iabs', a)), a),
- (('fmax', a, ('fneg', ('fabs', a))), a),
+ (('~fmax', a, ('fneg', ('fabs', a))), a),
(('imax', a, ('ineg', ('iabs', a))), a),
(('fmax', a, ('fabs', a)), ('fabs', a)),
(('imax', a, ('iabs', a)), ('iabs', a)),
(('fmax', a, ('fneg', a)), ('fabs', a)),
(('imax', a, ('ineg', a)), ('iabs', a)),
+ (('~fmax', ('fabs', a), 0.0), ('fabs', a)),
(('~fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
(('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
- (('~fmin', ('fmax', a, -1.0), 0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_negate && !options->lower_fsat'),
- (('~fmax', ('fmin', a, 0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_negate && !options->lower_fsat'),
+ (('~fmin', ('fmax', a, -1.0), 0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
+ (('~fmax', ('fmin', a, 0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
(('fsat', ('fsign', a)), ('b2f', ('flt', 0.0, a))),
(('fsat', ('b2f', a)), ('b2f', a)),
(('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
(('fsat', ('fsat', a)), ('fsat', a)),
- (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_negate && !options->lower_fsat'),
- (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_negate && !options->lower_fsat'),
+ (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_fsat'),
+ (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_fsat'),
(('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'),
(('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),
(('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),
(('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)),
(('fmax', ('fsat', a), '#b@32(is_zero_to_one)'), ('fsat', ('fmax', a, b))),
(('fmin', ('fsat', a), '#b@32(is_zero_to_one)'), ('fsat', ('fmin', a, b))),
+
+ # If a in [0,b] then b-a is also in [0,b]. Since b in [0,1], max(b-a, 0) =
+ # fsat(b-a).
+ #
+ # If a > b, then b-a < 0 and max(b-a, 0) = fsat(b-a) = 0
+ #
+ # This should be NaN safe since max(NaN, 0) = fsat(NaN) = 0.
+ (('fmax', ('fadd(is_used_once)', ('fneg', 'a(is_not_negative)'), '#b@32(is_zero_to_one)'), 0.0),
+ ('fsat', ('fadd', ('fneg', a), b)), '!options->lower_fsat'),
+
(('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)),
(('~ior', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmax', b, c))),
(('~ior', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmin', a, b), c)),
(('iand', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umax', b, c))),
(('iand', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umin', a, b), c)),
+ # These derive from the previous patterns with the application of b < 0 <=>
+ # 0 < -b. The transformation should be applied if either comparison is
+ # used once as this ensures that the number of comparisons will not
+ # increase. The sources to the ior and iand are not symmetric, so the
+ # rules have to be duplicated to get this behavior.
+ (('~ior', ('flt(is_used_once)', 0.0, 'a@32'), ('flt', 'b@32', 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
+ (('~ior', ('flt', 0.0, 'a@32'), ('flt(is_used_once)', 'b@32', 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
+ (('~ior', ('fge(is_used_once)', 0.0, 'a@32'), ('fge', 'b@32', 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
+ (('~ior', ('fge', 0.0, 'a@32'), ('fge(is_used_once)', 'b@32', 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
+ (('~iand', ('flt(is_used_once)', 0.0, 'a@32'), ('flt', 'b@32', 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
+ (('~iand', ('flt', 0.0, 'a@32'), ('flt(is_used_once)', 'b@32', 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
+ (('~iand', ('fge(is_used_once)', 0.0, 'a@32'), ('fge', 'b@32', 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),
+ (('~iand', ('fge', 0.0, 'a@32'), ('fge(is_used_once)', 'b@32', 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),
+
# Common pattern like 'if (i == 0 || i == 1 || ...)'
(('ior', ('ieq', a, 0), ('ieq', a, 1)), ('uge', 1, a)),
(('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)),
(('ine', ('ineg', ('b2i32', 'a@1')), ('ineg', ('b2i32', 'b@1'))), ('ine', a, b)),
(('b2i32', ('ine', 'a@1', 'b@1')), ('b2i32', ('ixor', a, b))),
- (('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('ior', 'a@32', 'b@32'), 0)),
+ (('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('ior', a, b), 0), '!options->lower_bitops'),
+ (('ior', ('ine', 'a@32', 0), ('ine', 'b@32', 0)), ('ine', ('ior', a, b), 0), '!options->lower_bitops'),
+
+ # This pattern occurs coutresy of __flt64_nonnan in the soft-fp64 code.
+ # The first part of the iand comes from the !__feq64_nonnan.
+ #
+ # The second pattern is a reformulation of the first based on the relation
+ # (a == 0 || y == 0) <=> umin(a, y) == 0, where b in the first equation
+ # happens to be y == 0.
+ (('iand', ('inot', ('iand', ('ior', ('ieq', a, 0), b), c)), ('ilt', a, 0)),
+ ('iand', ('inot', ('iand', b , c)), ('ilt', a, 0))),
+ (('iand', ('inot', ('iand', ('ieq', ('umin', a, b), 0), c)), ('ilt', a, 0)),
+ ('iand', ('inot', ('iand', ('ieq', b , 0), c)), ('ilt', a, 0))),
# These patterns can result when (a < b || a < c) => (a < min(b, c))
# transformations occur before constant propagation and loop-unrolling.
(('ult', ('umax', a, b), a), False),
(('uge', a, ('umax', b, a)), ('uge', a, b)),
(('uge', ('umin', a, b), a), ('uge', b, a)),
+ (('ult', a, ('iand', b, a)), False),
+ (('ult', ('ior', a, b), a), False),
+ (('uge', a, ('iand', b, a)), True),
+ (('uge', ('ior', a, b), a), True),
(('ilt', '#a', ('imax', '#b', c)), ('ior', ('ilt', a, b), ('ilt', a, c))),
(('ilt', ('imin', '#a', b), '#c'), ('ior', ('ilt', a, c), ('ilt', b, c))),
(('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'),
(('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'),
(('sne', a, b), ('b2f', ('fne', a, b)), 'options->lower_scmp'),
+ (('seq', ('seq', a, b), 1.0), ('seq', a, b)),
+ (('seq', ('sne', a, b), 1.0), ('sne', a, b)),
+ (('seq', ('slt', a, b), 1.0), ('slt', a, b)),
+ (('seq', ('sge', a, b), 1.0), ('sge', a, b)),
+ (('sne', ('seq', a, b), 0.0), ('seq', a, b)),
+ (('sne', ('sne', a, b), 0.0), ('sne', a, b)),
+ (('sne', ('slt', a, b), 0.0), ('slt', a, b)),
+ (('sne', ('sge', a, b), 0.0), ('sge', a, b)),
+ (('seq', ('seq', a, b), 0.0), ('sne', a, b)),
+ (('seq', ('sne', a, b), 0.0), ('seq', a, b)),
+ (('seq', ('slt', a, b), 0.0), ('sge', a, b)),
+ (('seq', ('sge', a, b), 0.0), ('slt', a, b)),
+ (('sne', ('seq', a, b), 1.0), ('sne', a, b)),
+ (('sne', ('sne', a, b), 1.0), ('seq', a, b)),
+ (('sne', ('slt', a, b), 1.0), ('sge', a, b)),
+ (('sne', ('sge', a, b), 1.0), ('slt', a, b)),
+ (('fall_equal2', a, b), ('fmin', ('seq', 'a.x', 'b.x'), ('seq', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
+ (('fall_equal3', a, b), ('seq', ('fany_nequal3', a, b), 0.0), 'options->lower_vector_cmp'),
+ (('fall_equal4', a, b), ('seq', ('fany_nequal4', a, b), 0.0), 'options->lower_vector_cmp'),
+ (('fany_nequal2', a, b), ('fmax', ('sne', 'a.x', 'b.x'), ('sne', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
+ (('fany_nequal3', a, b), ('fsat', ('fdot3', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
+ (('fany_nequal4', a, b), ('fsat', ('fdot4', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
(('fne', ('fneg', a), a), ('fne', a, 0.0)),
(('feq', ('fneg', a), a), ('feq', a, 0.0)),
# Emulating booleans
# True/False are ~0 and 0 in NIR. b2i of True is 1, and -1 is ~0 (True).
(('ineg', ('b2i32', 'a@32')), a),
(('flt', ('fneg', ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF.
- (('flt', ('fsub', 0.0, ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF.
# Comparison with the same args. Note that these are not done for
# the float versions because NaN always returns false on float
# inequalities.
(('ult', a, a), False),
(('uge', a, a), True),
# Logical and bit operations
- (('fand', a, 0.0), 0.0),
(('iand', a, a), a),
(('iand', a, ~0), a),
(('iand', a, 0), 0),
(('ior', a, a), a),
(('ior', a, 0), a),
(('ior', a, True), True),
- (('fxor', a, a), 0.0),
(('ixor', a, a), 0),
(('ixor', a, 0), a),
(('inot', ('inot', a)), a),
(('ishr', a, 0), a),
(('ushr', 0, a), 0),
(('ushr', a, 0), a),
- (('iand', 0xff, ('ushr@32', a, 24)), ('ushr', a, 24)),
- (('iand', 0xffff, ('ushr@32', a, 16)), ('ushr', a, 16)),
+ (('ior', ('ishl@16', a, b), ('ushr@16', a, ('iadd', 16, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'),
+ (('ior', ('ishl@16', a, b), ('ushr@16', a, ('isub', 16, b))), ('urol', a, b), '!options->lower_rotate'),
+ (('ior', ('ishl@32', a, b), ('ushr@32', a, ('iadd', 32, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'),
+ (('ior', ('ishl@32', a, b), ('ushr@32', a, ('isub', 32, b))), ('urol', a, b), '!options->lower_rotate'),
+ (('ior', ('ushr@16', a, b), ('ishl@16', a, ('iadd', 16, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'),
+ (('ior', ('ushr@16', a, b), ('ishl@16', a, ('isub', 16, b))), ('uror', a, b), '!options->lower_rotate'),
+ (('ior', ('ushr@32', a, b), ('ishl@32', a, ('iadd', 32, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'),
+ (('ior', ('ushr@32', a, b), ('ishl@32', a, ('isub', 32, b))), ('uror', a, b), '!options->lower_rotate'),
+ (('urol@16', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 16, b))), 'options->lower_rotate'),
+ (('urol@32', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 32, b))), 'options->lower_rotate'),
+ (('uror@16', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 16, b))), 'options->lower_rotate'),
+ (('uror@32', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 32, b))), 'options->lower_rotate'),
# Exponential/logarithmic identities
(('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a
(('~flog2', ('fexp2', a)), a), # lg2(2^a) = a
(('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b
(('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),
('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) + d) = a^b * c^d
+ (('~fexp2', ('fmul', ('flog2', a), 0.5)), ('fsqrt', a)),
(('~fexp2', ('fmul', ('flog2', a), 2.0)), ('fmul', a, a)),
(('~fexp2', ('fmul', ('flog2', a), 4.0)), ('fmul', ('fmul', a, a), ('fmul', a, a))),
(('~fpow', a, 1.0), a),
(('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))),
(('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))),
(('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))),
+ (('~fmul', ('fsqrt', a), ('fsqrt', a)), ('fabs',a)),
# Division and reciprocal
(('~fdiv', 1.0, a), ('frcp', a)),
(('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
(('~frcp', ('fsqrt', a)), ('frsq', a)),
(('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'),
(('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'),
+ # Trig
+ (('fsin', a), lowered_sincos(0.5), 'options->lower_sincos'),
+ (('fcos', a), lowered_sincos(0.75), 'options->lower_sincos'),
# Boolean simplifications
(('i2b32(is_used_by_if)', a), ('ine32', a, 0)),
(('i2b1(is_used_by_if)', a), ('ine', a, 0)),
(('bcsel', True, b, c), b),
(('bcsel', False, b, c), c),
(('bcsel', a, ('b2f(is_used_once)', 'b@32'), ('b2f', 'c@32')), ('b2f', ('bcsel', a, b, c))),
- # The result of this should be hit by constant propagation and, in the
- # next round of opt_algebraic, get picked up by one of the above two.
- (('bcsel', '#a', b, c), ('bcsel', ('ine', 'a', 0), b, c)),
(('bcsel', a, b, b), b),
- (('fcsel', a, b, b), b),
+ (('~fcsel', a, b, b), b),
# D3D Boolean emulation
(('bcsel', a, -1, 0), ('ineg', ('b2i', 'a@1'))),
(('ine', ('ineg', ('b2i', 'a@1')), 0), a),
(('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)),
(('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)),
+ (('iand', ('ineg', ('b2i', a)), 1), ('b2i', a)),
# SM5 32-bit shifts are defined to use the 5 least significant bits
(('ishl', 'a@32', ('iand', 31, b)), ('ishl', a, b)),
(('f2u', ('ftrunc', a)), ('f2u', a)),
(('i2b', ('ineg', a)), ('i2b', a)),
(('i2b', ('iabs', a)), ('i2b', a)),
- (('fabs', ('b2f', a)), ('b2f', a)),
- (('iabs', ('b2i', a)), ('b2i', a)),
(('inot', ('f2b1', a)), ('feq', a, 0.0)),
+ # The C spec says, "If the value of the integral part cannot be represented
+ # by the integer type, the behavior is undefined." "Undefined" can mean
+ # "the conversion doesn't happen at all."
+ (('~i2f32', ('f2i32', 'a@32')), ('ftrunc', a)),
+
# Ironically, mark these as imprecise because removing the conversions may
# preserve more precision than doing the conversions (e.g.,
# uint(float(0x81818181u)) == 0x81818200).
(('~f2u32', ('i2f', 'a@32')), a),
(('~f2u32', ('u2f', 'a@32')), a),
+ # Conversions from float16 to float32 and back can always be removed
+ (('f2f16', ('f2f32', 'a@16')), a),
+ (('f2fmp', ('f2f32', 'a@16')), a),
+ # Conversions to float16 would be lossy so they should only be removed if
+ # the instruction was generated by the precision lowering pass.
+ (('f2f32', ('f2fmp', 'a@32')), a),
+
+ (('ffloor', 'a(is_integral)'), a),
+ (('fceil', 'a(is_integral)'), a),
+ (('ftrunc', 'a(is_integral)'), a),
+ # fract(x) = x - floor(x), so fract(NaN) = NaN
+ (('~ffract', 'a(is_integral)'), 0.0),
+ (('fabs', 'a(is_not_negative)'), a),
+ (('iabs', 'a(is_not_negative)'), a),
+ (('fsat', 'a(is_not_positive)'), 0.0),
+
# Section 5.4.1 (Conversion and Scalar Constructors) of the GLSL 4.60 spec
# says:
#
(('ilt', ('f2u', a), b), ('ilt', ('f2i', a), b)),
(('ilt', b, ('f2u', a)), ('ilt', b, ('f2i', a))),
- (('~fmin', ('fabs', a), 1.0), ('fsat', ('fabs', a)), '!options->lower_fsat'),
+ (('~fmin', 'a(is_not_negative)', 1.0), ('fsat', a), '!options->lower_fsat'),
# The result of the multiply must be in [-1, 0], so the result of the ffma
# must be in [0, 1].
(('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)),
(('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)),
+ (('fne', 'a(is_not_zero)', 0.0), True),
+ (('feq', 'a(is_not_zero)', 0.0), False),
+
+ # In this chart, + means value > 0 and - means value < 0.
+ #
+ # + >= + -> unknown 0 >= + -> false - >= + -> false
+ # + >= 0 -> true 0 >= 0 -> true - >= 0 -> false
+ # + >= - -> true 0 >= - -> true - >= - -> unknown
+ #
+ # Using grouping conceptually similar to a Karnaugh map...
+ #
+ # (+ >= 0, + >= -, 0 >= 0, 0 >= -) == (is_not_negative >= is_not_positive) -> true
+ # (0 >= +, - >= +) == (is_not_positive >= gt_zero) -> false
+ # (- >= +, - >= 0) == (lt_zero >= is_not_negative) -> false
+ #
+ # The flt / ilt cases just invert the expected result.
+ #
+ # The results expecting true, must be marked imprecise. The results
+ # expecting false are fine because NaN compared >= or < anything is false.
+
+ (('~fge', 'a(is_not_negative)', 'b(is_not_positive)'), True),
+ (('fge', 'a(is_not_positive)', 'b(is_gt_zero)'), False),
+ (('fge', 'a(is_lt_zero)', 'b(is_not_negative)'), False),
+
+ (('flt', 'a(is_not_negative)', 'b(is_not_positive)'), False),
+ (('~flt', 'a(is_not_positive)', 'b(is_gt_zero)'), True),
+ (('~flt', 'a(is_lt_zero)', 'b(is_not_negative)'), True),
+
+ (('ine', 'a(is_not_zero)', 0), True),
+ (('ieq', 'a(is_not_zero)', 0), False),
+
+ (('ige', 'a(is_not_negative)', 'b(is_not_positive)'), True),
+ (('ige', 'a(is_not_positive)', 'b(is_gt_zero)'), False),
+ (('ige', 'a(is_lt_zero)', 'b(is_not_negative)'), False),
+
+ (('ilt', 'a(is_not_negative)', 'b(is_not_positive)'), False),
+ (('ilt', 'a(is_not_positive)', 'b(is_gt_zero)'), True),
+ (('ilt', 'a(is_lt_zero)', 'b(is_not_negative)'), True),
+
+ (('ult', 0, 'a(is_gt_zero)'), True),
+ (('ult', a, 0), False),
+
# Packing and then unpacking does nothing
(('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a),
(('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b),
(('pack_64_2x32_split', ('unpack_64_2x32_split_x', a),
('unpack_64_2x32_split_y', a)), a),
+ # Comparing two halves of an unpack separately. While this optimization
+ # should be correct for non-constant values, it's less obvious that it's
+ # useful in that case. For constant values, the pack will fold and we're
+ # guaranteed to reduce the whole tree to one instruction.
+ (('iand', ('ieq', ('unpack_32_2x16_split_x', a), '#b'),
+ ('ieq', ('unpack_32_2x16_split_y', a), '#c')),
+ ('ieq', a, ('pack_32_2x16_split', b, c))),
+
# Byte extraction
(('ushr', 'a@16', 8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
(('ushr', 'a@32', 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),
(('ishr', 'a@16', 8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
(('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'),
(('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'),
- (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte')
-]
+ (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'),
+
+ # Useless masking before unpacking
+ (('unpack_half_2x16_split_x', ('iand', a, 0xffff)), ('unpack_half_2x16_split_x', a)),
+ (('unpack_32_2x16_split_x', ('iand', a, 0xffff)), ('unpack_32_2x16_split_x', a)),
+ (('unpack_64_2x32_split_x', ('iand', a, 0xffffffff)), ('unpack_64_2x32_split_x', a)),
+ (('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)),
+ (('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)),
+ (('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)),
+
+ # Optimize half packing
+ (('ishl', ('pack_half_2x16', ('vec2', a, 0)), 16), ('pack_half_2x16', ('vec2', 0, a))),
+ (('ishr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))),
+
+ (('iadd', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
+ ('pack_half_2x16', ('vec2', a, b))),
+ (('ior', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
+ ('pack_half_2x16', ('vec2', a, b))),
+])
# After the ('extract_u8', a, 0) pattern, above, triggers, there will be
# patterns like those below.
(('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'),
# Subtracts
- (('~fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)),
- (('isub', a, ('isub', 0, b)), ('iadd', a, b)),
(('ussub_4x8', a, 0), a),
(('ussub_4x8', a, ~0), 0),
- (('fsub', a, b), ('fadd', a, ('fneg', b)), 'options->lower_sub'),
- (('isub', a, b), ('iadd', a, ('ineg', b)), 'options->lower_sub'),
- (('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'),
- (('ineg', a), ('isub', 0, a), 'options->lower_negate'),
- (('~fadd', a, ('fsub', 0.0, b)), ('fsub', a, b)),
- (('iadd', a, ('isub', 0, b)), ('isub', a, b)),
- (('fabs', ('fsub', 0.0, a)), ('fabs', a)),
- (('iabs', ('isub', 0, a)), ('iabs', a)),
+ # Lower all Subtractions first - they can get recombined later
+ (('fsub', a, b), ('fadd', a, ('fneg', b))),
+ (('isub', a, b), ('iadd', a, ('ineg', b))),
+ (('uabs_usub', a, b), ('bcsel', ('ult', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))),
+ # This is correct. We don't need isub_sat because the result type is unsigned, so it cannot overflow.
+ (('uabs_isub', a, b), ('bcsel', ('ilt', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))),
# Propagate negation up multiplication chains
(('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))),
(('~fadd', '#a', ('fadd', 'b(is_not_const)', '#c')), ('fadd', ('fadd', a, c), b)),
(('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))),
(('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)),
+ (('iand', '#a', ('iand', 'b(is_not_const)', '#c')), ('iand', ('iand', a, c), b)),
+ (('ior', '#a', ('ior', 'b(is_not_const)', '#c')), ('ior', ('ior', a, c), b)),
+ (('ixor', '#a', ('ixor', 'b(is_not_const)', '#c')), ('ixor', ('ixor', a, c), b)),
+
+ # Drop mul-div by the same value when there's no wrapping.
+ (('idiv', ('imul(no_signed_wrap)', a, b), b), a),
# By definition...
(('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)),
(('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)),
+ (('fmin3@64', a, b, c), ('fmin@64', a, ('fmin@64', b, c))),
+ (('fmax3@64', a, b, c), ('fmax@64', a, ('fmax@64', b, c))),
+ (('fmed3@64', a, b, c), ('fmax@64', ('fmin@64', ('fmax@64', a, b), c), ('fmin@64', a, b))),
+
# Misc. lowering
- (('fmod@16', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod16'),
- (('fmod@32', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod32'),
- (('fmod@64', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod64'),
- (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod32'),
+ (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
+ (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),
(('uadd_carry@32', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),
(('usub_borrow@32', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'),
(('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
- ('bcsel', ('ilt', 31, 'bits'), 'insert',
+ ('bcsel', ('ult', 31, 'bits'), 'insert',
('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')),
'options->lower_bitfield_insert'),
(('ihadd', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
(('uhadd', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
(('irhadd', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
(('urhadd', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
+ (('ihadd@64', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
+ (('uhadd@64', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
+ (('irhadd@64', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
+ (('urhadd@64', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
+
+ (('uadd_sat@64', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_add_sat || (options->lower_int64_options & nir_lower_iadd64) != 0'),
(('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_add_sat'),
(('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_add_sat'),
+ (('usub_sat@64', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_usub_sat64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
+
+ # int64_t sum = a + b;
+ #
+ # if (a < 0 && b < 0 && a < sum)
+ # sum = INT64_MIN;
+ # } else if (a >= 0 && b >= 0 && sum < a)
+ # sum = INT64_MAX;
+ # }
+ #
+ # A couple optimizations are applied.
+ #
+ # 1. a < sum => sum >= 0. This replacement works because it is known that
+ # a < 0 and b < 0, so sum should also be < 0 unless there was
+ # underflow.
+ #
+ # 2. sum < a => sum < 0. This replacement works because it is known that
+ # a >= 0 and b >= 0, so sum should also be >= 0 unless there was
+ # overflow.
+ #
+ # 3. Invert the second if-condition and swap the order of parameters for
+ # the bcsel. !(a >= 0 && b >= 0 && sum < 0) becomes !(a >= 0) || !(b >=
+ # 0) || !(sum < 0), and that becomes (a < 0) || (b < 0) || (sum >= 0)
+ #
+ # On Intel Gen11, this saves ~11 instructions.
+ (('iadd_sat@64', a, b), ('bcsel',
+ ('iand', ('iand', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),
+ 0x8000000000000000,
+ ('bcsel',
+ ('ior', ('ior', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),
+ ('iadd', a, b),
+ 0x7fffffffffffffff)),
+ '(options->lower_int64_options & nir_lower_iadd64) != 0'),
+
+ # int64_t sum = a - b;
+ #
+ # if (a < 0 && b >= 0 && a < sum)
+ # sum = INT64_MIN;
+ # } else if (a >= 0 && b < 0 && a >= sum)
+ # sum = INT64_MAX;
+ # }
+ #
+ # Optimizations similar to the iadd_sat case are applied here.
+ (('isub_sat@64', a, b), ('bcsel',
+ ('iand', ('iand', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),
+ 0x8000000000000000,
+ ('bcsel',
+ ('ior', ('ior', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),
+ ('isub', a, b),
+ 0x7fffffffffffffff)),
+ '(options->lower_int64_options & nir_lower_iadd64) != 0'),
+
+ # These are done here instead of in the backend because the int64 lowering
+ # pass will make a mess of the patterns. The first patterns are
+ # conditioned on nir_lower_minmax64 because it was not clear that it was
+ # always an improvement on platforms that have real int64 support. No
+ # shaders in shader-db hit this, so it was hard to say one way or the
+ # other.
+ (('ilt', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
+ (('ilt', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
+ (('ige', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
+ (('ige', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
+ (('ilt', 'a@64', 0), ('ilt', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
+ (('ige', 'a@64', 0), ('ige', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
+
+ (('ine', 'a@64', 0), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
+ (('ieq', 'a@64', 0), ('ieq', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
+ # 0u < uint(a) <=> uint(a) != 0u
+ (('ult', 0, 'a@64'), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
# Alternative lowering that doesn't rely on bfi.
(('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
- ('bcsel', ('ilt', 31, 'bits'),
+ ('bcsel', ('ult', 31, 'bits'),
'insert',
- ('ior',
- ('iand', 'base', ('inot', ('bfm', 'bits', 'offset'))),
- ('iand', ('ishl', 'insert', 'offset'), ('bfm', 'bits', 'offset')))),
+ (('ior',
+ ('iand', 'base', ('inot', ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))),
+ ('iand', ('ishl', 'insert', 'offset'), ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))))),
'options->lower_bitfield_insert_to_shifts'),
- # bfm lowering -- note that the NIR opcode is undefined if either arg is 32.
- (('bfm', 'bits', 'offset'),
- ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'),
- 'options->lower_bfm'),
+ # Alternative lowering that uses bitfield_select.
+ (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
+ ('bcsel', ('ult', 31, 'bits'), 'insert',
+ ('bitfield_select', ('bfm', 'bits', 'offset'), ('ishl', 'insert', 'offset'), 'base')),
+ 'options->lower_bitfield_insert_to_bitfield_select'),
(('ibitfield_extract', 'value', 'offset', 'bits'),
- ('bcsel', ('ilt', 31, 'bits'), 'value',
+ ('bcsel', ('ult', 31, 'bits'), 'value',
('ibfe', 'value', 'offset', 'bits')),
'options->lower_bitfield_extract'),
('ubfe', 'value', 'offset', 'bits')),
'options->lower_bitfield_extract'),
+ # Note that these opcodes are defined to only use the five least significant bits of 'offset' and 'bits'
+ (('ubfe', 'value', 'offset', ('iand', 31, 'bits')), ('ubfe', 'value', 'offset', 'bits')),
+ (('ubfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ubfe', 'value', 'offset', 'bits')),
+ (('ibfe', 'value', 'offset', ('iand', 31, 'bits')), ('ibfe', 'value', 'offset', 'bits')),
+ (('ibfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ibfe', 'value', 'offset', 'bits')),
+ (('bfm', 'bits', ('iand', 31, 'offset')), ('bfm', 'bits', 'offset')),
+ (('bfm', ('iand', 31, 'bits'), 'offset'), ('bfm', 'bits', 'offset')),
+
(('ibitfield_extract', 'value', 'offset', 'bits'),
('bcsel', ('ieq', 0, 'bits'),
0,
('ushr', 'value', 'offset'),
('bcsel', ('ieq', 'bits', 32),
0xffffffff,
- ('bfm', 'bits', 0))),
+ ('isub', ('ishl', 1, 'bits'), 1))),
'options->lower_bitfield_extract_to_shifts'),
(('ifind_msb', 'value'),
127.0))),
'options->lower_unpack_snorm_4x8'),
+ (('pack_half_2x16_split', 'a@32', 'b@32'),
+ ('ior', ('ishl', ('u2u32', ('f2f16', b)), 16), ('u2u32', ('f2f16', a))),
+ 'options->lower_pack_half_2x16_split'),
+
+ (('unpack_half_2x16_split_x', 'a@32'),
+ ('f2f32', ('u2u16', a)),
+ 'options->lower_unpack_half_2x16_split'),
+
+ (('unpack_half_2x16_split_y', 'a@32'),
+ ('f2f32', ('u2u16', ('ushr', a, 16))),
+ 'options->lower_unpack_half_2x16_split'),
+
(('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
(('fsign', a), ('fsub', ('b2f', ('flt', 0.0, a)), ('b2f', ('flt', a, 0.0))), 'options->lower_fsign'),
+
+ # Address/offset calculations:
+ # Drivers supporting imul24 should use the nir_lower_amul() pass, this
+ # rule converts everyone else to imul:
+ (('amul', a, b), ('imul', a, b), '!options->has_imul24'),
+
+ (('imad24_ir3', a, b, 0), ('imul24', a, b)),
+ (('imad24_ir3', a, 0, c), (c)),
+ (('imad24_ir3', a, 1, c), ('iadd', a, c)),
+
+ # if first two srcs are const, crack apart the imad so constant folding
+ # can clean up the imul:
+ # TODO ffma should probably get a similar rule:
+ (('imad24_ir3', '#a', '#b', c), ('iadd', ('imul', a, b), c)),
+
+ # These will turn 24b address/offset calc back into 32b shifts, but
+ # it should be safe to get back some of the bits of precision that we
+ # already decided were no necessary:
+ (('imul24', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
+ (('imul24', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
+ (('imul24', a, 0), (0)),
])
# bit_size dependent lowerings
('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_add_sat'),
]
-invert = OrderedDict([('feq', 'fne'), ('fne', 'feq'), ('fge', 'flt'), ('flt', 'fge')])
+invert = OrderedDict([('feq', 'fne'), ('fne', 'feq')])
for left, right in itertools.combinations_with_replacement(invert.keys(), 2):
optimizations.append((('inot', ('ior(is_used_once)', (left, a, b), (right, c, d))),
optimizations.append(((x2yN, (b2x, a)), (b2y, a)))
# Optimize away x2xN(a@N)
-for t in ['int', 'uint', 'float']:
+for t in ['int', 'uint', 'float', 'bool']:
for N in type_sizes(t):
x2xN = '{0}2{0}{1}'.format(t[0], N)
aN = 'a@{0}'.format(N)
# The N == M case is handled by other optimizations
pass
+# Downcast operations should be able to see through pack
+for t in ['i', 'u']:
+ for N in [8, 16, 32]:
+ x2xN = '{0}2{0}{1}'.format(t, N)
+ optimizations += [
+ ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)),
+ ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)),
+ ]
+
+# Optimize comparisons with up-casts
+for t in ['int', 'uint', 'float']:
+ for N, M in itertools.product(type_sizes(t), repeat=2):
+ if N == 1 or N >= M:
+ continue
+
+ x2xM = '{0}2{0}{1}'.format(t[0], M)
+ x2xN = '{0}2{0}{1}'.format(t[0], N)
+ aN = 'a@' + str(N)
+ bN = 'b@' + str(N)
+ xeq = 'feq' if t == 'float' else 'ieq'
+ xne = 'fne' if t == 'float' else 'ine'
+ xge = '{0}ge'.format(t[0])
+ xlt = '{0}lt'.format(t[0])
+
+ # Up-casts are lossless so for correctly signed comparisons of
+ # up-casted values we can do the comparison at the largest of the two
+ # original sizes and drop one or both of the casts. (We have
+ # optimizations to drop the no-op casts which this may generate.)
+ for P in type_sizes(t):
+ if P == 1 or P > N:
+ continue
+
+ bP = 'b@' + str(P)
+ optimizations += [
+ ((xeq, (x2xM, aN), (x2xM, bP)), (xeq, a, (x2xN, b))),
+ ((xne, (x2xM, aN), (x2xM, bP)), (xne, a, (x2xN, b))),
+ ((xge, (x2xM, aN), (x2xM, bP)), (xge, a, (x2xN, b))),
+ ((xlt, (x2xM, aN), (x2xM, bP)), (xlt, a, (x2xN, b))),
+ ((xge, (x2xM, bP), (x2xM, aN)), (xge, (x2xN, b), a)),
+ ((xlt, (x2xM, bP), (x2xM, aN)), (xlt, (x2xN, b), a)),
+ ]
+
+ # The next bit doesn't work on floats because the range checks would
+ # get way too complicated.
+ if t in ['int', 'uint']:
+ if t == 'int':
+ xN_min = -(1 << (N - 1))
+ xN_max = (1 << (N - 1)) - 1
+ elif t == 'uint':
+ xN_min = 0
+ xN_max = (1 << N) - 1
+ else:
+ assert False
+
+ # If we're up-casting and comparing to a constant, we can unfold
+ # the comparison into a comparison with the shrunk down constant
+ # and a check that the constant fits in the smaller bit size.
+ optimizations += [
+ ((xeq, (x2xM, aN), '#b'),
+ ('iand', (xeq, a, (x2xN, b)), (xeq, (x2xM, (x2xN, b)), b))),
+ ((xne, (x2xM, aN), '#b'),
+ ('ior', (xne, a, (x2xN, b)), (xne, (x2xM, (x2xN, b)), b))),
+ ((xlt, (x2xM, aN), '#b'),
+ ('iand', (xlt, xN_min, b),
+ ('ior', (xlt, xN_max, b), (xlt, a, (x2xN, b))))),
+ ((xlt, '#a', (x2xM, bN)),
+ ('iand', (xlt, a, xN_max),
+ ('ior', (xlt, a, xN_min), (xlt, (x2xN, a), b)))),
+ ((xge, (x2xM, aN), '#b'),
+ ('iand', (xge, xN_max, b),
+ ('ior', (xge, xN_min, b), (xge, a, (x2xN, b))))),
+ ((xge, '#a', (x2xM, bN)),
+ ('iand', (xge, a, xN_min),
+ ('ior', (xge, a, xN_max), (xge, (x2xN, a), b)))),
+ ]
+
def fexp2i(exp, bits):
- # We assume that exp is already in the right range.
+ # Generate an expression which constructs value 2.0^exp or 0.0.
+ #
+ # We assume that exp is already in a valid range:
+ #
+ # * [-15, 15] for 16-bit float
+ # * [-127, 127] for 32-bit float
+ # * [-1023, 1023] for 16-bit float
+ #
+ # If exp is the lowest value in the valid range, a value of 0.0 is
+ # constructed. Otherwise, the value 2.0^exp is constructed.
if bits == 16:
return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
elif bits == 32:
assert False
def ldexp(f, exp, bits):
- # First, we clamp exp to a reasonable range. The maximum possible range
- # for a normal exponent is [-126, 127] and, throwing in denormals, you get
- # a maximum range of [-149, 127]. This means that we can potentially have
- # a swing of +-276. If you start with FLT_MAX, you actually have to do
- # ldexp(FLT_MAX, -278) to get it to flush all the way to zero. The GLSL
- # spec, on the other hand, only requires that we handle an exponent value
- # in the range [-126, 128]. This implementation is *mostly* correct; it
- # handles a range on exp of [-252, 254] which allows you to create any
- # value (including denorms if the hardware supports it) and to adjust the
- # exponent of any normal value to anything you want.
+ # The maximum possible range for a normal exponent is [-126, 127] and,
+ # throwing in denormals, you get a maximum range of [-149, 127]. This
+ # means that we can potentially have a swing of +-276. If you start with
+ # FLT_MAX, you actually have to do ldexp(FLT_MAX, -278) to get it to flush
+ # all the way to zero. The GLSL spec only requires that we handle a subset
+ # of this range. From version 4.60 of the spec:
+ #
+ # "If exp is greater than +128 (single-precision) or +1024
+ # (double-precision), the value returned is undefined. If exp is less
+ # than -126 (single-precision) or -1022 (double-precision), the value
+ # returned may be flushed to zero. Additionally, splitting the value
+ # into a significand and exponent using frexp() and then reconstructing
+ # a floating-point value using ldexp() should yield the original input
+ # for zero and all finite non-denormalized values."
+ #
+ # The SPIR-V spec has similar language.
+ #
+ # In order to handle the maximum value +128 using the fexp2i() helper
+ # above, we have to split the exponent in half and do two multiply
+ # operations.
+ #
+ # First, we clamp exp to a reasonable range. Specifically, we clamp to
+ # twice the full range that is valid for the fexp2i() function above. If
+ # exp/2 is the bottom value of that range, the fexp2i() expression will
+ # yield 0.0f which, when multiplied by f, will flush it to zero which is
+ # allowed by the GLSL and SPIR-V specs for low exponent values. If the
+ # value is clamped from above, then it must have been above the supported
+ # range of the GLSL built-in and therefore any return value is acceptable.
if bits == 16:
- exp = ('imin', ('imax', exp, -28), 30)
+ exp = ('imin', ('imax', exp, -30), 30)
elif bits == 32:
- exp = ('imin', ('imax', exp, -252), 254)
+ exp = ('imin', ('imax', exp, -254), 254)
elif bits == 64:
- exp = ('imin', ('imax', exp, -2044), 2046)
+ exp = ('imin', ('imax', exp, -2046), 2046)
else:
assert False
step2 = ('ior', ('ishl', ('iand', step1, 0x00ff00ff), 8), ('ushr', ('iand', step1, 0xff00ff00), 8))
step3 = ('ior', ('ishl', ('iand', step2, 0x0f0f0f0f), 4), ('ushr', ('iand', step2, 0xf0f0f0f0), 4))
step4 = ('ior', ('ishl', ('iand', step3, 0x33333333), 2), ('ushr', ('iand', step3, 0xcccccccc), 2))
- step5 = ('ior', ('ishl', ('iand', step4, 0x55555555), 1), ('ushr', ('iand', step4, 0xaaaaaaaa), 1))
+ step5 = ('ior(many-comm-expr)', ('ishl', ('iand', step4, 0x55555555), 1), ('ushr', ('iand', step4, 0xaaaaaaaa), 1))
return step5
-optimizations += [(bitfield_reverse('x@32'), ('bitfield_reverse', 'x'))]
+optimizations += [(bitfield_reverse('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')]
# For any float comparison operation, "cmp", if you have "a == a && a cmp b"
# then the "a == a" is redundant because it's equivalent to "a is not NaN"
# and, if a is a NaN then the second comparison will fail anyway.
for op in ['flt', 'fge', 'feq']:
optimizations += [
- (('iand', ('feq', a, a), (op, a, b)), (op, a, b)),
- (('iand', ('feq', a, a), (op, b, a)), (op, b, a)),
+ (('iand', ('feq', a, a), (op, a, b)), ('!' + op, a, b)),
+ (('iand', ('feq', a, a), (op, b, a)), ('!' + op, b, a)),
]
# Add optimizations to handle the case where the result of a ternary is
((op, 'a'), 0.0, 'info->stage == MESA_SHADER_COMPUTE && info->cs.derivative_group == DERIVATIVE_GROUP_NONE')
]
+# Some optimizations for ir3-specific instructions.
+optimizations += [
+ # 'al * bl': If either 'al' or 'bl' is zero, return zero.
+ (('umul_low', '#a(is_lower_half_zero)', 'b'), (0)),
+ # '(ah * bl) << 16 + c': If either 'ah' or 'bl' is zero, return 'c'.
+ (('imadsh_mix16', '#a@32(is_lower_half_zero)', 'b@32', 'c@32'), ('c')),
+ (('imadsh_mix16', 'a@32', '#b@32(is_upper_half_zero)', 'c@32'), ('c')),
+]
+
+# These kinds of sequences can occur after nir_opt_peephole_select.
+#
+# NOTE: fadd is not handled here because that gets in the way of ffma
+# generation in the i965 driver. Instead, fadd and ffma are handled in
+# late_optimizations.
+
+for op in ['flrp']:
+ optimizations += [
+ (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
+ (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),
+ (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),
+ (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
+ (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, e, c, d)), (op, ('bcsel', a, b, e), c, d)),
+ (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', e, c, d)), (op, ('bcsel', a, b, e), c, d)),
+ ]
+
+for op in ['fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor', 'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax']:
+ optimizations += [
+ (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
+ (('bcsel', a, (op + '(is_used_once)', b, 'c(is_not_const)'), (op, b, d)), (op, b, ('bcsel', a, c, d))),
+ (('bcsel', a, (op, b, 'c(is_not_const)'), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
+ (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
+ ]
+
+for op in ['fpow']:
+ optimizations += [
+ (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),
+ (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
+ (('bcsel', a, (op + '(is_used_once)', b, c), (op, d, c)), (op, ('bcsel', a, b, d), c)),
+ (('bcsel', a, (op, b, c), (op + '(is_used_once)', d, c)), (op, ('bcsel', a, b, d), c)),
+ ]
+
+for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos']:
+ optimizations += [
+ (('bcsel', a, (op + '(is_used_once)', b), (op, c)), (op, ('bcsel', a, b, c))),
+ (('bcsel', a, (op, b), (op + '(is_used_once)', c)), (op, ('bcsel', a, b, c))),
+ ]
+
# This section contains "late" optimizations that should be run before
# creating ffmas and calling regular optimizations for the final time.
# Optimizations should go here if they help code generation and conflict
# optimization loop can prevent other optimizations.
(('fneg', ('fneg', a)), a),
+ # Subtractions get lowered during optimization, so we need to recombine them
+ (('fadd', 'a', ('fneg', 'b')), ('fsub', 'a', 'b'), '!options->lower_sub'),
+ (('iadd', 'a', ('ineg', 'b')), ('isub', 'a', 'b'), '!options->lower_sub'),
+ (('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'),
+ (('ineg', a), ('isub', 0, a), 'options->lower_negate'),
+
+ # These are duplicated from the main optimizations table. The late
+ # patterns that rearrange expressions like x - .5 < 0 to x < .5 can create
+ # new patterns like these. The patterns that compare with zero are removed
+ # because they are unlikely to be created in by anything in
+ # late_optimizations.
+ (('flt', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('flt', a, b)),
+ (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),
+ (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),
+ (('fge', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('fge', b, a)),
+ (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),
+ (('fne', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fne', a, b)),
+
+ (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),
+ (('flt', ('fsat(is_used_once)', a), 1.0), ('flt', a, 1.0)),
+
(('~fge', ('fmin(is_used_once)', ('fadd(is_used_once)', a, b), ('fadd', c, d)), 0.0), ('iand', ('fge', a, ('fneg', b)), ('fge', c, ('fneg', d)))),
(('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
(('ior', a, a), a),
(('iand', a, a), a),
+ (('iand', ('ine(is_used_once)', 'a@32', 0), ('ine', 'b@32', 0)), ('ine', ('umin', a, b), 0)),
+ (('ior', ('ieq(is_used_once)', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('umin', a, b), 0)),
+
+ (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),
+
(('fdot2', a, b), ('fdot_replicated2', a, b), 'options->fdot_replicates'),
(('fdot3', a, b), ('fdot_replicated3', a, b), 'options->fdot_replicates'),
(('fdot4', a, b), ('fdot_replicated4', a, b), 'options->fdot_replicates'),
(('~fadd@32', 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp32'),
(('~fadd@64', 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp64'),
+ # A similar operation could apply to any ffma(#a, b, #(-a/2)), but this
+ # particular operation is common for expanding values stored in a texture
+ # from [0,1] to [-1,1].
+ (('~ffma@32', a, 2.0, -1.0), ('flrp', -1.0, 1.0, a ), '!options->lower_flrp32'),
+ (('~ffma@32', a, -2.0, -1.0), ('flrp', -1.0, 1.0, ('fneg', a)), '!options->lower_flrp32'),
+ (('~ffma@32', a, -2.0, 1.0), ('flrp', 1.0, -1.0, a ), '!options->lower_flrp32'),
+ (('~ffma@32', a, 2.0, 1.0), ('flrp', 1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),
+ (('~fadd@32', ('fmul(is_used_once)', 2.0, a), -1.0), ('flrp', -1.0, 1.0, a ), '!options->lower_flrp32'),
+ (('~fadd@32', ('fmul(is_used_once)', -2.0, a), -1.0), ('flrp', -1.0, 1.0, ('fneg', a)), '!options->lower_flrp32'),
+ (('~fadd@32', ('fmul(is_used_once)', -2.0, a), 1.0), ('flrp', 1.0, -1.0, a ), '!options->lower_flrp32'),
+ (('~fadd@32', ('fmul(is_used_once)', 2.0, a), 1.0), ('flrp', 1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),
+
+ # flrp(a, b, a)
+ # a*(1-a) + b*a
+ # a + -a*a + a*b (1)
+ # a + a*(b - a)
+ # Option 1: ffma(a, (b-a), a)
+ #
+ # Alternately, after (1):
+ # a*(1+b) + -a*a
+ # a*((1+b) + -a)
+ #
+ # Let b=1
+ #
+ # Option 2: ffma(a, 2, -(a*a))
+ # Option 3: ffma(a, 2, (-a)*a)
+ # Option 4: ffma(a, -a, (2*a)
+ # Option 5: a * (2 - a)
+ #
+ # There are a lot of other possible combinations.
+ (('~ffma@32', ('fadd', b, ('fneg', a)), a, a), ('flrp', a, b, a), '!options->lower_flrp32'),
+ (('~ffma@32', a, 2.0, ('fneg', ('fmul', a, a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
+ (('~ffma@32', a, 2.0, ('fmul', ('fneg', a), a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
+ (('~ffma@32', a, ('fneg', a), ('fmul', 2.0, a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
+ (('~fmul@32', a, ('fadd', 2.0, ('fneg', a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
+
# we do these late so that we don't get in the way of creating ffmas
(('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))),
(('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))),
(('bcsel', a, 0, ('b2f32', ('inot', 'b@bool'))), ('b2f32', ('inot', ('ior', a, b)))),
+
+ # Putting this in 'optimizations' interferes with the bcsel(a, op(b, c),
+ # op(b, d)) => op(b, bcsel(a, c, d)) transformations. I do not know why.
+ (('bcsel', ('feq', ('fsqrt', 'a(is_not_negative)'), 0.0), intBitsToFloat(0x7f7fffff), ('frsq', a)),
+ ('fmin', ('frsq', a), intBitsToFloat(0x7f7fffff))),
+
+ # Things that look like DPH in the source shader may get expanded to
+ # something that looks like dot(v1.xyz, v2.xyz) + v1.w by the time it gets
+ # to NIR. After FFMA is generated, this can look like:
+ #
+ # fadd(ffma(v1.z, v2.z, ffma(v1.y, v2.y, fmul(v1.x, v2.x))), v1.w)
+ #
+ # Reassociate the last addition into the first multiplication.
+ #
+ # Some shaders do not use 'invariant' in vertex and (possibly) geometry
+ # shader stages on some outputs that are intended to be invariant. For
+ # various reasons, this optimization may not be fully applied in all
+ # shaders used for different rendering passes of the same geometry. This
+ # can result in Z-fighting artifacts (at best). For now, disable this
+ # optimization in these stages. See bugzilla #111490. In tessellation
+ # stages applications seem to use 'precise' when necessary, so allow the
+ # optimization in those stages.
+ (('~fadd', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'),
+ ('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
+ (('~fadd', ('ffma(is_used_once)', a, b, ('fmul', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),
+ ('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
+
+ # Convert f2fmp instructions to concrete f2f16 instructions. At this point
+ # any conversions that could have been removed will have been removed in
+ # nir_opt_algebraic so any remaining ones are required.
+ (('f2fmp', a), ('f2f16', a)),
+]
+
+for op in ['fadd']:
+ late_optimizations += [
+ (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),
+ (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
+ ]
+
+for op in ['ffma']:
+ late_optimizations += [
+ (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
+ (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),
+
+ (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),
+ (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
+ ]
+
+distribute_src_mods = [
+ # Try to remove some spurious negations rather than pushing them down.
+ (('fmul', ('fneg', a), ('fneg', b)), ('fmul', a, b)),
+ (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),
+ (('fdot_replicated2', ('fneg', a), ('fneg', b)), ('fdot_replicated2', a, b)),
+ (('fdot_replicated3', ('fneg', a), ('fneg', b)), ('fdot_replicated3', a, b)),
+ (('fdot_replicated4', ('fneg', a), ('fneg', b)), ('fdot_replicated4', a, b)),
+ (('fneg', ('fneg', a)), a),
+
+ (('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),
+ (('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)),
+ (('fneg', ('fadd(is_used_once)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),
+
+ # Note that fmin <-> fmax. I don't think there is a way to distribute
+ # fabs() into fmin or fmax.
+ (('fneg', ('fmin(is_used_once)', a, b)), ('fmax', ('fneg', a), ('fneg', b))),
+ (('fneg', ('fmax(is_used_once)', a, b)), ('fmin', ('fneg', a), ('fneg', b))),
+
+ # fdph works mostly like fdot, but to get the correct result, the negation
+ # must be applied to the second source.
+ (('fneg', ('fdph_replicated(is_used_once)', a, b)), ('fdph_replicated', a, ('fneg', b))),
+ (('fabs', ('fdph_replicated(is_used_once)', a, b)), ('fdph_replicated', ('fabs', a), ('fabs', b))),
+
+ (('fneg', ('fsign(is_used_once)', a)), ('fsign', ('fneg', a))),
+ (('fabs', ('fsign(is_used_once)', a)), ('fsign', ('fabs', a))),
]
+for op in ['fmul', 'fdot_replicated2', 'fdot_replicated3', 'fdot_replicated4']:
+ distribute_src_mods.extend([
+ (('fneg', (op + '(is_used_once)', a, b)), (op, ('fneg', a), b)),
+ (('fabs', (op + '(is_used_once)', a, b)), (op, ('fabs', a), ('fabs', b))),
+ ])
+
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma",
before_ffma_optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late",
late_optimizations).render())
+print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_distribute_src_mods",
+ distribute_src_mods).render())