import nir_algebraic
from nir_opcodes import type_sizes
import itertools
+import struct
+from math import pi
# Convenience variables
a = 'a'
# however, be used for backend-requested lowering operations as those need to
# happen regardless of precision.
#
-# Variable names are specified as "[#]name[@type][(cond)]" where "#" inicates
-# that the given variable will only match constants and the type indicates that
-# the given variable will only match values from ALU instructions with the
-# given output type, and (cond) specifies an additional condition function
-# (see nir_search_helpers.h).
+# Variable names are specified as "[#]name[@type][(cond)][.swiz]" where:
+# "#" indicates that the given variable will only match constants,
+# type indicates that the given variable will only match values from ALU
+# instructions with the given output type,
+# (cond) specifies an additional condition function (see nir_search_helpers.h),
+# swiz is a swizzle applied to the variable (only in the <replace> expression)
#
# For constants, you have to be careful to make sure that it is the right
# type because python is unaware of the source and destination types of the
# expression this indicates that the constructed value should have that
# bit-size.
#
+# If the opcode in a replacement expression is prefixed by a '!' character,
+# this indicates that the new expression will be marked exact.
+#
# A special condition "many-comm-expr" can be used with expressions to note
# that the expression and its subexpressions have more commutative expressions
# than nir_replace_instr can handle. If this special condition is needed with
# another condition, the two can be separated by a comma (e.g.,
# "(many-comm-expr,is_used_once)").
+# based on https://web.archive.org/web/20180105155939/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
+def lowered_sincos(c):
+    # Build a NIR search-expression tree approximating sin or cos with the
+    # fast parabolic approximation from the devmaster.net article cited above.
+    # The phase constant c selects the function: 0.5 lowers fsin and 0.75
+    # lowers fcos (see the 'options->lower_sincos' rules below).
+    # Map the angle into [-1, 1): x = 2 * fract(a / (2*pi) + c) - 1.
+    x = ('fsub', ('fmul', 2.0, ('ffract', ('fadd', ('fmul', 0.5 / pi, a), c))), 1.0)
+    # First parabolic approximation: x = 4 * (x - x * |x|).
+    x = ('fmul', ('fsub', x, ('fmul', x, ('fabs', x))), 4.0)
+    # Precision refinement: blend in (x*|x| - x) with weight 0.225.
+    return ('ffma', ('ffma', x, ('fabs', x), ('fneg', x)), 0.225, x)
+
+def intBitsToFloat(i):
+    """Reinterpret the bits of a 32-bit unsigned integer as an IEEE-754 float.
+
+    struct format: '!' fixes a single (big-endian) byte order for both
+    conversions, 'I' is uint32 and 'f' is float32, so this is a pure
+    bit-for-bit reinterpretation with no numeric conversion.
+    """
+    return struct.unpack('!f', struct.pack('!I', i))[0]
+
optimizations = [
- (('imul', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitshift'),
- (('imul', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitshift'),
- (('ishl', a, '#b@32'), ('imul', a, ('ishl', 1, b)), 'options->lower_bitshift'),
+ (('imul', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
+ (('imul', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
+ (('ishl', a, '#b@32'), ('imul', a, ('ishl', 1, b)), 'options->lower_bitops'),
(('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
(('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
(('idiv', a, 1), a),
(('umod', a, 1), 0),
(('imod', a, 1), 0),
- (('udiv', a, '#b@32(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b)), '!options->lower_bitshift'),
+ (('udiv', a, '#b@32(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b)), '!options->lower_bitops'),
(('idiv', a, '#b@32(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), 'options->lower_idiv'),
(('idiv', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), 'options->lower_idiv'),
(('umod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1))),
- (('fneg', ('fneg', a)), a),
+ (('~fneg', ('fneg', a)), a),
(('ineg', ('ineg', a)), a),
(('fabs', ('fabs', a)), ('fabs', a)),
(('fabs', ('fneg', a)), ('fabs', a)),
(('imul', a, 0), 0),
(('umul_unorm_4x8', a, 0), 0),
(('umul_unorm_4x8', a, ~0), a),
- (('fmul', a, 1.0), a),
+ (('~fmul', a, 1.0), a),
(('imul', a, 1), a),
(('fmul', a, -1.0), ('fneg', a)),
(('imul', a, -1), ('ineg', a)),
(('~fmul', ('fadd', ('iand', ('ineg', ('b2i32', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),
- (('fdot4', ('vec4', a, b, c, 1.0), d), ('fdph', ('vec3', a, b, c), d)),
+ (('fdph', a, b), ('fdot4', ('vec4', 'a.x', 'a.y', 'a.z', 1.0), b), 'options->lower_fdph'),
+
+ (('fdot4', ('vec4', a, b, c, 1.0), d), ('fdph', ('vec3', a, b, c), d), '!options->lower_fdph'),
(('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),
(('fdot4', ('vec4', a, b, 0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),
(('fdot4', ('vec4', a, b, c, 0.0), d), ('fdot3', ('vec3', a, b, c), d)),
(('fdot3', ('vec3', a, 0.0, 0.0), b), ('fmul', a, b)),
(('fdot3', ('vec3', a, b, 0.0), c), ('fdot2', ('vec2', a, b), c)),
+ (('fdot2', ('vec2', a, 0.0), b), ('fmul', a, b)),
+ (('fdot2', a, 1.0), ('fadd', 'a.x', 'a.y')),
+
+ # Lower fdot to fsum when it is available
+ (('fdot2', a, b), ('fsum2', ('fmul', a, b)), 'options->lower_fdot'),
+ (('fdot3', a, b), ('fsum3', ('fmul', a, b)), 'options->lower_fdot'),
+ (('fdot4', a, b), ('fsum4', ('fmul', a, b)), 'options->lower_fdot'),
+ (('fsum2', a), ('fadd', 'a.x', 'a.y'), 'options->lower_fdot'),
+
# If x >= 0 and x <= 1: fsat(1 - x) == 1 - fsat(x) trivially
# If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1
# If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0
# (a * #b) << #c
# a * (#b << #c)
(('ishl', ('imul', a, '#b'), '#c'), ('imul', a, ('ishl', b, c))),
+]
+
+# Care must be taken here. Shifts in NIR uses only the lower log2(bitsize)
+# bits of the second source. These replacements must correctly handle the
+# case where (b % bitsize) + (c % bitsize) >= bitsize.
+for s in [8, 16, 32, 64]:
+ mask = (1 << s) - 1
+
+ ishl = "ishl@{}".format(s)
+ ishr = "ishr@{}".format(s)
+ ushr = "ushr@{}".format(s)
+
+ in_bounds = ('ult', ('iadd', ('iand', b, mask), ('iand', c, mask)), s)
+
+ optimizations.extend([
+ ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds, (ishl, a, ('iadd', b, c)), 0)),
+ ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds, (ushr, a, ('iadd', b, c)), 0)),
+
+    # To get -1 for large shifts of negative values, ishr must instead
+ # clamp the shift count to the maximum value.
+ ((ishr, (ishr, a, '#b'), '#c'),
+ (ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))),
+ ])
+
+# Optimize a pattern of address calculation created by DXVK where the offset is
+# divided by 4 and then multiplied by 4. This can be turned into an iand and the
+# additions before can be reassociated to CSE the iand instruction.
+for log2 in range(1, 7): # powers of two from 2 to 64
+ v = 1 << log2
+ mask = 0xffffffff & ~(v - 1)
+ b_is_multiple = '#b(is_unsigned_multiple_of_{})'.format(v)
+
+ optimizations.extend([
+ # 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)'
+ (('ishl@32', ('ushr@32', a, log2), log2), ('iand', a, mask)),
+
+ # Reassociate for improved CSE
+ (('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)),
+ ])
+
+optimizations.extend([
+ # This is common for address calculations. Reassociating may enable the
+ # 'a<<c' to be CSE'd. It also helps architectures that have an ISHLADD
+ # instruction or a constant offset field for in load / store instructions.
+ (('ishl', ('iadd', a, '#b'), '#c'), ('iadd', ('ishl', a, c), ('ishl', b, c))),
# Comparison simplifications
(('~inot', ('flt', a, b)), ('fge', a, b)),
(('~fge', ('fadd', a, b), a), ('fge', b, 0.0)),
(('~feq', ('fadd', a, b), a), ('feq', b, 0.0)),
(('~fne', ('fadd', a, b), a), ('fne', b, 0.0)),
+ (('~flt', ('fadd(is_used_once)', a, '#b'), '#c'), ('flt', a, ('fadd', c, ('fneg', b)))),
+ (('~flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('flt', ('fneg', ('fadd', c, b)), a)),
+ (('~fge', ('fadd(is_used_once)', a, '#b'), '#c'), ('fge', a, ('fadd', c, ('fneg', b)))),
+ (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fge', ('fneg', ('fadd', c, b)), a)),
+ (('~feq', ('fadd(is_used_once)', a, '#b'), '#c'), ('feq', a, ('fadd', c, ('fneg', b)))),
+ (('~feq', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('feq', ('fneg', ('fadd', c, b)), a)),
+ (('~fne', ('fadd(is_used_once)', a, '#b'), '#c'), ('fne', a, ('fadd', c, ('fneg', b)))),
+ (('~fne', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fne', ('fneg', ('fadd', c, b)), a)),
# Cannot remove the addition from ilt or ige due to overflow.
(('ieq', ('iadd', a, b), a), ('ieq', b, 0)),
(('feq', ('fmin', ('fneg', ('b2f', 'a@1')), 'b@1'), 0.0), ('iand', ('inot', a), ('fge', b, 0.0))),
(('feq', ('b2f', 'a@1'), 0.0), ('inot', a)),
- (('fne', ('b2f', 'a@1'), 0.0), a),
+ (('~fne', ('b2f', 'a@1'), 0.0), a),
(('ieq', ('b2i', 'a@1'), 0), ('inot', a)),
(('ine', ('b2i', 'a@1'), 0), a),
# 0.0 >= fabs(a)
(('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
+ # (a >= 0.0) && (a <= 1.0) -> fsat(a) == a
+ (('iand', ('fge', a, 0.0), ('fge', 1.0, a)), ('feq', a, ('fsat', a)), '!options->lower_fsat'),
+
+ # (a < 0.0) || (a > 1.0)
+ # !(!(a < 0.0) && !(a > 1.0))
+ # !((a >= 0.0) && (a <= 1.0))
+ # !(a == fsat(a))
+ # a != fsat(a)
+ (('ior', ('flt', a, 0.0), ('flt', 1.0, a)), ('fne', a, ('fsat', a)), '!options->lower_fsat'),
+
(('fmax', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('ior', a, b))),
(('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('ior', a, b)))),
(('fmin', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))),
(('bcsel', a, a, b), ('ior', a, b)),
(('bcsel', a, b, False), ('iand', a, b)),
(('bcsel', a, b, a), ('iand', a, b)),
- (('fmin', a, a), a),
- (('fmax', a, a), a),
+ (('~fmin', a, a), a),
+ (('~fmax', a, a), a),
(('imin', a, a), a),
(('imax', a, a), a),
(('umin', a, a), a),
(('fmin', ('fmin', a, b), b), ('fmin', a, b)),
(('umin', ('umin', a, b), b), ('umin', a, b)),
(('imin', ('imin', a, b), b), ('imin', a, b)),
- (('fmax', a, ('fneg', a)), ('fabs', a)),
- (('imax', a, ('ineg', a)), ('iabs', a)),
+ (('iand@32', a, ('inot', ('ishr', a, 31))), ('imax', a, 0)),
(('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))),
(('imin', a, ('ineg', a)), ('ineg', ('iabs', a))),
(('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))),
(('imin', a, ('ineg', ('iabs', a))), ('ineg', ('iabs', a))),
- (('fmin', a, ('fabs', a)), a),
+ (('~fmin', a, ('fabs', a)), a),
(('imin', a, ('iabs', a)), a),
- (('fmax', a, ('fneg', ('fabs', a))), a),
+ (('~fmax', a, ('fneg', ('fabs', a))), a),
(('imax', a, ('ineg', ('iabs', a))), a),
(('fmax', a, ('fabs', a)), ('fabs', a)),
(('imax', a, ('iabs', a)), ('iabs', a)),
(('~fmax', ('fabs', a), 0.0), ('fabs', a)),
(('~fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
(('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
- (('~fmin', ('fmax', a, -1.0), 0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_negate && !options->lower_fsat'),
- (('~fmax', ('fmin', a, 0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_negate && !options->lower_fsat'),
+ (('~fmin', ('fmax', a, -1.0), 0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
+ (('~fmax', ('fmin', a, 0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
(('fsat', ('fsign', a)), ('b2f', ('flt', 0.0, a))),
(('fsat', ('b2f', a)), ('b2f', a)),
(('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
(('fsat', ('fsat', a)), ('fsat', a)),
- (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_negate && !options->lower_fsat'),
- (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_negate && !options->lower_fsat'),
+ (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_fsat'),
+ (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_fsat'),
(('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'),
(('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),
(('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),
(('iand', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umax', b, c))),
(('iand', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umin', a, b), c)),
+ # These derive from the previous patterns with the application of b < 0 <=>
+ # 0 < -b. The transformation should be applied if either comparison is
+ # used once as this ensures that the number of comparisons will not
+ # increase. The sources to the ior and iand are not symmetric, so the
+ # rules have to be duplicated to get this behavior.
+ (('~ior', ('flt(is_used_once)', 0.0, 'a@32'), ('flt', 'b@32', 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
+ (('~ior', ('flt', 0.0, 'a@32'), ('flt(is_used_once)', 'b@32', 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
+ (('~ior', ('fge(is_used_once)', 0.0, 'a@32'), ('fge', 'b@32', 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
+ (('~ior', ('fge', 0.0, 'a@32'), ('fge(is_used_once)', 'b@32', 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
+ (('~iand', ('flt(is_used_once)', 0.0, 'a@32'), ('flt', 'b@32', 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
+ (('~iand', ('flt', 0.0, 'a@32'), ('flt(is_used_once)', 'b@32', 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
+ (('~iand', ('fge(is_used_once)', 0.0, 'a@32'), ('fge', 'b@32', 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),
+ (('~iand', ('fge', 0.0, 'a@32'), ('fge(is_used_once)', 'b@32', 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),
+
# Common pattern like 'if (i == 0 || i == 1 || ...)'
(('ior', ('ieq', a, 0), ('ieq', a, 1)), ('uge', 1, a)),
(('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)),
(('ine', ('ineg', ('b2i32', 'a@1')), ('ineg', ('b2i32', 'b@1'))), ('ine', a, b)),
(('b2i32', ('ine', 'a@1', 'b@1')), ('b2i32', ('ixor', a, b))),
- (('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('ior', 'a@32', 'b@32'), 0)),
+ (('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('ior', 'a@32', 'b@32'), 0), '!options->lower_bitops'),
# These patterns can result when (a < b || a < c) => (a < min(b, c))
# transformations occur before constant propagation and loop-unrolling.
(('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'),
(('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'),
(('sne', a, b), ('b2f', ('fne', a, b)), 'options->lower_scmp'),
+ (('seq', ('seq', a, b), 1.0), ('seq', a, b)),
+ (('seq', ('sne', a, b), 1.0), ('sne', a, b)),
+ (('seq', ('slt', a, b), 1.0), ('slt', a, b)),
+ (('seq', ('sge', a, b), 1.0), ('sge', a, b)),
+ (('sne', ('seq', a, b), 0.0), ('seq', a, b)),
+ (('sne', ('sne', a, b), 0.0), ('sne', a, b)),
+ (('sne', ('slt', a, b), 0.0), ('slt', a, b)),
+ (('sne', ('sge', a, b), 0.0), ('sge', a, b)),
+ (('seq', ('seq', a, b), 0.0), ('sne', a, b)),
+ (('seq', ('sne', a, b), 0.0), ('seq', a, b)),
+ (('seq', ('slt', a, b), 0.0), ('sge', a, b)),
+ (('seq', ('sge', a, b), 0.0), ('slt', a, b)),
+ (('sne', ('seq', a, b), 1.0), ('sne', a, b)),
+ (('sne', ('sne', a, b), 1.0), ('seq', a, b)),
+ (('sne', ('slt', a, b), 1.0), ('sge', a, b)),
+ (('sne', ('sge', a, b), 1.0), ('slt', a, b)),
+ (('fall_equal2', a, b), ('fmin', ('seq', 'a.x', 'b.x'), ('seq', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
+ (('fall_equal3', a, b), ('seq', ('fany_nequal3', a, b), 0.0), 'options->lower_vector_cmp'),
+ (('fall_equal4', a, b), ('seq', ('fany_nequal4', a, b), 0.0), 'options->lower_vector_cmp'),
+ (('fany_nequal2', a, b), ('fmax', ('sne', 'a.x', 'b.x'), ('sne', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
+ (('fany_nequal3', a, b), ('fsat', ('fdot3', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
+ (('fany_nequal4', a, b), ('fsat', ('fdot4', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
(('fne', ('fneg', a), a), ('fne', a, 0.0)),
(('feq', ('fneg', a), a), ('feq', a, 0.0)),
# Emulating booleans
# True/False are ~0 and 0 in NIR. b2i of True is 1, and -1 is ~0 (True).
(('ineg', ('b2i32', 'a@32')), a),
(('flt', ('fneg', ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF.
- (('flt', ('fsub', 0.0, ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF.
# Comparison with the same args. Note that these are not done for
# the float versions because NaN always returns false on float
# inequalities.
(('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))),
(('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))),
(('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))),
+ (('~fmul', ('fsqrt', a), ('fsqrt', a)), ('fabs',a)),
# Division and reciprocal
(('~fdiv', 1.0, a), ('frcp', a)),
(('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
(('~frcp', ('fsqrt', a)), ('frsq', a)),
(('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'),
(('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'),
+ # Trig
+ (('fsin', a), lowered_sincos(0.5), 'options->lower_sincos'),
+ (('fcos', a), lowered_sincos(0.75), 'options->lower_sincos'),
# Boolean simplifications
(('i2b32(is_used_by_if)', a), ('ine32', a, 0)),
(('i2b1(is_used_by_if)', a), ('ine', a, 0)),
(('bcsel', a, ('b2f(is_used_once)', 'b@32'), ('b2f', 'c@32')), ('b2f', ('bcsel', a, b, c))),
(('bcsel', a, b, b), b),
- (('fcsel', a, b, b), b),
+ (('~fcsel', a, b, b), b),
# D3D Boolean emulation
(('bcsel', a, -1, 0), ('ineg', ('b2i', 'a@1'))),
(('ine', ('ineg', ('b2i', 'a@1')), 0), a),
(('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)),
(('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)),
+ (('iand', ('ineg', ('b2i', a)), 1), ('b2i', a)),
# SM5 32-bit shifts are defined to use the 5 least significant bits
(('ishl', 'a@32', ('iand', 31, b)), ('ishl', a, b)),
(('f2u', ('ftrunc', a)), ('f2u', a)),
(('i2b', ('ineg', a)), ('i2b', a)),
(('i2b', ('iabs', a)), ('i2b', a)),
- (('fabs', ('b2f', a)), ('b2f', a)),
- (('iabs', ('b2i', a)), ('b2i', a)),
(('inot', ('f2b1', a)), ('feq', a, 0.0)),
+ # The C spec says, "If the value of the integral part cannot be represented
+ # by the integer type, the behavior is undefined." "Undefined" can mean
+ # "the conversion doesn't happen at all."
+ (('~i2f32', ('f2i32', 'a@32')), ('ftrunc', a)),
+
# Ironically, mark these as imprecise because removing the conversions may
# preserve more precision than doing the conversions (e.g.,
# uint(float(0x81818181u)) == 0x81818200).
(('~f2u32', ('i2f', 'a@32')), a),
(('~f2u32', ('u2f', 'a@32')), a),
+ (('ffloor', 'a(is_integral)'), a),
+ (('fceil', 'a(is_integral)'), a),
+ (('ftrunc', 'a(is_integral)'), a),
+ # fract(x) = x - floor(x), so fract(NaN) = NaN
+ (('~ffract', 'a(is_integral)'), 0.0),
+ (('fabs', 'a(is_not_negative)'), a),
+ (('iabs', 'a(is_not_negative)'), a),
+ (('fsat', 'a(is_not_positive)'), 0.0),
+
# Section 5.4.1 (Conversion and Scalar Constructors) of the GLSL 4.60 spec
# says:
#
(('ilt', ('f2u', a), b), ('ilt', ('f2i', a), b)),
(('ilt', b, ('f2u', a)), ('ilt', b, ('f2i', a))),
- (('~fmin', ('fabs', a), 1.0), ('fsat', ('fabs', a)), '!options->lower_fsat'),
+ (('~fmin', 'a(is_not_negative)', 1.0), ('fsat', a), '!options->lower_fsat'),
# The result of the multiply must be in [-1, 0], so the result of the ffma
# must be in [0, 1].
(('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)),
(('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)),
+ (('fne', 'a(is_not_zero)', 0.0), True),
+ (('feq', 'a(is_not_zero)', 0.0), False),
+
+ # In this chart, + means value > 0 and - means value < 0.
+ #
+ # + >= + -> unknown 0 >= + -> false - >= + -> false
+ # + >= 0 -> true 0 >= 0 -> true - >= 0 -> false
+ # + >= - -> true 0 >= - -> true - >= - -> unknown
+ #
+ # Using grouping conceptually similar to a Karnaugh map...
+ #
+ # (+ >= 0, + >= -, 0 >= 0, 0 >= -) == (is_not_negative >= is_not_positive) -> true
+ # (0 >= +, - >= +) == (is_not_positive >= gt_zero) -> false
+ # (- >= +, - >= 0) == (lt_zero >= is_not_negative) -> false
+ #
+ # The flt / ilt cases just invert the expected result.
+ #
+ # The results expecting true, must be marked imprecise. The results
+ # expecting false are fine because NaN compared >= or < anything is false.
+
+ (('~fge', 'a(is_not_negative)', 'b(is_not_positive)'), True),
+ (('fge', 'a(is_not_positive)', 'b(is_gt_zero)'), False),
+ (('fge', 'a(is_lt_zero)', 'b(is_not_negative)'), False),
+
+ (('flt', 'a(is_not_negative)', 'b(is_not_positive)'), False),
+ (('~flt', 'a(is_not_positive)', 'b(is_gt_zero)'), True),
+ (('~flt', 'a(is_lt_zero)', 'b(is_not_negative)'), True),
+
+ (('ine', 'a(is_not_zero)', 0), True),
+ (('ieq', 'a(is_not_zero)', 0), False),
+
+ (('ige', 'a(is_not_negative)', 'b(is_not_positive)'), True),
+ (('ige', 'a(is_not_positive)', 'b(is_gt_zero)'), False),
+ (('ige', 'a(is_lt_zero)', 'b(is_not_negative)'), False),
+
+ (('ilt', 'a(is_not_negative)', 'b(is_not_positive)'), False),
+ (('ilt', 'a(is_not_positive)', 'b(is_gt_zero)'), True),
+ (('ilt', 'a(is_lt_zero)', 'b(is_not_negative)'), True),
+
+ (('ult', 0, 'a(is_gt_zero)'), True),
+
# Packing and then unpacking does nothing
(('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a),
(('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b),
(('ishr', 'a@16', 8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
(('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'),
(('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'),
- (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte')
-]
+ (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'),
+
+ # Useless masking before unpacking
+ (('unpack_half_2x16_split_x', ('iand', a, 0xffff)), ('unpack_half_2x16_split_x', a)),
+ (('unpack_32_2x16_split_x', ('iand', a, 0xffff)), ('unpack_32_2x16_split_x', a)),
+ (('unpack_64_2x32_split_x', ('iand', a, 0xffffffff)), ('unpack_64_2x32_split_x', a)),
+ (('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)),
+ (('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)),
+ (('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)),
+
+ # Optimize half packing
+ (('ishl', ('pack_half_2x16', ('vec2', a, 0)), 16), ('pack_half_2x16', ('vec2', 0, a))),
+ (('ishr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))),
+
+ (('iadd', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
+ ('pack_half_2x16', ('vec2', a, b))),
+ (('ior', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
+ ('pack_half_2x16', ('vec2', a, b))),
+])
# After the ('extract_u8', a, 0) pattern, above, triggers, there will be
# patterns like those below.
(('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'),
# Subtracts
- (('~fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)),
- (('isub', a, ('isub', 0, b)), ('iadd', a, b)),
(('ussub_4x8', a, 0), a),
(('ussub_4x8', a, ~0), 0),
- (('fsub', a, b), ('fadd', a, ('fneg', b)), 'options->lower_sub'),
- (('isub', a, b), ('iadd', a, ('ineg', b)), 'options->lower_sub'),
- (('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'),
- (('ineg', a), ('isub', 0, a), 'options->lower_negate'),
- (('~fadd', a, ('fsub', 0.0, b)), ('fsub', a, b)),
- (('iadd', a, ('isub', 0, b)), ('isub', a, b)),
- (('fabs', ('fsub', 0.0, a)), ('fabs', a)),
- (('iabs', ('isub', 0, a)), ('iabs', a)),
+ # Lower all Subtractions first - they can get recombined later
+ (('fsub', a, b), ('fadd', a, ('fneg', b))),
+ (('isub', a, b), ('iadd', a, ('ineg', b))),
+ (('uabs_usub', a, b), ('bcsel', ('ult', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))),
+ # This is correct. We don't need isub_sat because the result type is unsigned, so it cannot overflow.
+ (('uabs_isub', a, b), ('bcsel', ('ilt', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))),
# Propagate negation up multiplication chains
(('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))),
(('uhadd', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
(('irhadd', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
(('urhadd', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
+ (('ihadd@64', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
+ (('uhadd@64', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
+ (('irhadd@64', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
+ (('urhadd@64', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
+
+ (('uadd_sat@64', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_add_sat || (options->lower_int64_options & nir_lower_iadd64) != 0'),
(('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_add_sat'),
(('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_add_sat'),
+ (('usub_sat@64', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_usub_sat64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
+
+ # int64_t sum = a + b;
+ #
+ # if (a < 0 && b < 0 && a < sum)
+ # sum = INT64_MIN;
+ # } else if (a >= 0 && b >= 0 && sum < a)
+ # sum = INT64_MAX;
+ # }
+ #
+ # A couple optimizations are applied.
+ #
+ # 1. a < sum => sum >= 0. This replacement works because it is known that
+ # a < 0 and b < 0, so sum should also be < 0 unless there was
+ # underflow.
+ #
+ # 2. sum < a => sum < 0. This replacement works because it is known that
+ # a >= 0 and b >= 0, so sum should also be >= 0 unless there was
+ # overflow.
+ #
+ # 3. Invert the second if-condition and swap the order of parameters for
+ # the bcsel. !(a >= 0 && b >= 0 && sum < 0) becomes !(a >= 0) || !(b >=
+ # 0) || !(sum < 0), and that becomes (a < 0) || (b < 0) || (sum >= 0)
+ #
+ # On Intel Gen11, this saves ~11 instructions.
+ (('iadd_sat@64', a, b), ('bcsel',
+ ('iand', ('iand', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),
+ 0x8000000000000000,
+ ('bcsel',
+ ('ior', ('ior', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),
+ ('iadd', a, b),
+ 0x7fffffffffffffff)),
+ '(options->lower_int64_options & nir_lower_iadd64) != 0'),
+
+ # int64_t sum = a - b;
+ #
+ # if (a < 0 && b >= 0 && a < sum)
+ # sum = INT64_MIN;
+ # } else if (a >= 0 && b < 0 && a >= sum)
+ # sum = INT64_MAX;
+ # }
+ #
+ # Optimizations similar to the iadd_sat case are applied here.
+ (('isub_sat@64', a, b), ('bcsel',
+ ('iand', ('iand', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),
+ 0x8000000000000000,
+ ('bcsel',
+ ('ior', ('ior', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),
+ ('isub', a, b),
+ 0x7fffffffffffffff)),
+ '(options->lower_int64_options & nir_lower_iadd64) != 0'),
+
+ # These are done here instead of in the backend because the int64 lowering
+ # pass will make a mess of the patterns. The first patterns are
+ # conditioned on nir_lower_minmax64 because it was not clear that it was
+ # always an improvement on platforms that have real int64 support. No
+ # shaders in shader-db hit this, so it was hard to say one way or the
+ # other.
+ (('ilt', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
+ (('ilt', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
+ (('ige', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
+ (('ige', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
+ (('ilt', 'a@64', 0), ('ilt', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
+ (('ige', 'a@64', 0), ('ige', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
+
+ (('ine', 'a@64', 0), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
+ (('ieq', 'a@64', 0), ('ieq', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
+ # 0u < uint(a) <=> uint(a) != 0u
+ (('ult', 0, 'a@64'), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
# Alternative lowering that doesn't rely on bfi.
(('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
127.0))),
'options->lower_unpack_snorm_4x8'),
+ (('pack_half_2x16_split', 'a@32', 'b@32'),
+ ('ior', ('ishl', ('u2u32', ('f2f16', b)), 16), ('u2u32', ('f2f16', a))),
+ 'options->lower_pack_half_2x16_split'),
+
+ (('unpack_half_2x16_split_x', 'a@32'),
+ ('f2f32', ('u2u16', a)),
+ 'options->lower_unpack_half_2x16_split'),
+
+ (('unpack_half_2x16_split_y', 'a@32'),
+ ('f2f32', ('u2u16', ('ushr', a, 16))),
+ 'options->lower_unpack_half_2x16_split'),
+
(('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
(('fsign', a), ('fsub', ('b2f', ('flt', 0.0, a)), ('b2f', ('flt', a, 0.0))), 'options->lower_fsign'),
+
+ # Address/offset calculations:
+ # Drivers supporting imul24 should use the nir_lower_amul() pass, this
+ # rule converts everyone else to imul:
+ (('amul', a, b), ('imul', a, b), '!options->has_imul24'),
+
+ (('imad24_ir3', a, b, 0), ('imul24', a, b)),
+ (('imad24_ir3', a, 0, c), (c)),
+ (('imad24_ir3', a, 1, c), ('iadd', a, c)),
+
+ # if first two srcs are const, crack apart the imad so constant folding
+ # can clean up the imul:
+ # TODO ffma should probably get a similar rule:
+ (('imad24_ir3', '#a', '#b', c), ('iadd', ('imul', a, b), c)),
+
+ # These will turn 24b address/offset calc back into 32b shifts, but
+ # it should be safe to get back some of the bits of precision that we
+    # already decided were not necessary:
+ (('imul24', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
+ (('imul24', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
+ (('imul24', a, 0), (0)),
])
# bit_size dependent lowerings
('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_add_sat'),
]
-invert = OrderedDict([('feq', 'fne'), ('fne', 'feq'), ('fge', 'flt'), ('flt', 'fge')])
+invert = OrderedDict([('feq', 'fne'), ('fne', 'feq')])
for left, right in itertools.combinations_with_replacement(invert.keys(), 2):
optimizations.append((('inot', ('ior(is_used_once)', (left, a, b), (right, c, d))),
return step5
-optimizations += [(bitfield_reverse('x@32'), ('bitfield_reverse', 'x'))]
+optimizations += [(bitfield_reverse('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')]
# For any float comparison operation, "cmp", if you have "a == a && a cmp b"
# then the "a == a" is redundant because it's equivalent to "a is not NaN"
# and, if a is a NaN then the second comparison will fail anyway.
for op in ['flt', 'fge', 'feq']:
optimizations += [
- (('iand', ('feq', a, a), (op, a, b)), (op, a, b)),
- (('iand', ('feq', a, a), (op, b, a)), (op, b, a)),
+ (('iand', ('feq', a, a), (op, a, b)), ('!' + op, a, b)),
+ (('iand', ('feq', a, a), (op, b, a)), ('!' + op, b, a)),
]
# Add optimizations to handle the case where the result of a ternary is
(('imadsh_mix16', 'a@32', '#b@32(is_upper_half_zero)', 'c@32'), ('c')),
]
+# These kinds of sequences can occur after nir_opt_peephole_select.
+#
+# NOTE: fadd is not handled here because that gets in the way of ffma
+# generation in the i965 driver. Instead, fadd and ffma are handled in
+# late_optimizations.
+
+for op in ['flrp']:
+ optimizations += [
+ (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
+ (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),
+ (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),
+ (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
+ (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, e, c, d)), (op, ('bcsel', a, b, e), c, d)),
+ (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', e, c, d)), (op, ('bcsel', a, b, e), c, d)),
+ ]
+
+for op in ['fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor', 'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax']:
+ optimizations += [
+ (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
+ (('bcsel', a, (op + '(is_used_once)', b, 'c(is_not_const)'), (op, b, d)), (op, b, ('bcsel', a, c, d))),
+ (('bcsel', a, (op, b, 'c(is_not_const)'), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
+ (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
+ ]
+
+for op in ['fpow']:
+ optimizations += [
+ (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),
+ (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
+ (('bcsel', a, (op + '(is_used_once)', b, c), (op, d, c)), (op, ('bcsel', a, b, d), c)),
+ (('bcsel', a, (op, b, c), (op + '(is_used_once)', d, c)), (op, ('bcsel', a, b, d), c)),
+ ]
+
+for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos']:
+ optimizations += [
+ (('bcsel', a, (op + '(is_used_once)', b), (op, c)), (op, ('bcsel', a, b, c))),
+ (('bcsel', a, (op, b), (op + '(is_used_once)', c)), (op, ('bcsel', a, b, c))),
+ ]
+
# This section contains "late" optimizations that should be run before
# creating ffmas and calling regular optimizations for the final time.
# Optimizations should go here if they help code generation and conflict
# optimization loop can prevent other optimizations.
(('fneg', ('fneg', a)), a),
+ # Subtractions get lowered during optimization, so we need to recombine them
+ (('fadd', 'a', ('fneg', 'b')), ('fsub', 'a', 'b'), '!options->lower_sub'),
+ (('iadd', 'a', ('ineg', 'b')), ('isub', 'a', 'b'), '!options->lower_sub'),
+ (('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'),
+ (('ineg', a), ('isub', 0, a), 'options->lower_negate'),
+
# These are duplicated from the main optimizations table. The late
# patterns that rearrange expressions like x - .5 < 0 to x < .5 can create
# new patterns like these. The patterns that compare with zero are removed
(('bcsel', a, 0, ('b2f32', ('inot', 'b@bool'))), ('b2f32', ('inot', ('ior', a, b)))),
+ # Putting this in 'optimizations' interferes with the bcsel(a, op(b, c),
+ # op(b, d)) => op(b, bcsel(a, c, d)) transformations. I do not know why.
+ (('bcsel', ('feq', ('fsqrt', 'a(is_not_negative)'), 0.0), intBitsToFloat(0x7f7fffff), ('frsq', a)),
+ ('fmin', ('frsq', a), intBitsToFloat(0x7f7fffff))),
+
# Things that look like DPH in the source shader may get expanded to
# something that looks like dot(v1.xyz, v2.xyz) + v1.w by the time it gets
# to NIR. After FFMA is generated, this can look like:
# fadd(ffma(v1.z, v2.z, ffma(v1.y, v2.y, fmul(v1.x, v2.x))), v1.w)
#
# Reassociate the last addition into the first multiplication.
+ #
+ # Some shaders do not use 'invariant' in vertex and (possibly) geometry
+ # shader stages on some outputs that are intended to be invariant. For
+ # various reasons, this optimization may not be fully applied in all
+ # shaders used for different rendering passes of the same geometry. This
+ # can result in Z-fighting artifacts (at best). For now, disable this
+ # optimization in these stages. See bugzilla #111490. In tessellation
+ # stages applications seem to use 'precise' when necessary, so allow the
+ # optimization in those stages.
(('~fadd', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'),
- ('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', 'g'))), '!options->intel_vec4'),
- (('~fadd', ('ffma(is_used_once)', a, b, ('fmul', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)') ), 'g(is_not_const)'),
- ('ffma', a, b, ('ffma', e, 'f', 'g') ), '!options->intel_vec4'),
+ ('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
+ (('~fadd', ('ffma(is_used_once)', a, b, ('fmul', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),
+ ('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
]
+for op in ['fadd']:
+ late_optimizations += [
+ (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),
+ (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
+ ]
+
+for op in ['ffma']:
+ late_optimizations += [
+ (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
+ (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),
+
+ (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),
+ (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
+ ]
+
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma",
before_ffma_optimizations).render())