b = 'b'
c = 'c'
d = 'd'
+e = 'e'
# Written in the form (<search>, <replace>) where <search> is an expression
# and <replace> is either an expression or a value. An expression is
optimizations = [
- (('imul', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b))),
- (('imul', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b))))),
+ (('imul', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitshift'),
+ (('imul', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitshift'),
+ (('ishl', a, '#b@32'), ('imul', a, ('ishl', 1, b)), 'options->lower_bitshift'),
+
(('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
(('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
(('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'),
(('idiv', a, 1), a),
(('umod', a, 1), 0),
(('imod', a, 1), 0),
- (('udiv', a, '#b@32(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b))),
+ (('udiv', a, '#b@32(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b)), '!options->lower_bitshift'),
(('idiv', a, '#b@32(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), 'options->lower_idiv'),
(('idiv', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), 'options->lower_idiv'),
(('umod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1))),
(('iadd', a, ('iadd', ('ineg', a), b)), b),
(('~fadd', ('fneg', a), ('fadd', a, b)), b),
(('~fadd', a, ('fadd', ('fneg', a), b)), b),
+ (('fadd', ('fsat', a), ('fsat', ('fneg', a))), ('fsat', ('fabs', a))),
(('~fmul', a, 0.0), 0.0),
(('imul', a, 0), 0),
(('umul_unorm_4x8', a, 0), 0),
(('fmul', ('fsign', a), ('fmul', a, a)), ('fmul', ('fabs', a), a)),
(('fmul', ('fmul', ('fsign', a), a), a), ('fmul', ('fabs', a), a)),
(('~ffma', 0.0, a, b), b),
- (('~ffma', a, 0.0, b), b),
(('~ffma', a, b, 0.0), ('fmul', a, b)),
- (('ffma', a, 1.0, b), ('fadd', a, b)),
(('ffma', 1.0, a, b), ('fadd', a, b)),
+ (('ffma', -1.0, a, b), ('fadd', ('fneg', a), b)),
(('~flrp', a, b, 0.0), a),
(('~flrp', a, b, 1.0), b),
(('~flrp', a, a, b), a),
(('~flrp', 0.0, a, b), ('fmul', a, b)),
+
+ # flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c)
+ (('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)),
+ (('~flrp@32', a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp32'),
+ (('~flrp@64', a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp64'),
+
+ (('~flrp@32', ('fadd', a, b), ('fadd', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp32'),
+ (('~flrp@64', ('fadd', a, b), ('fadd', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp64'),
+
+ (('~flrp@32', a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp32'),
+ (('~flrp@64', a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp64'),
+
+ (('~flrp', ('fmul(is_used_once)', a, b), ('fmul(is_used_once)', a, c), d), ('fmul', ('flrp', b, c, d), a)),
+
(('~flrp', a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp32'),
(('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)),
- (('flrp@16', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp16'),
- (('flrp@32', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp32'),
- (('flrp@64', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp64'),
+ (('ftrunc', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),
(('ffloor', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
(('fadd', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
(('ffract', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
(('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'),
- (('~fadd', ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1')))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp32'),
- (('~fadd@32', ('fmul', a, ('fadd', 1.0, ('fneg', c ))), ('fmul', b, c )), ('flrp', a, b, c), '!options->lower_flrp32'),
- (('~fadd@64', ('fmul', a, ('fadd', 1.0, ('fneg', c ))), ('fmul', b, c )), ('flrp', a, b, c), '!options->lower_flrp64'),
+ (('~fadd', ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1')))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp32'),
+ (('~fadd@32', ('fmul', a, ('fadd', 1.0, ('fneg', c ) )), ('fmul', b, c )), ('flrp', a, b, c), '!options->lower_flrp32'),
+ (('~fadd@64', ('fmul', a, ('fadd', 1.0, ('fneg', c ) )), ('fmul', b, c )), ('flrp', a, b, c), '!options->lower_flrp64'),
+ # These are the same as the previous three rules, but it depends on
+ # 1-fsat(x) <=> fsat(1-x). See below.
+ (('~fadd@32', ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg', c )))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp32'),
+ (('~fadd@64', ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg', c )))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp64'),
+
(('~fadd', a, ('fmul', ('b2f', 'c@1'), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp32'),
(('~fadd@32', a, ('fmul', c , ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp32'),
(('~fadd@64', a, ('fmul', c , ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp64'),
(('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
(('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma'),
+ (('~fmul', ('fadd', ('iand', ('ineg', ('b2i32', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
+ ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),
+
(('fdot4', ('vec4', a, b, c, 1.0), d), ('fdph', ('vec3', a, b, c), d)),
(('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),
(('fdot4', ('vec4', a, b, 0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),
(('fdot3', ('vec3', a, 0.0, 0.0), b), ('fmul', a, b)),
(('fdot3', ('vec3', a, b, 0.0), c), ('fdot2', ('vec2', a, b), c)),
+ # If x >= 0 and x <= 1: fsat(1 - x) == 1 - fsat(x) trivially
+ # If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1
+ # If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0
+ (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),
+
# (a * #b + #c) << #d
# ((a * #b) << #d) + (#c << #d)
# (a * (#b << #d)) + (#c << #d)
(('inot', ('ieq', a, b)), ('ine', a, b)),
(('inot', ('ine', a, b)), ('ieq', a, b)),
+ # This helps some shaders because, after some optimizations, they end up
+ # with patterns like (-a < -b) || (b < a). In an ideal world, this sort of
+ # matching would be handled by CSE.
+ (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
+ (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
+ (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
+ (('fne', ('fneg', a), ('fneg', b)), ('fne', b, a)),
+ (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
+ (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
+ (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
+ (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
+ (('fne', ('fneg', a), -1.0), ('fne', 1.0, a)),
+ (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),
+
+ (('flt', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('flt', a, b)),
+ (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),
+ (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),
+ (('fge', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('fge', b, a)),
+ (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),
+ (('fne', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fne', a, b)),
+
+ (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),
+ (('flt', ('fsat(is_used_once)', a), 1.0), ('flt', a, 1.0)),
+ (('fge', 0.0, ('fsat(is_used_once)', a)), ('fge', 0.0, a)),
+ (('flt', 0.0, ('fsat(is_used_once)', a)), ('flt', 0.0, a)),
+
# 0.0 >= b2f(a)
# b2f(a) <= 0.0
# b2f(a) == 0.0 because b2f(a) can only be 0 or 1
(('fge', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('inot', ('ior', a, b))),
(('fge', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('inot', ('ior', a, b))),
+ (('flt', a, ('fneg', a)), ('flt', a, 0.0)),
+ (('fge', a, ('fneg', a)), ('fge', a, 0.0)),
+
# Some optimizations (below) convert things like (a < b || c < b) into
# (min(a, c) < b). However, this interfers with the previous optimizations
# that try to remove comparisons with negated sums of b2f. This just
(('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
(('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
(('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
- (('bcsel', a, True, 'b@bool'), ('ior', a, b)),
+ (('bcsel', a, True, b), ('ior', a, b)),
+ (('bcsel', a, a, b), ('ior', a, b)),
+ (('bcsel', a, b, False), ('iand', a, b)),
+ (('bcsel', a, b, a), ('iand', a, b)),
(('fmin', a, a), a),
(('fmax', a, a), a),
(('imin', a, a), a),
(('imax', a, ('iabs', a)), ('iabs', a)),
(('fmax', a, ('fneg', a)), ('fabs', a)),
(('imax', a, ('ineg', a)), ('iabs', a)),
+ (('~fmax', ('fabs', a), 0.0), ('fabs', a)),
(('~fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
(('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
+ (('~fmin', ('fmax', a, -1.0), 0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_negate && !options->lower_fsat'),
+ (('~fmax', ('fmin', a, 0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_negate && !options->lower_fsat'),
(('fsat', ('fsign', a)), ('b2f', ('flt', 0.0, a))),
(('fsat', ('b2f', a)), ('b2f', a)),
(('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
(('fsat', ('fsat', a)), ('fsat', a)),
+ (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_negate && !options->lower_fsat'),
+ (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_negate && !options->lower_fsat'),
+ (('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'),
(('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),
(('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),
(('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)),
(('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)),
(('ior', ('uge', 2, a), ('ieq', a, 3)), ('uge', 3, a)),
- (('ior', 'a@bool', ('ieq', a, False)), True),
+ # The (i2f32, ...) part is an open-coded fsign. When that is combined with
+ # the bcsel, it's basically copysign(1.0, a). There is no copysign in NIR,
+ # so emit an open-coded version of that.
+ (('bcsel@32', ('feq', a, 0.0), 1.0, ('i2f32', ('iadd', ('b2i32', ('flt', 0.0, 'a@32')), ('ineg', ('b2i32', ('flt', 'a@32', 0.0)))))),
+ ('ior', 0x3f800000, ('iand', a, 0x80000000))),
+
+ (('ior', a, ('ieq', a, False)), True),
(('ior', a, ('inot', a)), -1),
+ (('ine', ('ineg', ('b2i32', 'a@1')), ('ineg', ('b2i32', 'b@1'))), ('ine', a, b)),
+ (('b2i32', ('ine', 'a@1', 'b@1')), ('b2i32', ('ixor', a, b))),
+
(('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('ior', 'a@32', 'b@32'), 0)),
# These patterns can result when (a < b || a < c) => (a < min(b, c))
(('ult', ('umax', a, b), a), False),
(('uge', a, ('umax', b, a)), ('uge', a, b)),
(('uge', ('umin', a, b), a), ('uge', b, a)),
+ (('ult', a, ('iand', b, a)), False),
+ (('ult', ('ior', a, b), a), False),
+ (('uge', a, ('iand', b, a)), True),
+ (('uge', ('ior', a, b), a), True),
(('ilt', '#a', ('imax', '#b', c)), ('ior', ('ilt', a, b), ('ilt', a, c))),
(('ilt', ('imin', '#a', b), '#c'), ('ior', ('ilt', a, c), ('ilt', b, c))),
(('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))),
(('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))),
(('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))),
+ (('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))),
# Division and reciprocal
(('~fdiv', 1.0, a), ('frcp', a)),
(('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
# Boolean simplifications
(('i2b32(is_used_by_if)', a), ('ine32', a, 0)),
(('i2b1(is_used_by_if)', a), ('ine', a, 0)),
- (('ieq', 'a@bool', True), a),
- (('ine(is_not_used_by_if)', 'a@bool', True), ('inot', a)),
- (('ine', 'a@bool', False), a),
- (('ieq(is_not_used_by_if)', 'a@bool', False), ('inot', 'a')),
+ (('ieq', a, True), a),
+ (('ine(is_not_used_by_if)', a, True), ('inot', a)),
+ (('ine', a, False), a),
+ (('ieq(is_not_used_by_if)', a, False), ('inot', 'a')),
(('bcsel', a, True, False), a),
(('bcsel', a, False, True), ('inot', a)),
(('bcsel@32', a, 1.0, 0.0), ('b2f', a)),
(('bcsel', True, b, c), b),
(('bcsel', False, b, c), c),
(('bcsel', a, ('b2f(is_used_once)', 'b@32'), ('b2f', 'c@32')), ('b2f', ('bcsel', a, b, c))),
- # The result of this should be hit by constant propagation and, in the
- # next round of opt_algebraic, get picked up by one of the above two.
- (('bcsel', '#a', b, c), ('bcsel', ('ine', 'a', 0), b, c)),
(('bcsel', a, b, b), b),
(('fcsel', a, b, b), b),
(('ilt', ('f2u', a), b), ('ilt', ('f2i', a), b)),
(('ilt', b, ('f2u', a)), ('ilt', b, ('f2i', a))),
+ (('~fmin', ('fabs', a), 1.0), ('fsat', ('fabs', a)), '!options->lower_fsat'),
+
+ # The result of the multiply must be in [-1, 0], so the result of the ffma
+ # must be in [0, 1].
+ (('flt', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), False),
+ (('flt', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), False),
+ (('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)),
+ (('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)),
+
# Packing and then unpacking does nothing
(('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a),
(('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b),
('unpack_64_2x32_split_y', a)), a),
# Byte extraction
+ (('ushr', 'a@16', 8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
(('ushr', 'a@32', 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),
+ (('ushr', 'a@64', 56), ('extract_u8', a, 7), '!options->lower_extract_byte'),
+ (('ishr', 'a@16', 8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
(('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'),
+ (('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'),
(('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte')
]
# After the ('extract_[iu]8', a, 3) patterns, above, trigger, there will be
# patterns like those below.
for op in ('extract_u8', 'extract_i8'):
+ optimizations.extend([((op, ('ishl', 'a@16', 8), 1), (op, a, 0))])
optimizations.extend([((op, ('ishl', 'a@32', 24 - 8 * i), 3), (op, a, i)) for i in range(2, -1, -1)])
+ optimizations.extend([((op, ('ishl', 'a@64', 56 - 8 * i), 7), (op, a, i)) for i in range(6, -1, -1)])
optimizations.extend([
# Word extraction
(('iabs', ('isub', 0, a)), ('iabs', a)),
# Propagate negation up multiplication chains
- (('fmul', ('fneg', a), b), ('fneg', ('fmul', a, b))),
+ (('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))),
(('imul', ('ineg', a), b), ('ineg', ('imul', a, b))),
# Propagate constants up multiplication chains
(('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)),
# Misc. lowering
- (('fmod@16', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod16'),
- (('fmod@32', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod32'),
- (('fmod@64', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod64'),
- (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod32'),
+ (('fmod@16', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
+ (('fmod@32', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
+ (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),
(('uadd_carry@32', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),
(('usub_borrow@32', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'),
'options->lower_unpack_snorm_4x8'),
(('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
+ (('fsign', a), ('fsub', ('b2f', ('flt', 0.0, a)), ('b2f', ('flt', a, 0.0))), 'options->lower_fsign'),
])
# bit_size dependent lowerings
x2yN = '{}2{}'.format(x, y)
optimizations.append(((x2yN, (b2x, a)), (b2y, a)))
+# Optimize away x2xN(a@N)
+for t in ['int', 'uint', 'float']:
+ for N in type_sizes(t):
+ x2xN = '{0}2{0}{1}'.format(t[0], N)
+ aN = 'a@{0}'.format(N)
+ optimizations.append(((x2xN, aN), a))
+
+# Optimize x2xN(y2yM(a@P)) -> y2yN(a) for integers
+# In particular, we can optimize away everything except upcast of downcast and
+# upcasts where the type differs from the other cast
+for N, M in itertools.product(type_sizes('uint'), type_sizes('uint')):
+ if N < M:
+ # The outer cast is a down-cast. It doesn't matter what the size of the
+ # argument of the inner cast is because we'll never been in the upcast
+ # of downcast case. Regardless of types, we'll always end up with y2yN
+ # in the end.
+ for x, y in itertools.product(['i', 'u'], ['i', 'u']):
+ x2xN = '{0}2{0}{1}'.format(x, N)
+ y2yM = '{0}2{0}{1}'.format(y, M)
+ y2yN = '{0}2{0}{1}'.format(y, N)
+ optimizations.append(((x2xN, (y2yM, a)), (y2yN, a)))
+ elif N > M:
+ # If the outer cast is an up-cast, we have to be more careful about the
+ # size of the argument of the inner cast and with types. In this case,
+ # the type is always the type of type up-cast which is given by the
+ # outer cast.
+ for P in type_sizes('uint'):
+ # We can't optimize away up-cast of down-cast.
+ if M < P:
+ continue
+
+ # Because we're doing down-cast of down-cast, the types always have
+ # to match between the two casts
+ for x in ['i', 'u']:
+ x2xN = '{0}2{0}{1}'.format(x, N)
+ x2xM = '{0}2{0}{1}'.format(x, M)
+ aP = 'a@{0}'.format(P)
+ optimizations.append(((x2xN, (x2xM, aP)), (x2xN, a)))
+ else:
+ # The N == M case is handled by other optimizations
+ pass
+
def fexp2i(exp, bits):
# We assume that exp is already in the right range.
if bits == 16:
((op, ('bcsel(is_used_once)', a, '#b', c), '#d'), ('bcsel', a, (op, b, d), (op, c, d)))
]
+# For derivatives in compute shaders, GLSL_NV_compute_shader_derivatives
+# states:
+#
+# If neither layout qualifier is specified, derivatives in compute shaders
+# return zero, which is consistent with the handling of built-in texture
+# functions like texture() in GLSL 4.50 compute shaders.
+for op in ['fddx', 'fddx_fine', 'fddx_coarse',
+ 'fddy', 'fddy_fine', 'fddy_coarse']:
+ optimizations += [
+ ((op, 'a'), 0.0, 'info->stage == MESA_SHADER_COMPUTE && info->cs.derivative_group == DERIVATIVE_GROUP_NONE')
+]
+
+# Some optimizations for ir3-specific instructions.
+optimizations += [
+ # 'al * bl': If either 'al' or 'bl' is zero, return zero.
+ (('umul_low', '#a(is_lower_half_zero)', 'b'), (0)),
+ # '(ah * bl) << 16 + c': If either 'ah' or 'bl' is zero, return 'c'.
+ (('imadsh_mix16', '#a@32(is_lower_half_zero)', 'b@32', 'c@32'), ('c')),
+ (('imadsh_mix16', 'a@32', '#b@32(is_upper_half_zero)', 'c@32'), ('c')),
+]
+
# This section contains "late" optimizations that should be run before
# creating ffmas and calling regular optimizations for the final time.
# Optimizations should go here if they help code generation and conflict
(('iadd', a, ('iadd', ('ineg', a), b)), b),
(('~fadd', ('fneg', a), ('fadd', a, b)), b),
(('~fadd', a, ('fadd', ('fneg', a), b)), b),
+
+ (('~flrp@32', ('fadd(is_used_once)', a, -1.0), ('fadd(is_used_once)', a, 1.0), d), ('fadd', ('flrp', -1.0, 1.0, d), a)),
+ (('~flrp@32', ('fadd(is_used_once)', a, 1.0), ('fadd(is_used_once)', a, -1.0), d), ('fadd', ('flrp', 1.0, -1.0, d), a)),
+ (('~flrp@32', ('fadd(is_used_once)', a, '#b'), ('fadd(is_used_once)', a, '#c'), d), ('fadd', ('fmul', d, ('fadd', c, ('fneg', b))), ('fadd', a, b))),
]
# This section contains "late" optimizations that should be run after the
(('~feq', ('fadd', a, b), 0.0), ('feq', a, ('fneg', b))),
(('~fne', ('fadd', a, b), 0.0), ('fne', a, ('fneg', b))),
+ # nir_lower_to_source_mods will collapse this, but its existence during the
+ # optimization loop can prevent other optimizations.
+ (('fneg', ('fneg', a)), a),
+
+ # These are duplicated from the main optimizations table. The late
+ # patterns that rearrange expressions like x - .5 < 0 to x < .5 can create
+ # new patterns like these. The patterns that compare with zero are removed
+ # because they are unlikely to be created in by anything in
+ # late_optimizations.
+ (('flt', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('flt', a, b)),
+ (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),
+ (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),
+ (('fge', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('fge', b, a)),
+ (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),
+ (('fne', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fne', a, b)),
+
+ (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),
+ (('flt', ('fsat(is_used_once)', a), 1.0), ('flt', a, 1.0)),
+
(('~fge', ('fmin(is_used_once)', ('fadd(is_used_once)', a, b), ('fadd', c, d)), 0.0), ('iand', ('fge', a, ('fneg', b)), ('fge', c, ('fneg', d)))),
+ (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
+ (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
+ (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
+ (('fne', ('fneg', a), ('fneg', b)), ('fne', b, a)),
+ (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
+ (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
+ (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
+ (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
+ (('fne', ('fneg', a), -1.0), ('fne', 1.0, a)),
+ (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),
+
+ (('ior', a, a), a),
+ (('iand', a, a), a),
+
+ (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),
+
(('fdot2', a, b), ('fdot_replicated2', a, b), 'options->fdot_replicates'),
(('fdot3', a, b), ('fdot_replicated3', a, b), 'options->fdot_replicates'),
(('fdot4', a, b), ('fdot_replicated4', a, b), 'options->fdot_replicates'),
(('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'),
- (('b2f(is_used_more_than_once)', ('inot', 'a@1')), ('bcsel', a, 0.0, 1.0)),
- (('fneg(is_used_more_than_once)', ('b2f', ('inot', 'a@1'))), ('bcsel', a, -0.0, -1.0)),
+ (('~flrp@32', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),
+ (('~flrp@64', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),
+
+ (('~fadd@32', 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp32'),
+ (('~fadd@64', 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp64'),
# we do these late so that we don't get in the way of creating ffmas
(('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))),
(('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))),
- (('bcsel', 'a@bool', 0, ('b2f32', ('inot', 'b@bool'))), ('b2f32', ('inot', ('ior', a, b)))),
+ (('bcsel', a, 0, ('b2f32', ('inot', 'b@bool'))), ('b2f32', ('inot', ('ior', a, b)))),
]
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())