(('~fneg', ('fneg', a)), a),
(('ineg', ('ineg', a)), a),
- (('fabs', ('fabs', a)), ('fabs', a)),
(('fabs', ('fneg', a)), ('fabs', a)),
(('fabs', ('u2f', a)), ('u2f', a)),
(('iabs', ('iabs', a)), ('iabs', a)),
(('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)),
])
+# To save space in the state tables, reduce to the set that is known to help.
+# Previously, this was range(1, 32). In addition, a couple rules inside the
+# loop are commented out. Revisit someday, probably after mesa/#2635 has some
+# resolution.
+for i in [1, 2, 16, 24]:
+ lo_mask = 0xffffffff >> i
+ hi_mask = (0xffffffff << i) & 0xffffffff
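+ # For example, i == 16 gives lo_mask == 0x0000ffff and hi_mask == 0xffff0000.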
+
+ optimizations.extend([
+ # This pattern seems to only help in the soft-fp64 code.
+ (('ishl@32', ('iand', 'a@32', lo_mask), i), ('ishl', a, i)),
+# (('ushr@32', ('iand', 'a@32', hi_mask), i), ('ushr', a, i)),
+# (('ishr@32', ('iand', 'a@32', hi_mask), i), ('ishr', a, i)),
+
+ (('iand', ('ishl', 'a@32', i), hi_mask), ('ishl', a, i)),
+ (('iand', ('ushr', 'a@32', i), lo_mask), ('ushr', a, i)),
+# (('iand', ('ishr', 'a@32', i), lo_mask), ('ushr', a, i)), # Yes, ushr is correct
+ ])
+
optimizations.extend([
# This is common for address calculations. Reassociating may enable the
# 'a<<c' to be CSE'd. It also helps architectures that have an ISHLADD
(('iand@32', a, ('inot', ('ishr', a, 31))), ('imax', a, 0)),
# Simplify logic to detect sign of an integer.
- (('ieq', ('iand', a, 0x80000000), 0x00000000), ('ige', a, 0)),
- (('ine', ('iand', a, 0x80000000), 0x80000000), ('ige', a, 0)),
- (('ine', ('iand', a, 0x80000000), 0x00000000), ('ilt', a, 0)),
- (('ieq', ('iand', a, 0x80000000), 0x80000000), ('ilt', a, 0)),
+ (('ieq', ('iand', 'a@32', 0x80000000), 0x00000000), ('ige', a, 0)),
+ (('ine', ('iand', 'a@32', 0x80000000), 0x80000000), ('ige', a, 0)),
+ (('ine', ('iand', 'a@32', 0x80000000), 0x00000000), ('ilt', a, 0)),
+ (('ieq', ('iand', 'a@32', 0x80000000), 0x80000000), ('ilt', a, 0)),
(('ine', ('ushr', 'a@32', 31), 0), ('ilt', a, 0)),
(('ieq', ('ushr', 'a@32', 31), 0), ('ige', a, 0)),
(('ieq', ('ushr', 'a@32', 31), 1), ('ilt', a, 0)),
(('fmax', a, ('fneg', a)), ('fabs', a)),
(('imax', a, ('ineg', a)), ('iabs', a)),
(('~fmax', ('fabs', a), 0.0), ('fabs', a)),
- (('~fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
+ (('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
+ # fmax(fmin(a, 1.0), 0.0) is inexact because it returns 1.0 on NaN, while
+ # fsat(a) returns 0.0.
(('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
+ # fmin(fmax(a, -1.0), 0.0) is inexact because it returns -1.0 on NaN, while
+ # fneg(fsat(fneg(a))) returns -0.0 on NaN.
(('~fmin', ('fmax', a, -1.0), 0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
+ # fmax(fmin(a, 0.0), -1.0) is inexact because it returns 0.0 on NaN, while
+ # fneg(fsat(fneg(a))) returns -0.0 on NaN. This only matters if
+ # SignedZeroInfNanPreserve is set, but we don't currently have any way of
+ # representing this in the optimizations other than the usual ~.
(('~fmax', ('fmin', a, 0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
(('fsat', ('fsign', a)), ('b2f', ('flt', 0.0, a))),
(('fsat', ('b2f', a)), ('b2f', a)),
(('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),
(('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),
(('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)),
+ # Both the left and right patterns are "b" when isnan(a), so this is exact.
(('fmax', ('fsat', a), '#b@32(is_zero_to_one)'), ('fsat', ('fmax', a, b))),
- (('fmin', ('fsat', a), '#b@32(is_zero_to_one)'), ('fsat', ('fmin', a, b))),
+ # The left pattern is 0.0 when isnan(a) (because fmin(fsat(NaN), b) ->
+ # fmin(0.0, b)) while the right one is "b", so this optimization is inexact.
+ (('~fmin', ('fsat', a), '#b@32(is_zero_to_one)'), ('fsat', ('fmin', a, b))),
+
+ # If a is in [0,b] then b-a is also in [0,b]. Since b is in [0,1],
+ # max(b-a, 0) = fsat(b-a).
+ #
+ # If a > b, then b-a < 0 and max(b-a, 0) = fsat(b-a) = 0
+ #
+ # This should be NaN safe since max(NaN, 0) = fsat(NaN) = 0.
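+ # For example, with b = 0.75: a = 0.25 gives b-a = 0.5 either way, while
+ # a = 2.0 gives b-a = -1.25, where max(-1.25, 0) = fsat(-1.25) = 0.0.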
+ (('fmax', ('fadd(is_used_once)', ('fneg', 'a(is_not_negative)'), '#b@32(is_zero_to_one)'), 0.0),
+ ('fsat', ('fadd', ('fneg', a), b)), '!options->lower_fsat'),
+
(('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)),
(('~ior', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmax', b, c))),
(('~ior', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmin', a, b), c)),
(('ine', ('ineg', ('b2i32', 'a@1')), ('ineg', ('b2i32', 'b@1'))), ('ine', a, b)),
(('b2i32', ('ine', 'a@1', 'b@1')), ('b2i32', ('ixor', a, b))),
- (('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('ior', a, b), 0), '!options->lower_bitops'),
- (('ior', ('ine', 'a@32', 0), ('ine', 'b@32', 0)), ('ine', ('ior', a, b), 0), '!options->lower_bitops'),
+ (('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('umax', a, b), 0)),
+ (('ior', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('umin', a, b), 0)),
+ (('iand', ('ine', 'a@32', 0), ('ine', 'b@32', 0)), ('ine', ('umin', a, b), 0)),
+ (('ior', ('ine', 'a@32', 0), ('ine', 'b@32', 0)), ('ine', ('umax', a, b), 0)),
+
+ # This pattern occurs courtesy of __flt64_nonnan in the soft-fp64 code.
+ # The first part of the iand comes from the !__feq64_nonnan.
+ #
+ # The second pattern is a reformulation of the first based on the relation
+ # (a == 0 || y == 0) <=> umin(a, y) == 0, where b in the first pattern
+ # happens to be (y == 0).
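+ # (For unsigned values, umin(a, y) == 0 exactly when a == 0 or y == 0:
+ # umin(0, y) == 0 for any y, while the minimum of two nonzero values is
+ # nonzero.)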
+ (('iand', ('inot', ('iand', ('ior', ('ieq', a, 0), b), c)), ('ilt', a, 0)),
+ ('iand', ('inot', ('iand', b , c)), ('ilt', a, 0))),
+ (('iand', ('inot', ('iand', ('ieq', ('umin', a, b), 0), c)), ('ilt', a, 0)),
+ ('iand', ('inot', ('iand', ('ieq', b , 0), c)), ('ilt', a, 0))),
# These patterns can result when (a < b || a < c) => (a < min(b, c))
# transformations occur before constant propagation and loop-unrolling.
(('ishr', a, 0), a),
(('ushr', 0, a), 0),
(('ushr', a, 0), a),
- (('iand', 0xff, ('ushr@32', a, 24)), ('ushr', a, 24)),
- (('iand', 0xffff, ('ushr@32', a, 16)), ('ushr', a, 16)),
(('ior', ('ishl@16', a, b), ('ushr@16', a, ('iadd', 16, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'),
(('ior', ('ishl@16', a, b), ('ushr@16', a, ('isub', 16, b))), ('urol', a, b), '!options->lower_rotate'),
(('ior', ('ishl@32', a, b), ('ushr@32', a, ('iadd', 32, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'),
(('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b
(('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),
   ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) * d) = a^b * c^d
+ (('~fexp2', ('fmul', ('flog2', a), 0.5)), ('fsqrt', a)),
(('~fexp2', ('fmul', ('flog2', a), 2.0)), ('fmul', a, a)),
(('~fexp2', ('fmul', ('flog2', a), 4.0)), ('fmul', ('fmul', a, a), ('fmul', a, a))),
(('~fpow', a, 1.0), a),
# D3D Boolean emulation
(('bcsel', a, -1, 0), ('ineg', ('b2i', 'a@1'))),
(('bcsel', a, 0, -1), ('ineg', ('b2i', ('inot', a)))),
+ (('bcsel', a, 1, 0), ('b2i', 'a@1')),
+ (('bcsel', a, 0, 1), ('b2i', ('inot', a))),
(('iand', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
('ineg', ('b2i', ('iand', a, b)))),
(('ior', ('ineg', ('b2i','a@1')), ('ineg', ('b2i', 'b@1'))),
(('~f2u32', ('i2f', 'a@32')), a),
(('~f2u32', ('u2f', 'a@32')), a),
- # Conversions from float16 to float32 and back can always be removed
+ # Conversions from 16 bits to 32 bits and back can always be removed
(('f2f16', ('f2f32', 'a@16')), a),
(('f2fmp', ('f2f32', 'a@16')), a),
- # Conversions to float16 would be lossy so they should only be removed if
+ (('i2i16', ('i2i32', 'a@16')), a),
+ (('i2imp', ('i2i32', 'a@16')), a),
+ (('u2u16', ('u2u32', 'a@16')), a),
+ (('u2ump', ('u2u32', 'a@16')), a),
+ (('f2f16', ('b2f32', 'a@1')), ('b2f16', a)),
+ (('f2fmp', ('b2f32', 'a@1')), ('b2f16', a)),
+ (('i2i16', ('b2i32', 'a@1')), ('b2i16', a)),
+ (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)),
+ (('u2u16', ('b2i32', 'a@1')), ('b2i16', a)),
+ (('u2ump', ('b2i32', 'a@1')), ('b2i16', a)),
+ # Conversions to 16 bits would be lossy so they should only be removed if
# the instruction was generated by the precision lowering pass.
(('f2f32', ('f2fmp', 'a@32')), a),
+ (('i2i32', ('i2imp', 'a@32')), a),
+ (('u2u32', ('u2ump', 'a@32')), a),
(('ffloor', 'a(is_integral)'), a),
(('fceil', 'a(is_integral)'), a),
(('ilt', 'a(is_lt_zero)', 'b(is_not_negative)'), True),
(('ult', 0, 'a(is_gt_zero)'), True),
+ (('ult', a, 0), False),
# Packing and then unpacking does nothing
(('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a),
(('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b),
+ (('unpack_64_2x32', ('pack_64_2x32_split', a, b)), ('vec2', a, b)),
+ (('unpack_64_2x32', ('pack_64_2x32', a)), a),
(('pack_64_2x32_split', ('unpack_64_2x32_split_x', a),
('unpack_64_2x32_split_y', a)), a),
+ (('pack_64_2x32', ('vec2', ('unpack_64_2x32_split_x', a),
+ ('unpack_64_2x32_split_y', a))), a),
+ (('pack_64_2x32', ('unpack_64_2x32', a)), a),
# Comparing two halves of an unpack separately. While this optimization
# should be correct for non-constant values, it's less obvious that it's
(('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'),
(('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'),
+ (('ubfe', a, 0, 8), ('extract_u8', a, 0), '!options->lower_extract_byte'),
+ (('ubfe', a, 8, 8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
+ (('ubfe', a, 16, 8), ('extract_u8', a, 2), '!options->lower_extract_byte'),
+ (('ubfe', a, 24, 8), ('extract_u8', a, 3), '!options->lower_extract_byte'),
+ (('ibfe', a, 0, 8), ('extract_i8', a, 0), '!options->lower_extract_byte'),
+ (('ibfe', a, 8, 8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
+ (('ibfe', a, 16, 8), ('extract_i8', a, 2), '!options->lower_extract_byte'),
+ (('ibfe', a, 24, 8), ('extract_i8', a, 3), '!options->lower_extract_byte'),
+
+ # Word extraction
+ (('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0), '!options->lower_extract_word'),
+ (('ushr', 'a@32', 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
+ (('ishr', ('ishl', 'a@32', 16), 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
+ (('ishr', 'a@32', 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
+ (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'),
+
+ (('ubfe', a, 0, 16), ('extract_u16', a, 0), '!options->lower_extract_word'),
+ (('ubfe', a, 16, 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
+ (('ibfe', a, 0, 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
+ (('ibfe', a, 16, 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
+
# Useless masking before unpacking
(('unpack_half_2x16_split_x', ('iand', a, 0xffff)), ('unpack_half_2x16_split_x', a)),
(('unpack_32_2x16_split_x', ('iand', a, 0xffff)), ('unpack_32_2x16_split_x', a)),
(('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)),
(('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)),
+ (('unpack_half_2x16_split_x', ('extract_u16', a, 0)), ('unpack_half_2x16_split_x', a)),
+ (('unpack_half_2x16_split_x', ('extract_u16', a, 1)), ('unpack_half_2x16_split_y', a)),
+ (('unpack_32_2x16_split_x', ('extract_u16', a, 0)), ('unpack_32_2x16_split_x', a)),
+ (('unpack_32_2x16_split_x', ('extract_u16', a, 1)), ('unpack_32_2x16_split_y', a)),
+
# Optimize half packing
(('ishl', ('pack_half_2x16', ('vec2', a, 0)), 16), ('pack_half_2x16', ('vec2', 0, a))),
- (('ishr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))),
+ (('ushr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))),
(('iadd', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
('pack_half_2x16', ('vec2', a, b))),
(('ior', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
('pack_half_2x16', ('vec2', a, b))),
+
+ (('ishl', ('pack_half_2x16_split', a, 0), 16), ('pack_half_2x16_split', 0, a)),
+ (('ushr', ('pack_half_2x16_split', 0, a), 16), ('pack_half_2x16_split', a, 0)),
+ (('extract_u16', ('pack_half_2x16_split', 0, a), 1), ('pack_half_2x16_split', a, 0)),
+
+ (('iadd', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)),
+ (('ior', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)),
])
# After the ('extract_u8', a, 0) pattern, above, triggers, there will be
optimizations.extend([((op, ('ishl', 'a@64', 56 - 8 * i), 7), (op, a, i)) for i in range(6, -1, -1)])
optimizations.extend([
- # Word extraction
- (('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0), '!options->lower_extract_word'),
- (('ushr', 'a@32', 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
- (('ishr', ('ishl', 'a@32', 16), 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
- (('ishr', 'a@32', 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
- (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'),
-
# Subtracts
(('ussub_4x8', a, 0), a),
(('ussub_4x8', a, ~0), 0),
(('~fadd', '#a', ('fadd', 'b(is_not_const)', '#c')), ('fadd', ('fadd', a, c), b)),
(('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))),
(('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)),
+ (('iand', '#a', ('iand', 'b(is_not_const)', '#c')), ('iand', ('iand', a, c), b)),
+ (('ior', '#a', ('ior', 'b(is_not_const)', '#c')), ('ior', ('ior', a, c), b)),
+ (('ixor', '#a', ('ixor', 'b(is_not_const)', '#c')), ('ixor', ('ixor', a, c), b)),
# Drop mul-div by the same value when there's no wrapping.
(('idiv', ('imul(no_signed_wrap)', a, b), b), a),
(('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)),
+ (('~fmul', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)),
+ (('~fmul', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))),
+
+ (('fmin3@64', a, b, c), ('fmin@64', a, ('fmin@64', b, c))),
+ (('fmax3@64', a, b, c), ('fmax@64', a, ('fmax@64', b, c))),
+ (('fmed3@64', a, b, c), ('fmax@64', ('fmin@64', ('fmax@64', a, b), c), ('fmin@64', a, b))),
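+ # Sanity check of the fmed3 lowering (illustrative): a = 1, b = 5, c = 3
+ # gives fmax(fmin(fmax(1, 5), 3), fmin(1, 5)) = fmax(3, 1) = 3, the median.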
+
# Misc. lowering
(('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
(('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),
(('bfm', 'bits', ('iand', 31, 'offset')), ('bfm', 'bits', 'offset')),
(('bfm', ('iand', 31, 'bits'), 'offset'), ('bfm', 'bits', 'offset')),
+ # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says:
+ #
+ # If bits is zero, the result will be zero.
+ #
+ # These patterns prevent other patterns from generating invalid results
+ # when bits is zero.
+ (('ubfe', a, b, 0), 0),
+ (('ibfe', a, b, 0), 0),
+
+ (('ubfe', a, 0, '#b'), ('iand', a, ('ushr', 0xffffffff, ('ineg', b)))),
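+ # For example, with b == 8 the ushr count -8 is masked to 24, so the mask
+ # is 0xffffffff >> 24 == 0xff and ubfe(a, 0, 8) becomes iand(a, 0xff).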
+
+ (('b2i32', ('i2b', ('ubfe', a, b, 1))), ('ubfe', a, b, 1)),
+ (('b2i32', ('i2b', ('ibfe', a, b, 1))), ('ubfe', a, b, 1)), # ubfe in the replacement is correct
+ (('ine', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
+ (('ieq', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
+ (('ine', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
+ (('ieq', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
+
(('ibitfield_extract', 'value', 'offset', 'bits'),
('bcsel', ('ieq', 0, 'bits'),
0,
(('pack_half_2x16_split', 'a@32', 'b@32'),
('ior', ('ishl', ('u2u32', ('f2f16', b)), 16), ('u2u32', ('f2f16', a))),
- 'options->lower_pack_half_2x16_split'),
+ 'options->lower_pack_split'),
(('unpack_half_2x16_split_x', 'a@32'),
('f2f32', ('u2u16', a)),
- 'options->lower_unpack_half_2x16_split'),
+ 'options->lower_pack_split'),
(('unpack_half_2x16_split_y', 'a@32'),
('f2f32', ('u2u16', ('ushr', a, 16))),
- 'options->lower_unpack_half_2x16_split'),
+ 'options->lower_pack_split'),
+
+ (('pack_32_2x16_split', 'a@16', 'b@16'),
+ ('ior', ('ishl', ('u2u32', b), 16), ('u2u32', a)),
+ 'options->lower_pack_split'),
+
+ (('unpack_32_2x16_split_x', 'a@32'),
+ ('u2u16', a),
+ 'options->lower_pack_split'),
+
+ (('unpack_32_2x16_split_y', 'a@32'),
+ ('u2u16', ('ushr', 'a', 16)),
+ 'options->lower_pack_split'),
(('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
+ (('imin', ('imax', a, -1), 1), ('isign', a), '!options->lower_isign'),
+ (('imax', ('imin', a, 1), -1), ('isign', a), '!options->lower_isign'),
(('fsign', a), ('fsub', ('b2f', ('flt', 0.0, a)), ('b2f', ('flt', a, 0.0))), 'options->lower_fsign'),
+ (('fadd', ('b2f32', ('flt', 0.0, 'a@32')), ('fneg', ('b2f32', ('flt', 'a@32', 0.0)))), ('fsign', a), '!options->lower_fsign'),
+ (('iadd', ('b2i32', ('flt', 0, 'a@32')), ('ineg', ('b2i32', ('flt', 'a@32', 0)))), ('f2i32', ('fsign', a)), '!options->lower_fsign'),
# Address/offset calculations:
# Drivers supporting imul24 should use the nir_lower_amul() pass, this
# rule converts everyone else to imul:
(('amul', a, b), ('imul', a, b), '!options->has_imul24'),
+ (('umul24', a, b),
+ ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)),
+ '!options->has_umul24'),
+ (('umad24', a, b, c),
+ ('iadd', ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)), c),
+ '!options->has_umad24'),
+
(('imad24_ir3', a, b, 0), ('imul24', a, b)),
(('imad24_ir3', a, 0, c), (c)),
(('imad24_ir3', a, 1, c), ('iadd', a, c)),
optimizations.append(((x2yN, (b2x, a)), (b2y, a)))
# Optimize away x2xN(a@N)
-for t in ['int', 'uint', 'float']:
+for t in ['int', 'uint', 'float', 'bool']:
for N in type_sizes(t):
x2xN = '{0}2{0}{1}'.format(t[0], N)
aN = 'a@{0}'.format(N)
# The N == M case is handled by other optimizations
pass
+# Downcast operations should be able to see through pack
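+# For example, for t == 'u' and N == 16 this generates
+# (('u2u16', ('pack_64_2x32_split', a, b)), ('u2u16', a)): narrowing a 64-bit
+# pack to 16 bits only needs its low dword, i.e. the first source.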
+for t in ['i', 'u']:
+ for N in [8, 16, 32]:
+ x2xN = '{0}2{0}{1}'.format(t, N)
+ optimizations += [
+ ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)),
+ ]
+
# Optimize comparisons with up-casts
for t in ['int', 'uint', 'float']:
for N, M in itertools.product(type_sizes(t), repeat=2):
if N == 1 or N >= M:
continue
+ cond = 'true'
+ if N == 8:
+ cond = 'options->support_8bit_alu'
+ elif N == 16:
+ cond = 'options->support_16bit_alu'
x2xM = '{0}2{0}{1}'.format(t[0], M)
x2xN = '{0}2{0}{1}'.format(t[0], N)
aN = 'a@' + str(N)
bP = 'b@' + str(P)
optimizations += [
- ((xeq, (x2xM, aN), (x2xM, bP)), (xeq, a, (x2xN, b))),
- ((xne, (x2xM, aN), (x2xM, bP)), (xne, a, (x2xN, b))),
- ((xge, (x2xM, aN), (x2xM, bP)), (xge, a, (x2xN, b))),
- ((xlt, (x2xM, aN), (x2xM, bP)), (xlt, a, (x2xN, b))),
- ((xge, (x2xM, bP), (x2xM, aN)), (xge, (x2xN, b), a)),
- ((xlt, (x2xM, bP), (x2xM, aN)), (xlt, (x2xN, b), a)),
+ ((xeq, (x2xM, aN), (x2xM, bP)), (xeq, a, (x2xN, b)), cond),
+ ((xne, (x2xM, aN), (x2xM, bP)), (xne, a, (x2xN, b)), cond),
+ ((xge, (x2xM, aN), (x2xM, bP)), (xge, a, (x2xN, b)), cond),
+ ((xlt, (x2xM, aN), (x2xM, bP)), (xlt, a, (x2xN, b)), cond),
+ ((xge, (x2xM, bP), (x2xM, aN)), (xge, (x2xN, b), a), cond),
+ ((xlt, (x2xM, bP), (x2xM, aN)), (xlt, (x2xN, b), a), cond),
]
# The next bit doesn't work on floats because the range checks would
# and a check that the constant fits in the smaller bit size.
optimizations += [
((xeq, (x2xM, aN), '#b'),
- ('iand', (xeq, a, (x2xN, b)), (xeq, (x2xM, (x2xN, b)), b))),
+ ('iand', (xeq, a, (x2xN, b)), (xeq, (x2xM, (x2xN, b)), b)), cond),
((xne, (x2xM, aN), '#b'),
- ('ior', (xne, a, (x2xN, b)), (xne, (x2xM, (x2xN, b)), b))),
+ ('ior', (xne, a, (x2xN, b)), (xne, (x2xM, (x2xN, b)), b)), cond),
((xlt, (x2xM, aN), '#b'),
('iand', (xlt, xN_min, b),
- ('ior', (xlt, xN_max, b), (xlt, a, (x2xN, b))))),
+ ('ior', (xlt, xN_max, b), (xlt, a, (x2xN, b)))), cond),
((xlt, '#a', (x2xM, bN)),
('iand', (xlt, a, xN_max),
- ('ior', (xlt, a, xN_min), (xlt, (x2xN, a), b)))),
+ ('ior', (xlt, a, xN_min), (xlt, (x2xN, a), b))), cond),
((xge, (x2xM, aN), '#b'),
('iand', (xge, xN_max, b),
- ('ior', (xge, xN_min, b), (xge, a, (x2xN, b))))),
+ ('ior', (xge, xN_min, b), (xge, a, (x2xN, b)))), cond),
((xge, '#a', (x2xM, bN)),
('iand', (xge, a, xN_min),
- ('ior', (xge, a, xN_max), (xge, (x2xN, a), b)))),
+ ('ior', (xge, a, xN_max), (xge, (x2xN, a), b))), cond),
]
def fexp2i(exp, bits):
(('bcsel', a, (op, b, c), (op + '(is_used_once)', d, c)), (op, ('bcsel', a, b, d), c)),
]
-for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos']:
+for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos', 'fneg', 'fabs']:
+ optimizations += [
+ (('bcsel', c, (op + '(is_used_once)', a), (op + '(is_used_once)', b)), (op, ('bcsel', c, a, b))),
+ ]
+
+for op in ['ineg', 'iabs', 'inot', 'isign']:
optimizations += [
- (('bcsel', a, (op + '(is_used_once)', b), (op, c)), (op, ('bcsel', a, b, c))),
- (('bcsel', a, (op, b), (op + '(is_used_once)', c)), (op, ('bcsel', a, b, c))),
+ ((op, ('bcsel', c, '#a', '#b')), ('bcsel', c, (op, a), (op, b))),
]
+# This section contains optimizations to propagate downsizing conversions of
+# constructed vectors into vectors of downsized components. Whether this is
+# useful depends on the SIMD semantics of the backend. On a true SIMD machine,
+# this reduces the register pressure of the vector itself and often enables the
+# conversions to be eliminated via other algebraic rules or constant folding.
+# In the worst case on a SIMD architecture, the propagated conversions may be
+# revectorized via nir_opt_vectorize so instruction count is minimally
+# impacted.
+#
+# On a machine with SIMD-within-a-register only, this counterintuitively hurts
+# instruction count. These machines are the same ones that require
+# vectorize_vec2_16bit, so we predicate the optimizations on that flag not
+# being set.
+#
+# Finally, for scalar architectures, there should be no difference in generated
+# code since it all ends up scalarized at the end, but it might minimally help
+# compile-times.
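+#
+# As an illustration, for i == 2 and T == 'f' the loop below generates
+# (('f2f16', ('vec2', 'a@32', 'b@32')),
+#  ('vec2', ('f2f16', 'a'), ('f2f16', 'b')), '!options->vectorize_vec2_16bit').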
+
+for i in range(2, 4 + 1):
+ for T in ('f', 'u', 'i'):
+ vec_inst = ('vec' + str(i),)
+
+ indices = ['a', 'b', 'c', 'd']
+ suffix_in = tuple((indices[j] + '@32') for j in range(i))
+
+ to_16 = '{}2{}16'.format(T, T)
+ to_mp = '{}2{}mp'.format(T, T)
+
+ out_16 = tuple((to_16, indices[j]) for j in range(i))
+ out_mp = tuple((to_mp, indices[j]) for j in range(i))
+
+ optimizations += [
+ ((to_16, vec_inst + suffix_in), vec_inst + out_16, '!options->vectorize_vec2_16bit'),
+ ((to_mp, vec_inst + suffix_in), vec_inst + out_mp, '!options->vectorize_vec2_16bit')
+ ]
+
# This section contains "late" optimizations that should be run before
# creating ffmas and calling regular optimizations for the final time.
# Optimizations should go here if they help code generation and conflict
(('~fadd', ('ffma(is_used_once)', a, b, ('fmul', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),
('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
- # Convert f2fmp instructions to concrete f2f16 instructions. At this point
+ # Convert *2*mp instructions to concrete *2*16 instructions. At this point
# any conversions that could have been removed will have been removed in
# nir_opt_algebraic so any remaining ones are required.
(('f2fmp', a), ('f2f16', a)),
+ (('i2imp', a), ('i2i16', a)),
+ (('u2ump', a), ('u2u16', a)),
+
+ # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says:
+ #
+ # If bits is zero, the result will be zero.
+ #
+ # These prevent the next two lowerings from generating incorrect results
+ # when bits is zero.
+ (('ubfe', a, b, 0), 0),
+ (('ibfe', a, b, 0), 0),
+
+ # On Intel GPUs, BFE is a 3-source instruction. Like all 3-source
+ # instructions on Intel GPUs, it cannot have immediate values as sources.
+ # There are also limitations on source register strides. As a result, it
+ # is very easy for a 3-source instruction combined with either loads of
+ # immediate values or copies from weird register strides to be more
+ # expensive than the primitive instructions it represents.
+ (('ubfe', a, '#b', '#c'), ('iand', ('ushr', 0xffffffff, ('ineg', c)), ('ushr', a, b)), 'options->lower_bfe_with_two_constants'),
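+ # For example, with b == 8 and c == 4: ushr(0xffffffff, -4) masks the
+ # count to 28, giving 0xf, so the replacement is iand(0xf, ushr(a, 8)),
+ # i.e. bits 11:8 of a.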
+
+ # b is the lowest order bit to be extracted and c is the number of bits to
+ # extract. The inner shift removes the bits above b + c by shifting left by
+ # 32 - (b + c); since ishl only sees the low 5 bits of the shift count,
+ # -(b + c) is equivalent. The outer shift moves the bit that was at b down
+ # to bit zero: after the first shift that bit is at b + (32 - (b + c)),
+ # i.e. 32 - c, so it must be shifted right by 32 - c, i.e. -c, bits.
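+ # Worked example (illustrative): with b == 8 and c == 4 the inner count is
+ # -(8 + 4) = -12, masked to 20 == 32 - 12, and the outer count is -4,
+ # masked to 28 == 32 - 4; the field that started at bits 11:8 ends up
+ # sign-extended in bits 3:0.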
+ (('ibfe', a, '#b', '#c'), ('ishr', ('ishl', a, ('ineg', ('iadd', b, c))), ('ineg', c)), 'options->lower_bfe_with_two_constants'),
+
+ # Clean up no-op shifts that may result from the bfe lowerings.
+ (('ishl', a, 0), a),
+ (('ishl', a, -32), a),
+ (('ishr', a, 0), a),
+ (('ishr', a, -32), a),
+ (('ushr', a, 0), a),
]
for op in ['fadd']:
(('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
]
+distribute_src_mods = [
+ # Try to remove some spurious negations rather than pushing them down.
+ (('fmul', ('fneg', a), ('fneg', b)), ('fmul', a, b)),
+ (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),
+ (('fdot_replicated2', ('fneg', a), ('fneg', b)), ('fdot_replicated2', a, b)),
+ (('fdot_replicated3', ('fneg', a), ('fneg', b)), ('fdot_replicated3', a, b)),
+ (('fdot_replicated4', ('fneg', a), ('fneg', b)), ('fdot_replicated4', a, b)),
+ (('fneg', ('fneg', a)), a),
+
+ (('fneg', ('fmul(is_used_once)', a, b)), ('fmul', ('fneg', a), b)),
+ (('fabs', ('fmul(is_used_once)', a, b)), ('fmul', ('fabs', a), ('fabs', b))),
+
+ (('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),
+ (('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)),
+ (('fneg', ('fadd(is_used_once)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),
+
+ # Note that fmin <-> fmax. I don't think there is a way to distribute
+ # fabs() into fmin or fmax.
+ (('fneg', ('fmin(is_used_once)', a, b)), ('fmax', ('fneg', a), ('fneg', b))),
+ (('fneg', ('fmax(is_used_once)', a, b)), ('fmin', ('fneg', a), ('fneg', b))),
+
+ (('fneg', ('fdot_replicated2(is_used_once)', a, b)), ('fdot_replicated2', ('fneg', a), b)),
+ (('fneg', ('fdot_replicated3(is_used_once)', a, b)), ('fdot_replicated3', ('fneg', a), b)),
+ (('fneg', ('fdot_replicated4(is_used_once)', a, b)), ('fdot_replicated4', ('fneg', a), b)),
+
+ # fdph works mostly like fdot, but to get the correct result, the negation
+ # must be applied to the second source.
+ (('fneg', ('fdph_replicated(is_used_once)', a, b)), ('fdph_replicated', a, ('fneg', b))),
+
+ (('fneg', ('fsign(is_used_once)', a)), ('fsign', ('fneg', a))),
+ (('fabs', ('fsign(is_used_once)', a)), ('fsign', ('fabs', a))),
+]
+
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma",
before_ffma_optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late",
late_optimizations).render())
+print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_distribute_src_mods",
+ distribute_src_mods).render())