# If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1
# If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0
(('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),
+ (('~fsub', 1.0, ('fsat', a)), ('fsat', ('fsub', 1.0, a))),
# 1 - ((1 - a) * (1 - b))
# 1 - (1 - a - b + a*b)
# (a * #b) << #c
# a * (#b << #c)
(('ishl', ('imul', a, '#b'), '#c'), ('imul', a, ('ishl', b, c))),
+]
+
+# Care must be taken here. Shifts in NIR uses only the lower log2(bitsize)
+# bits of the second source. These replacements must correctly handle the
+# case where (b % bitsize) + (c % bitsize) >= bitsize.
+for s in [8, 16, 32, 64]:
+ mask = (1 << s) - 1
+
+ ishl = "ishl@{}".format(s)
+ ishr = "ishr@{}".format(s)
+ ushr = "ushr@{}".format(s)
+
+ in_bounds = ('ult', ('iadd', ('iand', b, mask), ('iand', c, mask)), s)
+
+ optimizations.extend([
+ ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds, (ishl, a, ('iadd', b, c)), 0)),
+ ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds, (ushr, a, ('iadd', b, c)), 0)),
+
+ # To get get -1 for large shifts of negative values, ishr must instead
+ # clamp the shift count to the maximum value.
+ ((ishr, (ishr, a, '#b'), '#c'),
+ (ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))),
+ ])
+
+optimizations.extend([
+ # This is common for address calculations. Reassociating may enable the
+ # 'a<<c' to be CSE'd. It also helps architectures that have an ISHLADD
+ # instruction or a constant offset field for in load / store instructions.
+ (('ishl', ('iadd', a, '#b'), '#c'), ('iadd', ('ishl', a, c), ('ishl', b, c))),
# Comparison simplifications
(('~inot', ('flt', a, b)), ('fge', a, b)),
(('~fge', ('fadd', a, b), a), ('fge', b, 0.0)),
(('~feq', ('fadd', a, b), a), ('feq', b, 0.0)),
(('~fne', ('fadd', a, b), a), ('fne', b, 0.0)),
+ (('~flt', ('fadd(is_used_once)', a, '#b'), '#c'), ('flt', a, ('fadd', c, ('fneg', b)))),
+ (('~flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('flt', ('fneg', ('fadd', c, b)), a)),
+ (('~fge', ('fadd(is_used_once)', a, '#b'), '#c'), ('fge', a, ('fadd', c, ('fneg', b)))),
+ (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fge', ('fneg', ('fadd', c, b)), a)),
+ (('~feq', ('fadd(is_used_once)', a, '#b'), '#c'), ('feq', a, ('fadd', c, ('fneg', b)))),
+ (('~feq', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('feq', ('fneg', ('fadd', c, b)), a)),
+ (('~fne', ('fadd(is_used_once)', a, '#b'), '#c'), ('fne', a, ('fadd', c, ('fneg', b)))),
+ (('~fne', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fne', ('fneg', ('fadd', c, b)), a)),
# Cannot remove the addition from ilt or ige due to overflow.
(('ieq', ('iadd', a, b), a), ('ieq', b, 0)),
(('f2u', ('ftrunc', a)), ('f2u', a)),
(('i2b', ('ineg', a)), ('i2b', a)),
(('i2b', ('iabs', a)), ('i2b', a)),
- (('fabs', ('b2f', a)), ('b2f', a)),
- (('iabs', ('b2i', a)), ('b2i', a)),
(('inot', ('f2b1', a)), ('feq', a, 0.0)),
# Ironically, mark these as imprecise because removing the conversions may
(('~f2u32', ('i2f', 'a@32')), a),
(('~f2u32', ('u2f', 'a@32')), a),
+ (('ffloor', 'a(is_integral)'), a),
+ (('fceil', 'a(is_integral)'), a),
+ (('ftrunc', 'a(is_integral)'), a),
+ # fract(x) = x - floor(x), so fract(NaN) = NaN
+ (('~ffract', 'a(is_integral)'), 0.0),
+ (('fabs', 'a(is_not_negative)'), a),
+ (('iabs', 'a(is_not_negative)'), a),
+ (('fsat', 'a(is_not_positive)'), 0.0),
+
# Section 5.4.1 (Conversion and Scalar Constructors) of the GLSL 4.60 spec
# says:
#
(('ilt', ('f2u', a), b), ('ilt', ('f2i', a), b)),
(('ilt', b, ('f2u', a)), ('ilt', b, ('f2i', a))),
- (('~fmin', ('fabs', a), 1.0), ('fsat', ('fabs', a)), '!options->lower_fsat'),
+ (('~fmin', 'a(is_not_negative)', 1.0), ('fsat', a), '!options->lower_fsat'),
# The result of the multiply must be in [-1, 0], so the result of the ffma
# must be in [0, 1].
(('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)),
(('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)),
+ (('fne', 'a(is_not_zero)', 0.0), True),
+ (('feq', 'a(is_not_zero)', 0.0), False),
+
+ # The results expecting true, must be marked imprecise. The results
+ # expecting false are fine because NaN compared >= or < anything is false.
+
+ (('~fge', 'a(is_not_negative)', 'b(is_not_positive)'), True),
+ (('fge', 'b(is_not_positive)', 'a(is_gt_zero)'), False),
+ (('fge', 'a(is_lt_zero)', 'b(is_not_negative)'), False),
+ (('~fge', 'b(is_not_negative)', 'a(is_not_positive)'), True),
+
+ (('flt', 'a(is_not_negative)', 'b(is_not_positive)'), False),
+ (('~flt', 'b(is_not_positive)', 'a(is_gt_zero)'), True),
+ (('~flt', 'a(is_lt_zero)', 'b(is_not_negative)'), True),
+ (('flt', 'b(is_not_negative)', 'a(is_not_positive)'), False),
+
+ (('ine', 'a(is_not_zero)', 0), True),
+ (('ieq', 'a(is_not_zero)', 0), False),
+
+ (('ige', 'a(is_not_negative)', 'b(is_not_positive)'), True),
+ (('ige', 'b(is_not_positive)', 'a(is_gt_zero)'), False),
+ (('ige', 'a(is_lt_zero)', 'b(is_not_negative)'), False),
+ (('ige', 'b(is_not_negative)', 'a(is_not_positive)'), True),
+
+ (('ilt', 'a(is_not_negative)', 'b(is_not_positive)'), False),
+ (('ilt', 'b(is_not_positive)', 'a(is_gt_zero)'), True),
+ (('ilt', 'a(is_lt_zero)', 'b(is_not_negative)'), True),
+ (('ilt', 'b(is_not_negative)', 'a(is_not_positive)'), False),
+
+ (('ult', 0, 'a(is_gt_zero)'), True),
+
# Packing and then unpacking does nothing
(('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a),
(('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b),
(('ishr', 'a@16', 8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
(('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'),
(('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'),
- (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte')
-]
+ (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'),
+
+ # Useless masking before unpacking
+ (('unpack_half_2x16_split_x', ('iand', a, 0xffff)), ('unpack_half_2x16_split_x', a)),
+ (('unpack_32_2x16_split_x', ('iand', a, 0xffff)), ('unpack_32_2x16_split_x', a)),
+ (('unpack_64_2x32_split_x', ('iand', a, 0xffffffff)), ('unpack_64_2x32_split_x', a)),
+ (('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)),
+ (('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)),
+ (('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)),
+])
# After the ('extract_u8', a, 0) pattern, above, triggers, there will be
# patterns like those below.
# Subtracts
(('~fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)),
(('isub', a, ('isub', 0, b)), ('iadd', a, b)),
+ (('isub', ('iadd', a, b), b), a),
+ (('~fsub', ('fadd', a, b), b), a),
(('ussub_4x8', a, 0), a),
(('ussub_4x8', a, ~0), 0),
(('fsub', a, b), ('fadd', a, ('fneg', b)), 'options->lower_sub'),
return step5
-optimizations += [(bitfield_reverse('x@32'), ('bitfield_reverse', 'x'))]
+optimizations += [(bitfield_reverse('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')]
# For any float comparison operation, "cmp", if you have "a == a && a cmp b"
# then the "a == a" is redundant because it's equivalent to "a is not NaN"