(('usadd_4x8', a, ~0), ~0),
(('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
(('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
+ (('iand', ('ior', a, b), ('ior', a, c)), ('ior', a, ('iand', b, c))),
+ (('ior', ('iand', a, b), ('iand', a, c)), ('iand', a, ('ior', b, c))),
(('~fadd', ('fneg', a), a), 0.0),
(('iadd', ('ineg', a), a), 0),
(('iadd', ('ineg', a), ('iadd', a, b)), b),
# Comparison simplifications
(('~inot', ('flt', a, b)), ('fge', a, b)),
(('~inot', ('fge', a, b)), ('flt', a, b)),
- (('inot', ('feq', a, b)), ('fne', a, b)),
- (('inot', ('fne', a, b)), ('feq', a, b)),
+ (('inot', ('feq', a, b)), ('fneu', a, b)),
+ (('inot', ('fneu', a, b)), ('feq', a, b)),
(('inot', ('ilt', a, b)), ('ige', a, b)),
(('inot', ('ult', a, b)), ('uge', a, b)),
(('inot', ('ige', a, b)), ('ilt', a, b)),
(('inot', ('ieq', a, b)), ('ine', a, b)),
(('inot', ('ine', a, b)), ('ieq', a, b)),
- (('iand', ('feq', a, b), ('fne', a, b)), False),
+ (('iand', ('feq', a, b), ('fneu', a, b)), False),
(('iand', ('flt', a, b), ('flt', b, a)), False),
(('iand', ('ieq', a, b), ('ine', a, b)), False),
(('iand', ('ilt', a, b), ('ilt', b, a)), False),
(('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
(('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
(('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
- (('fne', ('fneg', a), ('fneg', b)), ('fne', b, a)),
+ (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)),
(('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
(('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
(('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
(('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
- (('fne', ('fneg', a), -1.0), ('fne', 1.0, a)),
+ (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)),
(('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),
(('flt', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('flt', a, b)),
(('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),
(('fge', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('fge', b, a)),
(('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),
- (('fne', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fne', a, b)),
+ (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),
(('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),
(('flt', ('fsat(is_used_once)', a), 1.0), ('flt', a, 1.0)),
(('fge', ('fneg', ('b2f', 'a@1')), 0.0), ('inot', a)),
- (('fne', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),
- (('fne', ('fmax', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),
- (('fne', ('bcsel', a, 1.0, ('b2f', 'b@1')) , 0.0), ('ior', a, b)),
- (('fne', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('ior', a, b)),
- (('fne', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),
- (('fne', ('fmin', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),
- (('fne', ('bcsel', a, ('b2f', 'b@1'), 0.0) , 0.0), ('iand', a, b)),
- (('fne', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ixor', a, b)),
- (('fne', ('b2f', 'a@1') , ('b2f', 'b@1') ), ('ixor', a, b)),
- (('fne', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ixor', a, b)),
+ (('fneu', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),
+ (('fneu', ('fmax', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),
+ (('fneu', ('bcsel', a, 1.0, ('b2f', 'b@1')) , 0.0), ('ior', a, b)),
+ (('fneu', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('ior', a, b)),
+ (('fneu', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),
+ (('fneu', ('fmin', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),
+ (('fneu', ('bcsel', a, ('b2f', 'b@1'), 0.0) , 0.0), ('iand', a, b)),
+ (('fneu', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ixor', a, b)),
+ (('fneu', ('b2f', 'a@1') , ('b2f', 'b@1') ), ('ixor', a, b)),
+ (('fneu', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ixor', a, b)),
(('feq', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
(('feq', ('fmax', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
(('feq', ('bcsel', a, 1.0, ('b2f', 'b@1')) , 0.0), ('inot', ('ior', a, b))),
(('~flt', ('fadd', a, b), a), ('flt', b, 0.0)),
(('~fge', ('fadd', a, b), a), ('fge', b, 0.0)),
(('~feq', ('fadd', a, b), a), ('feq', b, 0.0)),
- (('~fne', ('fadd', a, b), a), ('fne', b, 0.0)),
+ (('~fneu', ('fadd', a, b), a), ('fneu', b, 0.0)),
(('~flt', ('fadd(is_used_once)', a, '#b'), '#c'), ('flt', a, ('fadd', c, ('fneg', b)))),
(('~flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('flt', ('fneg', ('fadd', c, b)), a)),
(('~fge', ('fadd(is_used_once)', a, '#b'), '#c'), ('fge', a, ('fadd', c, ('fneg', b)))),
(('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fge', ('fneg', ('fadd', c, b)), a)),
(('~feq', ('fadd(is_used_once)', a, '#b'), '#c'), ('feq', a, ('fadd', c, ('fneg', b)))),
(('~feq', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('feq', ('fneg', ('fadd', c, b)), a)),
- (('~fne', ('fadd(is_used_once)', a, '#b'), '#c'), ('fne', a, ('fadd', c, ('fneg', b)))),
- (('~fne', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fne', ('fneg', ('fadd', c, b)), a)),
+ (('~fneu', ('fadd(is_used_once)', a, '#b'), '#c'), ('fneu', a, ('fadd', c, ('fneg', b)))),
+ (('~fneu', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fneu', ('fneg', ('fadd', c, b)), a)),
# Cannot remove the addition from ilt or ige due to overflow.
(('ieq', ('iadd', a, b), a), ('ieq', b, 0)),
(('feq', ('fmin', ('fneg', ('b2f', 'a@1')), 'b@1'), 0.0), ('iand', ('inot', a), ('fge', b, 0.0))),
(('feq', ('b2f', 'a@1'), 0.0), ('inot', a)),
- (('~fne', ('b2f', 'a@1'), 0.0), a),
+ (('~fneu', ('b2f', 'a@1'), 0.0), a),
(('ieq', ('b2i', 'a@1'), 0), ('inot', a)),
(('ine', ('b2i', 'a@1'), 0), a),
- (('fne', ('u2f', a), 0.0), ('ine', a, 0)),
+ (('fneu', ('u2f', a), 0.0), ('ine', a, 0)),
(('feq', ('u2f', a), 0.0), ('ieq', a, 0)),
(('fge', ('u2f', a), 0.0), True),
(('fge', 0.0, ('u2f', a)), ('uge', 0, a)), # ieq instead?
(('flt', ('u2f', a), 0.0), False),
(('flt', 0.0, ('u2f', a)), ('ult', 0, a)), # ine instead?
- (('fne', ('i2f', a), 0.0), ('ine', a, 0)),
+ (('fneu', ('i2f', a), 0.0), ('ine', a, 0)),
(('feq', ('i2f', a), 0.0), ('ieq', a, 0)),
(('fge', ('i2f', a), 0.0), ('ige', a, 0)),
(('fge', 0.0, ('i2f', a)), ('ige', 0, a)),
# fabs(a) > 0.0
# fabs(a) != 0.0 because fabs(a) must be >= 0
# a != 0.0
- (('~flt', 0.0, ('fabs', a)), ('fne', a, 0.0)),
+ (('~flt', 0.0, ('fabs', a)), ('fneu', a, 0.0)),
# -fabs(a) < 0.0
# fabs(a) > 0.0
- (('~flt', ('fneg', ('fabs', a)), 0.0), ('fne', a, 0.0)),
+ (('~flt', ('fneg', ('fabs', a)), 0.0), ('fneu', a, 0.0)),
# 0.0 >= fabs(a)
# 0.0 == fabs(a) because fabs(a) must be >= 0
# !((a >= 0.0) && (a <= 1.0))
# !(a == fsat(a))
# a != fsat(a)
- (('ior', ('flt', a, 0.0), ('flt', 1.0, a)), ('fne', a, ('fsat', a)), '!options->lower_fsat'),
+ (('ior', ('flt', a, 0.0), ('flt', 1.0, a)), ('fneu', a, ('fsat', a)), '!options->lower_fsat'),
(('fmax', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('ior', a, b))),
(('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('ior', a, b)))),
(('fmax', a, ('fabs', a)), ('fabs', a)),
(('imax', a, ('iabs', a)), ('iabs', a)),
(('fmax', a, ('fneg', a)), ('fabs', a)),
- (('imax', a, ('ineg', a)), ('iabs', a)),
+ (('imax', a, ('ineg', a)), ('iabs', a), '!options->lower_iabs'),
(('~fmax', ('fabs', a), 0.0), ('fabs', a)),
(('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
# fmax(fmin(a, 1.0), 0.0) is inexact because it returns 1.0 on NaN, while
(('ine', ('ineg', ('b2i32', 'a@1')), ('ineg', ('b2i32', 'b@1'))), ('ine', a, b)),
(('b2i32', ('ine', 'a@1', 'b@1')), ('b2i32', ('ixor', a, b))),
- (('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('ior', a, b), 0), '!options->lower_bitops'),
- (('ior', ('ine', 'a@32', 0), ('ine', 'b@32', 0)), ('ine', ('ior', a, b), 0), '!options->lower_bitops'),
+ (('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('umax', a, b), 0)),
+ (('ior', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('umin', a, b), 0)),
+ (('iand', ('ine', 'a@32', 0), ('ine', 'b@32', 0)), ('ine', ('umin', a, b), 0)),
+ (('ior', ('ine', 'a@32', 0), ('ine', 'b@32', 0)), ('ine', ('umax', a, b), 0)),
# This pattern occurs courtesy of __flt64_nonnan in the soft-fp64 code.
# The first part of the iand comes from the !__feq64_nonnan.
(('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'),
(('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'),
(('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'),
- (('sne', a, b), ('b2f', ('fne', a, b)), 'options->lower_scmp'),
+ (('sne', a, b), ('b2f', ('fneu', a, b)), 'options->lower_scmp'),
(('seq', ('seq', a, b), 1.0), ('seq', a, b)),
(('seq', ('sne', a, b), 1.0), ('sne', a, b)),
(('seq', ('slt', a, b), 1.0), ('slt', a, b)),
(('fany_nequal2', a, b), ('fmax', ('sne', 'a.x', 'b.x'), ('sne', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
(('fany_nequal3', a, b), ('fsat', ('fdot3', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
(('fany_nequal4', a, b), ('fsat', ('fdot4', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
- (('fne', ('fneg', a), a), ('fne', a, 0.0)),
+ (('fneu', ('fneg', a), a), ('fneu', a, 0.0)),
(('feq', ('fneg', a), a), ('feq', a, 0.0)),
# Emulating booleans
(('imul', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),
+ (('iand', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),
+ (('ior', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior', a, b))),
(('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))),
(('fsat', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('b2f', ('ior', a, b))),
(('iand', 'a@bool32', 1.0), ('b2f', a)),
# D3D Boolean emulation
(('bcsel', a, -1, 0), ('ineg', ('b2i', 'a@1'))),
(('bcsel', a, 0, -1), ('ineg', ('b2i', ('inot', a)))),
+ (('bcsel', a, 1, 0), ('b2i', 'a@1')),
+ (('bcsel', a, 0, 1), ('b2i', ('inot', a))),
(('iand', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
('ineg', ('b2i', ('iand', a, b)))),
(('ior', ('ineg', ('b2i','a@1')), ('ineg', ('b2i', 'b@1'))),
(('~f2u32', ('i2f', 'a@32')), a),
(('~f2u32', ('u2f', 'a@32')), a),
- # Conversions from float16 to float32 and back can always be removed
+ # Conversions from 16 bits to 32 bits and back can always be removed
(('f2f16', ('f2f32', 'a@16')), a),
(('f2fmp', ('f2f32', 'a@16')), a),
+ (('i2i16', ('i2i32', 'a@16')), a),
+ (('i2imp', ('i2i32', 'a@16')), a),
+ (('u2u16', ('u2u32', 'a@16')), a),
+ (('u2ump', ('u2u32', 'a@16')), a),
(('f2f16', ('b2f32', 'a@1')), ('b2f16', a)),
(('f2fmp', ('b2f32', 'a@1')), ('b2f16', a)),
- # Conversions to float16 would be lossy so they should only be removed if
+ (('i2i16', ('b2i32', 'a@1')), ('b2i16', a)),
+ (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)),
+ (('u2u16', ('b2i32', 'a@1')), ('b2i16', a)),
+ (('u2ump', ('b2i32', 'a@1')), ('b2i16', a)),
+ # Conversions to 16 bits would be lossy so they should only be removed if
# the instruction was generated by the precision lowering pass.
(('f2f32', ('f2fmp', 'a@32')), a),
+ (('i2i32', ('i2imp', 'a@32')), a),
+ (('u2u32', ('u2ump', 'a@32')), a),
(('ffloor', 'a(is_integral)'), a),
(('fceil', 'a(is_integral)'), a),
(('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)),
(('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)),
- (('fne', 'a(is_not_zero)', 0.0), True),
+ (('fneu', 'a(is_not_zero)', 0.0), True),
(('feq', 'a(is_not_zero)', 0.0), False),
# In this chart, + means value > 0 and - means value < 0.
# Packing and then unpacking does nothing
(('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a),
(('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b),
+ (('unpack_64_2x32', ('pack_64_2x32_split', a, b)), ('vec2', a, b)),
+ (('unpack_64_2x32', ('pack_64_2x32', a)), a),
(('pack_64_2x32_split', ('unpack_64_2x32_split_x', a),
('unpack_64_2x32_split_y', a)), a),
+ (('pack_64_2x32', ('vec2', ('unpack_64_2x32_split_x', a),
+ ('unpack_64_2x32_split_y', a))), a),
+ (('pack_64_2x32', ('unpack_64_2x32', a)), a),
# Comparing two halves of an unpack separately. While this optimization
# should be correct for non-constant values, it's less obvious that it's
(('ibfe', a, 0, 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
(('ibfe', a, 16, 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
+ # Lower pack/unpack
+ (('pack_64_2x32_split', a, b), ('ior', ('u2u64', a), ('ishl', ('u2u64', b), 32)), 'options->lower_pack_64_2x32_split'),
+ (('pack_32_2x16_split', a, b), ('ior', ('u2u32', a), ('ishl', ('u2u32', b), 16)), 'options->lower_pack_32_2x16_split'),
+ (('unpack_64_2x32_split_x', a), ('u2u32', a), 'options->lower_unpack_64_2x32_split'),
+ (('unpack_64_2x32_split_y', a), ('u2u32', ('ushr', a, 32)), 'options->lower_unpack_64_2x32_split'),
+ (('unpack_32_2x16_split_x', a), ('u2u16', a), 'options->lower_unpack_32_2x16_split'),
+ (('unpack_32_2x16_split_y', a), ('u2u16', ('ushr', a, 16)), 'options->lower_unpack_32_2x16_split'),
+
# Useless masking before unpacking
(('unpack_half_2x16_split_x', ('iand', a, 0xffff)), ('unpack_half_2x16_split_x', a)),
(('unpack_32_2x16_split_x', ('iand', a, 0xffff)), ('unpack_32_2x16_split_x', a)),
(('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)),
- (('fmin3@64', a, b, c), ('fmin@64', a, ('fmin@64', b, c))),
- (('fmax3@64', a, b, c), ('fmax@64', a, ('fmax@64', b, c))),
- (('fmed3@64', a, b, c), ('fmax@64', ('fmin@64', ('fmax@64', a, b), c), ('fmin@64', a, b))),
+ (('~fmul', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)),
+ (('~fmul', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))),
+ (('~bcsel', ('flt', a, 0.0), ('fneg', a), a), ('fabs', a)),
+
+ (('bcsel', a, ('bcsel', b, c, d), d), ('bcsel', ('iand', a, b), c, d)),
+ (('bcsel', a, b, ('bcsel', c, b, d)), ('bcsel', ('ior', a, c), b, d)),
# Misc. lowering
(('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
'options->lower_pack_split'),
(('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
+ (('imin', ('imax', a, -1), 1), ('isign', a), '!options->lower_isign'),
+ (('imax', ('imin', a, 1), -1), ('isign', a), '!options->lower_isign'),
(('fsign', a), ('fsub', ('b2f', ('flt', 0.0, a)), ('b2f', ('flt', a, 0.0))), 'options->lower_fsign'),
+ (('fadd', ('b2f32', ('flt', 0.0, 'a@32')), ('fneg', ('b2f32', ('flt', 'a@32', 0.0)))), ('fsign', a), '!options->lower_fsign'),
+ (('iadd', ('b2i32', ('flt', 0, 'a@32')), ('ineg', ('b2i32', ('flt', 'a@32', 0)))), ('f2i32', ('fsign', a)), '!options->lower_fsign'),
# Address/offset calculations:
# Drivers supporting imul24 should use the nir_lower_amul() pass, this
('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_add_sat'),
]
-invert = OrderedDict([('feq', 'fne'), ('fne', 'feq')])
+invert = OrderedDict([('feq', 'fneu'), ('fneu', 'feq')])
for left, right in itertools.combinations_with_replacement(invert.keys(), 2):
optimizations.append((('inot', ('ior(is_used_once)', (left, a, b), (right, c, d))),
aN = 'a@' + str(N)
bN = 'b@' + str(N)
xeq = 'feq' if t == 'float' else 'ieq'
- xne = 'fne' if t == 'float' else 'ine'
+ xne = 'fneu' if t == 'float' else 'ine'
xge = '{0}ge'.format(t[0])
xlt = '{0}lt'.format(t[0])
('ior', (xge, a, xN_max), (xge, (x2xN, a), b))), cond),
]
+# Convert masking followed by signed downcast to just unsigned downcast.  Each mask below is exactly as wide as the destination type of the conversion.
+optimizations += [
+ (('i2i32', ('iand', 'a@64', 0xffffffff)), ('u2u32', a)),  # 32-bit result from a 64-bit source
+ (('i2i16', ('iand', 'a@32', 0xffff)), ('u2u16', a)),  # 16-bit result from 32- and 64-bit sources
+ (('i2i16', ('iand', 'a@64', 0xffff)), ('u2u16', a)),
+ (('i2i8', ('iand', 'a@16', 0xff)), ('u2u8', a)),  # 8-bit result from 16-, 32- and 64-bit sources
+ (('i2i8', ('iand', 'a@32', 0xff)), ('u2u8', a)),
+ (('i2i8', ('iand', 'a@64', 0xff)), ('u2u8', a)),
+]
+
def fexp2i(exp, bits):
# Generate an expression which constructs value 2.0^exp or 0.0.
#
# which constant folding will eat for lunch. The resulting ternary will
# further get cleaned up by the boolean reductions above and we will be
# left with just the original variable "a".
-for op in ['flt', 'fge', 'feq', 'fne',
+for op in ['flt', 'fge', 'feq', 'fneu',
'ilt', 'ige', 'ieq', 'ine', 'ult', 'uge']:
optimizations += [
((op, ('bcsel', 'a', '#b', '#c'), '#d'),
(('bcsel', a, (op, b, c), (op + '(is_used_once)', d, c)), (op, ('bcsel', a, b, d), c)),
]
-for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos']:
+for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos', 'fneg', 'fabs', 'fsign']:
optimizations += [
- (('bcsel', a, (op + '(is_used_once)', b), (op, c)), (op, ('bcsel', a, b, c))),
- (('bcsel', a, (op, b), (op + '(is_used_once)', c)), (op, ('bcsel', a, b, c))),
+ (('bcsel', c, (op + '(is_used_once)', a), (op + '(is_used_once)', b)), (op, ('bcsel', c, a, b))),
]
+for op in ['ineg', 'iabs', 'inot', 'isign']:  # one rule is generated per unary integer op
+ optimizations += [
+ ((op, ('bcsel', c, '#a', '#b')), ('bcsel', c, (op, a), (op, b))),  # op(select(c, a, b)) -> select(c, op(a), op(b)); NOTE(review): '#a'/'#b' presumably restrict the match to constant arms -- confirm against nir_algebraic
+ ]
+
+# This section contains optimizations to propagate downsizing conversions of
+# constructed vectors into vectors of downsized components. Whether this is
+# useful depends on the SIMD semantics of the backend. On a true SIMD machine,
+# this reduces the register pressure of the vector itself and often enables the
+# conversions to be eliminated via other algebraic rules or constant folding.
+# In the worst case on a SIMD architecture, the propagated conversions may be
+# revectorized via nir_opt_vectorize so instruction count is minimally
+# impacted.
+#
+# On a machine with SIMD-within-a-register only, this actually
+# counterintuitively hurts instruction count. These machines are the same that
+# require vectorize_vec2_16bit, so we predicate the optimizations on that flag
+# not being set.
+#
+# Finally for scalar architectures, there should be no difference in generated
+# code since it all ends up scalarized at the end, but it might minimally help
+# compile-times.
+
+for i in range(2, 4 + 1):  # i = 2, 3, 4, giving 'vec2', 'vec3', 'vec4'
+ for T in ('f', 'u', 'i'):  # type prefix used to build the conversion opcode names
+ vec_inst = ('vec' + str(i),)  # one-element tuple so operands can be appended with '+'
+
+ indices = ['a', 'b', 'c', 'd']
+ suffix_in = tuple((indices[j] + '@32') for j in range(i))  # source operands 'a@32', 'b@32', ... (first i names)
+
+ to_16 = '{}2{}16'.format(T, T)  # 'f2f16', 'u2u16' or 'i2i16'
+ to_mp = '{}2{}mp'.format(T, T)  # 'f2fmp', 'u2ump' or 'i2imp'
+
+ out_16 = tuple((to_16, indices[j]) for j in range(i))  # the conversion applied to each component separately
+ out_mp = tuple((to_mp, indices[j]) for j in range(i))
+
+ optimizations += [
+ ((to_16, vec_inst + suffix_in), vec_inst + out_16, '!options->vectorize_vec2_16bit'),  # conv(vecN(a, ...)) -> vecN(conv(a), ...)
+ ((to_mp, vec_inst + suffix_in), vec_inst + out_mp, '!options->vectorize_vec2_16bit')
+ ]
+
# This section contains "late" optimizations that should be run before
# creating ffmas and calling regular optimizations for the final time.
# Optimizations should go here if they help code generation and conflict
(('~fge', ('fadd', a, b), 0.0), ('fge', a, ('fneg', b))),
(('~fge', ('fneg', ('fadd', a, b)), 0.0), ('fge', ('fneg', a), b)),
(('~feq', ('fadd', a, b), 0.0), ('feq', a, ('fneg', b))),
- (('~fne', ('fadd', a, b), 0.0), ('fne', a, ('fneg', b))),
+ (('~fneu', ('fadd', a, b), 0.0), ('fneu', a, ('fneg', b))),
# nir_lower_to_source_mods will collapse this, but its existence during the
# optimization loop can prevent other optimizations.
(('iadd', 'a', ('ineg', 'b')), ('isub', 'a', 'b'), '!options->lower_sub'),
(('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'),
(('ineg', a), ('isub', 0, a), 'options->lower_negate'),
+ (('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'),
# These are duplicated from the main optimizations table. The late
# patterns that rearrange expressions like x - .5 < 0 to x < .5 can create
(('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),
(('fge', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('fge', b, a)),
(('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),
- (('fne', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fne', a, b)),
+ (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),
(('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),
(('flt', ('fsat(is_used_once)', a), 1.0), ('flt', a, 1.0)),
(('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
(('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
(('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
- (('fne', ('fneg', a), ('fneg', b)), ('fne', b, a)),
+ (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)),
(('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
(('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
(('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
(('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
- (('fne', ('fneg', a), -1.0), ('fne', 1.0, a)),
+ (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)),
(('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),
(('ior', a, a), a),
(('~fadd', ('ffma(is_used_once)', a, b, ('fmul', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),
('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
- # Convert f2fmp instructions to concrete f2f16 instructions. At this point
+ # Convert *2*mp instructions to concrete *2*16 instructions. At this point
# any conversions that could have been removed will have been removed in
# nir_opt_algebraic so any remaining ones are required.
(('f2fmp', a), ('f2f16', a)),
+ (('i2imp', a), ('i2i16', a)),
+ (('u2ump', a), ('u2u16', a)),
# Section 8.8 (Integer Functions) of the GLSL 4.60 spec says:
#
(('fdot_replicated4', ('fneg', a), ('fneg', b)), ('fdot_replicated4', a, b)),
(('fneg', ('fneg', a)), a),
+ (('fneg', ('fmul(is_used_once)', a, b)), ('fmul', ('fneg', a), b)),
+ (('fabs', ('fmul(is_used_once)', a, b)), ('fmul', ('fabs', a), ('fabs', b))),
+
(('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),
(('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)),
(('fneg', ('fadd(is_used_once)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),
(('fneg', ('fmin(is_used_once)', a, b)), ('fmax', ('fneg', a), ('fneg', b))),
(('fneg', ('fmax(is_used_once)', a, b)), ('fmin', ('fneg', a), ('fneg', b))),
+ (('fneg', ('fdot_replicated2(is_used_once)', a, b)), ('fdot_replicated2', ('fneg', a), b)),
+ (('fneg', ('fdot_replicated3(is_used_once)', a, b)), ('fdot_replicated3', ('fneg', a), b)),
+ (('fneg', ('fdot_replicated4(is_used_once)', a, b)), ('fdot_replicated4', ('fneg', a), b)),
+
# fdph works mostly like fdot, but to get the correct result, the negation
# must be applied to the second source.
(('fneg', ('fdph_replicated(is_used_once)', a, b)), ('fdph_replicated', a, ('fneg', b))),
- (('fabs', ('fdph_replicated(is_used_once)', a, b)), ('fdph_replicated', ('fabs', a), ('fabs', b))),
(('fneg', ('fsign(is_used_once)', a)), ('fsign', ('fneg', a))),
(('fabs', ('fsign(is_used_once)', a)), ('fsign', ('fabs', a))),
]
-for op in ['fmul', 'fdot_replicated2', 'fdot_replicated3', 'fdot_replicated4']:
- distribute_src_mods.extend([
- (('fneg', (op + '(is_used_once)', a, b)), (op, ('fneg', a), b)),
- (('fabs', (op + '(is_used_once)', a, b)), (op, ('fabs', a), ('fabs', b))),
- ])
-
# Print the rendered form of each algebraic pass built from the rule lists.
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma",
before_ffma_optimizations).render())