b = 'b'
c = 'c'
d = 'd'
+e = 'e'
# Written in the form (<search>, <replace>) where <search> is an expression
# and <replace> is either an expression or a value. An expression is
(('~flrp', a, b, 1.0), b),
(('~flrp', a, a, b), a),
(('~flrp', 0.0, a, b), ('fmul', a, b)),
+
+ # flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c)
+ (('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)),
+ (('~flrp@32', a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp32'),
+ (('~flrp@64', a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp64'),
+
+ (('~flrp@32', ('fadd', a, b), ('fadd', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp32'),
+ (('~flrp@64', ('fadd', a, b), ('fadd', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp64'),
+
+ (('~flrp@32', a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp32'),
+ (('~flrp@64', a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp64'),
+
+ (('~flrp', ('fmul(is_used_once)', a, b), ('fmul(is_used_once)', a, c), d), ('fmul', ('flrp', b, c, d), a)),
+
(('~flrp', a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp32'),
(('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)),
- (('flrp@16', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp16'),
- (('flrp@32', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp32'),
- (('flrp@64', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp64'),
(('ftrunc', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),
(('ffloor', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
(('fadd', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
(('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
(('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma'),
+ (('~fmul', ('fadd', ('iand', ('ineg', ('b2i32', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
+ ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),
+
(('fdot4', ('vec4', a, b, c, 1.0), d), ('fdph', ('vec3', a, b, c), d)),
(('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),
(('fdot4', ('vec4', a, b, 0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),
(('fge', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('inot', ('ior', a, b))),
(('fge', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('inot', ('ior', a, b))),
+ (('flt', a, ('fneg', a)), ('flt', a, 0.0)),
+ (('fge', a, ('fneg', a)), ('fge', a, 0.0)),
+
# Some optimizations (below) convert things like (a < b || c < b) into
# (min(a, c) < b). However, this interferes with the previous optimizations
# that try to remove comparisons with negated sums of b2f. This just
(('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)),
(('ior', ('uge', 2, a), ('ieq', a, 3)), ('uge', 3, a)),
+ # The (i2f32, ...) part is an open-coded fsign. When that is combined with
+ # the bcsel, it's basically copysign(1.0, a). There is no copysign in NIR,
+ # so emit an open-coded version of that.
+ (('bcsel@32', ('feq', a, 0.0), 1.0, ('i2f32', ('iadd', ('b2i32', ('flt', 0.0, 'a@32')), ('ineg', ('b2i32', ('flt', 'a@32', 0.0)))))),
+ ('ior', 0x3f800000, ('iand', a, 0x80000000))),
+
(('ior', a, ('ieq', a, False)), True),
(('ior', a, ('inot', a)), -1),
+ (('ine', ('ineg', ('b2i32', 'a@1')), ('ineg', ('b2i32', 'b@1'))), ('ine', a, b)),
+ (('b2i32', ('ine', 'a@1', 'b@1')), ('b2i32', ('ixor', a, b))),
+
(('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('ior', 'a@32', 'b@32'), 0)),
# These patterns can result when (a < b || a < c) => (a < min(b, c))
'options->lower_unpack_snorm_4x8'),
(('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
+ (('fsign', a), ('fsub', ('b2f', ('flt', 0.0, a)), ('b2f', ('flt', a, 0.0))), 'options->lower_fsign'),
])
# bit_size dependent lowerings
x2yN = '{}2{}'.format(x, y)
optimizations.append(((x2yN, (b2x, a)), (b2y, a)))
+# Optimize away x2xN(a@N)
+for t in ['int', 'uint', 'float']:
+ for N in type_sizes(t):
+ x2xN = '{0}2{0}{1}'.format(t[0], N)
+ aN = 'a@{0}'.format(N)
+ optimizations.append(((x2xN, aN), a))
+
+# Optimize x2xN(y2yM(a@P)) -> y2yN(a) for integers
+# In particular, we can optimize away everything except upcast of downcast and
+# upcasts where the type differs from the other cast
+for N, M in itertools.product(type_sizes('uint'), type_sizes('uint')):
+ if N < M:
+ # The outer cast is a down-cast. It doesn't matter what the size of the
+ # argument of the inner cast is because we'll never be in the upcast
+ # of downcast case. Regardless of types, we'll always end up with y2yN
+ # in the end.
+ for x, y in itertools.product(['i', 'u'], ['i', 'u']):
+ x2xN = '{0}2{0}{1}'.format(x, N)
+ y2yM = '{0}2{0}{1}'.format(y, M)
+ y2yN = '{0}2{0}{1}'.format(y, N)
+ optimizations.append(((x2xN, (y2yM, a)), (y2yN, a)))
+ elif N > M:
+ # If the outer cast is an up-cast, we have to be more careful about the
+ # size of the argument of the inner cast and with types. In this case,
+ # the type is always the type of the up-cast which is given by the
+ # outer cast.
+ for P in type_sizes('uint'):
+ # We can't optimize away up-cast of down-cast.
+ if M < P:
+ continue
+
+ # Because we're doing up-cast of up-cast, the types always have
+ # to match between the two casts
+ for x in ['i', 'u']:
+ x2xN = '{0}2{0}{1}'.format(x, N)
+ x2xM = '{0}2{0}{1}'.format(x, M)
+ aP = 'a@{0}'.format(P)
+ optimizations.append(((x2xN, (x2xM, aP)), (x2xN, a)))
+ else:
+ # The N == M case is handled by other optimizations
+ pass
+
def fexp2i(exp, bits):
# We assume that exp is already in the right range.
if bits == 16:
(('iadd', a, ('iadd', ('ineg', a), b)), b),
(('~fadd', ('fneg', a), ('fadd', a, b)), b),
(('~fadd', a, ('fadd', ('fneg', a), b)), b),
+
+ (('~flrp@32', ('fadd(is_used_once)', a, -1.0), ('fadd(is_used_once)', a, 1.0), d), ('fadd', ('flrp', -1.0, 1.0, d), a)),
+ (('~flrp@32', ('fadd(is_used_once)', a, 1.0), ('fadd(is_used_once)', a, -1.0), d), ('fadd', ('flrp', 1.0, -1.0, d), a)),
+ (('~flrp@32', ('fadd(is_used_once)', a, '#b'), ('fadd(is_used_once)', a, '#c'), d), ('fadd', ('fmul', d, ('fadd', c, ('fneg', b))), ('fadd', a, b))),
]
# This section contains "late" optimizations that should be run after the
(('fdot4', a, b), ('fdot_replicated4', a, b), 'options->fdot_replicates'),
(('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'),
+ (('~flrp@32', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),
+ (('~flrp@64', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),
+
(('b2f(is_used_more_than_once)', ('inot', 'a@1')), ('bcsel', a, 0.0, 1.0)),
(('fneg(is_used_more_than_once)', ('b2f', ('inot', 'a@1'))), ('bcsel', a, -0.0, -1.0)),
+ (('~fadd@32', 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp32'),
+ (('~fadd@64', 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp64'),
+
# we do these late so that we don't get in the way of creating ffmas
(('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))),
(('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))),