X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fcompiler%2Fnir%2Fnir_opt_algebraic.py;h=472db765026cab36a0f095933f0b4a5f35ea80a7;hb=a18c4ee7b07cb0c78b7d93005cc76eded4e8001c;hp=bdf432be09ddca76f23302025c54413eecedb7c3;hpb=3b747909419d35b4d23def90ba3c49a79b404170;p=mesa.git

diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index bdf432be09d..472db765026 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -29,6 +29,8 @@ from collections import OrderedDict
 import nir_algebraic
 from nir_opcodes import type_sizes
 import itertools
+import struct
+from math import pi
 
 # Convenience variables
 a = 'a'
@@ -50,11 +52,12 @@ e = 'e'
 # however, be used for backend-requested lowering operations as those need to
 # happen regardless of precision.
 #
-# Variable names are specified as "[#]name[@type][(cond)]" where "#" inicates
-# that the given variable will only match constants and the type indicates that
-# the given variable will only match values from ALU instructions with the
-# given output type, and (cond) specifies an additional condition function
-# (see nir_search_helpers.h).
+# Variable names are specified as "[#]name[@type][(cond)][.swiz]" where:
+# "#" indicates that the given variable will only match constants,
+# type indicates that the given variable will only match values from ALU
+#    instructions with the given output type,
+# (cond) specifies an additional condition function (see nir_search_helpers.h),
+# swiz is a swizzle applied to the variable (only in the expression)
 #
 # For constants, you have to be careful to make sure that it is the right
 # type because python is unaware of the source and destination types of the
@@ -66,11 +69,31 @@ e = 'e'
 # should only match that particular bit-size. In the replace half of the
 # expression this indicates that the constructed value should have that
 # bit-size.
+#
+# If the opcode in a replacement expression is prefixed by a '!' character,
+# this indicates that the new expression will be marked exact.
+#
+# A special condition "many-comm-expr" can be used with expressions to note
+# that the expression and its subexpressions have more commutative expressions
+# than nir_replace_instr can handle. If this special condition is needed with
+# another condition, the two can be separated by a comma (e.g.,
+# "(many-comm-expr,is_used_once)").
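#
# As an illustrative sketch of the syntax described above, one of the rules
# that appears below reads:
#
#    (('imul', a, '#b@32(is_pos_power_of_two)'),   # search expression
#     ('ishl', a, ('find_lsb', b)),                # replacement expression
#     '!options->lower_bitops'),                   # condition string
#
# It matches a multiply whose second source is a 32-bit constant satisfying
# the is_pos_power_of_two helper, rewrites it to a shift, and is applied only
# when the condition on the backend's compiler options evaluates to true.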
+ +# based on https://web.archive.org/web/20180105155939/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648 +def lowered_sincos(c): + x = ('fsub', ('fmul', 2.0, ('ffract', ('fadd', ('fmul', 0.5 / pi, a), c))), 1.0) + x = ('fmul', ('fsub', x, ('fmul', x, ('fabs', x))), 4.0) + return ('ffma', ('ffma', x, ('fabs', x), ('fneg', x)), 0.225, x) + +def intBitsToFloat(i): + return struct.unpack('!f', struct.pack('!I', i))[0] optimizations = [ - (('imul', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b))), - (('imul', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b))))), + (('imul', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'), + (('imul', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'), + (('ishl', a, '#b@32'), ('imul', a, ('ishl', 1, b)), 'options->lower_bitops'), + (('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)), (('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)), (('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'), @@ -79,12 +102,12 @@ optimizations = [ (('idiv', a, 1), a), (('umod', a, 1), 0), (('imod', a, 1), 0), - (('udiv', a, '#b@32(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b))), + (('udiv', a, '#b@32(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b)), '!options->lower_bitops'), (('idiv', a, '#b@32(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), 'options->lower_idiv'), (('idiv', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), 'options->lower_idiv'), (('umod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1))), - (('fneg', ('fneg', a)), a), + (('~fneg', ('fneg', a)), a), (('ineg', ('ineg', a)), a), (('fabs', ('fabs', a)), ('fabs', a)), (('fabs', ('fneg', a)), ('fabs', a)), @@ -105,11 +128,12 @@ optimizations = [ (('iadd', a, ('iadd', ('ineg', a), b)), b), (('~fadd', ('fneg', a), ('fadd', a, b)), b), (('~fadd', a, ('fadd', ('fneg', a), b)), b), + (('fadd', ('fsat', a), ('fsat', ('fneg', a))), ('fsat', ('fabs', a))), (('~fmul', a, 0.0), 0.0), (('imul', a, 0), 0), (('umul_unorm_4x8', a, 0), 0), (('umul_unorm_4x8', a, ~0), a), - (('fmul', a, 1.0), a), + (('~fmul', a, 1.0), a), (('imul', a, 1), a), (('fmul', a, -1.0), ('fneg', a)), (('imul', a, -1), ('ineg', a)), @@ -151,11 +175,7 @@ optimizations = [ (('~fadd@32', ('fmul', a, ('fadd', 1.0, ('fneg', c ) )), ('fmul', b, c )), ('flrp', a, b, c), '!options->lower_flrp32'), (('~fadd@64', ('fmul', a, ('fadd', 1.0, ('fneg', c ) )), ('fmul', b, c )), ('flrp', a, b, c), '!options->lower_flrp64'), # These are the same as the previous three rules, but it depends on - # 1-fsat(x) <=> fsat(1-x): - # - # If x >= 0 and x <= 1: fsat(1 - x) == 1 - fsat(x) trivially - # If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1 - # If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0 + # 1-fsat(x) <=> fsat(1-x). See below. 
 (('~fadd@32', ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg', c )))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp32'),
 (('~fadd@64', ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg', c )))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp64'),
@@ -168,7 +188,9 @@ optimizations = [
    (('~fmul', ('fadd', ('iand', ('ineg', ('b2i32', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
     ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),
 
-   (('fdot4', ('vec4', a, b, c, 1.0), d), ('fdph', ('vec3', a, b, c), d)),
+   (('fdph', a, b), ('fdot4', ('vec4', 'a.x', 'a.y', 'a.z', 1.0), b), 'options->lower_fdph'),
+
+   (('fdot4', ('vec4', a, b, c, 1.0), d), ('fdph', ('vec3', a, b, c), d), '!options->lower_fdph'),
    (('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),
    (('fdot4', ('vec4', a, b, 0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),
    (('fdot4', ('vec4', a, b, c, 0.0), d), ('fdot3', ('vec3', a, b, c), d)),
@@ -176,6 +198,30 @@ optimizations = [
    (('fdot3', ('vec3', a, 0.0, 0.0), b), ('fmul', a, b)),
    (('fdot3', ('vec3', a, b, 0.0), c), ('fdot2', ('vec2', a, b), c)),
 
+   (('fdot2', ('vec2', a, 0.0), b), ('fmul', a, b)),
+   (('fdot2', a, 1.0), ('fadd', 'a.x', 'a.y')),
+
+   # Lower fdot to fsum when it is available
+   (('fdot2', a, b), ('fsum2', ('fmul', a, b)), 'options->lower_fdot'),
+   (('fdot3', a, b), ('fsum3', ('fmul', a, b)), 'options->lower_fdot'),
+   (('fdot4', a, b), ('fsum4', ('fmul', a, b)), 'options->lower_fdot'),
+   (('fsum2', a), ('fadd', 'a.x', 'a.y'), 'options->lower_fdot'),
+
+   # If x >= 0 and x <= 1: fsat(1 - x) == 1 - fsat(x) trivially
+   # If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1
+   # If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0
+   (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),
+
+   # 1 - ((1 - a) * (1 - b))
+   # 1 - (1 - a - b + a*b)
+   # 1 - 1 + a + b - a*b
+   # a + b - a*b
+   # a + b*(1 - a)
+   # b*(1 - a) + 1*a
+   # flrp(b, 1, a)
+   (('~fadd@32', 1.0, ('fneg', ('fmul', ('fadd', 1.0, ('fneg', a)), ('fadd', 1.0, ('fneg', b))))),
+    ('flrp', b, 1.0, a), '!options->lower_flrp32'),
+
    # (a * #b + #c) << #d
    # ((a * #b) << #d) + (#c << #d)
    # (a * (#b << #d)) + (#c << #d)
@@ -185,12 +231,76 @@ optimizations = [
    # (a * #b) << #c
    # a * (#b << #c)
    (('ishl', ('imul', a, '#b'), '#c'), ('imul', a, ('ishl', b, c))),
+]
+
+# Care must be taken here. Shifts in NIR use only the lower log2(bitsize)
+# bits of the second source. These replacements must correctly handle the
+# case where (b % bitsize) + (c % bitsize) >= bitsize.
+for s in [8, 16, 32, 64]:
+   mask = (1 << s) - 1
+
+   ishl = "ishl@{}".format(s)
+   ishr = "ishr@{}".format(s)
+   ushr = "ushr@{}".format(s)
+
+   in_bounds = ('ult', ('iadd', ('iand', b, mask), ('iand', c, mask)), s)
+
+   optimizations.extend([
+       ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds, (ishl, a, ('iadd', b, c)), 0)),
+       ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds, (ushr, a, ('iadd', b, c)), 0)),
+
+       # To get -1 for large shifts of negative values, ishr must instead
+       # clamp the shift count to the maximum value.
+       ((ishr, (ishr, a, '#b'), '#c'),
+        (ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))),
+   ])
+
+# Optimize a pattern of address calculation created by DXVK where the offset is
+# divided by 4 and then multiplied by 4. This can be turned into an iand and the
+# additions before can be reassociated to CSE the iand instruction.
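#
# A worked instance of what the loop below generates (illustrative sketch,
# shown for log2 == 2, i.e. v == 4 and mask == 0xfffffffc):
#
#    (a >> 2) << 2        ->  a & 0xfffffffc
#    (a + b) & 0xfffffffc ->  (a & 0xfffffffc) + b     when b is a multiple of 4
#
# so the divide-by-4/multiply-by-4 offset math collapses to a single iand, and
# reassociating the addition lets several such address calculations share that
# iand through CSE.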
+for log2 in range(1, 7): # powers of two from 2 to 64 + v = 1 << log2 + mask = 0xffffffff & ~(v - 1) + b_is_multiple = '#b(is_unsigned_multiple_of_{})'.format(v) + + optimizations.extend([ + # 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)' + (('ishl@32', ('ushr@32', a, log2), log2), ('iand', a, mask)), + + # Reassociate for improved CSE + (('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)), + ]) + +# To save space in the state tables, reduce to the set that is known to help. +# Previously, this was range(1, 32). In addition, a couple rules inside the +# loop are commented out. Revisit someday, probably after mesa/#2635 has some +# resolution. +for i in [1, 2, 16, 24]: + lo_mask = 0xffffffff >> i + hi_mask = (0xffffffff << i) & 0xffffffff + + optimizations.extend([ + # This pattern seems to only help in the soft-fp64 code. + (('ishl@32', ('iand', 'a@32', lo_mask), i), ('ishl', a, i)), +# (('ushr@32', ('iand', 'a@32', hi_mask), i), ('ushr', a, i)), +# (('ishr@32', ('iand', 'a@32', hi_mask), i), ('ishr', a, i)), + + (('iand', ('ishl', 'a@32', i), hi_mask), ('ishl', a, i)), + (('iand', ('ushr', 'a@32', i), lo_mask), ('ushr', a, i)), +# (('iand', ('ishr', 'a@32', i), lo_mask), ('ushr', a, i)), # Yes, ushr is correct + ]) + +optimizations.extend([ + # This is common for address calculations. Reassociating may enable the + # 'a<= b2f(a) # b2f(a) <= 0.0 # b2f(a) == 0.0 because b2f(a) can only be 0 or 1 @@ -255,6 +397,14 @@ optimizations = [ (('~fge', ('fadd', a, b), a), ('fge', b, 0.0)), (('~feq', ('fadd', a, b), a), ('feq', b, 0.0)), (('~fne', ('fadd', a, b), a), ('fne', b, 0.0)), + (('~flt', ('fadd(is_used_once)', a, '#b'), '#c'), ('flt', a, ('fadd', c, ('fneg', b)))), + (('~flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('flt', ('fneg', ('fadd', c, b)), a)), + (('~fge', ('fadd(is_used_once)', a, '#b'), '#c'), ('fge', a, ('fadd', c, ('fneg', b)))), + (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fge', ('fneg', ('fadd', c, b)), a)), + (('~feq', ('fadd(is_used_once)', a, '#b'), '#c'), ('feq', a, ('fadd', c, ('fneg', b)))), + (('~feq', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('feq', ('fneg', ('fadd', c, b)), a)), + (('~fne', ('fadd(is_used_once)', a, '#b'), '#c'), ('fne', a, ('fadd', c, ('fneg', b)))), + (('~fne', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fne', ('fneg', ('fadd', c, b)), a)), # Cannot remove the addition from ilt or ige due to overflow. 
(('ieq', ('iadd', a, b), a), ('ieq', b, 0)), @@ -274,7 +424,7 @@ optimizations = [ (('feq', ('fmin', ('fneg', ('b2f', 'a@1')), 'b@1'), 0.0), ('iand', ('inot', a), ('fge', b, 0.0))), (('feq', ('b2f', 'a@1'), 0.0), ('inot', a)), - (('fne', ('b2f', 'a@1'), 0.0), a), + (('~fne', ('b2f', 'a@1'), 0.0), a), (('ieq', ('b2i', 'a@1'), 0), ('inot', a)), (('ine', ('b2i', 'a@1'), 0), a), @@ -310,6 +460,16 @@ optimizations = [ # 0.0 >= fabs(a) (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)), + # (a >= 0.0) && (a <= 1.0) -> fsat(a) == a + (('iand', ('fge', a, 0.0), ('fge', 1.0, a)), ('feq', a, ('fsat', a)), '!options->lower_fsat'), + + # (a < 0.0) || (a > 1.0) + # !(!(a < 0.0) && !(a > 1.0)) + # !((a >= 0.0) && (a <= 1.0)) + # !(a == fsat(a)) + # a != fsat(a) + (('ior', ('flt', a, 0.0), ('flt', 1.0, a)), ('fne', a, ('fsat', a)), '!options->lower_fsat'), + (('fmax', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('ior', a, b))), (('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('ior', a, b)))), (('fmin', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))), @@ -343,8 +503,8 @@ optimizations = [ (('bcsel', a, a, b), ('ior', a, b)), (('bcsel', a, b, False), ('iand', a, b)), (('bcsel', a, b, a), ('iand', a, b)), - (('fmin', a, a), a), - (('fmax', a, a), a), + (('~fmin', a, a), a), + (('~fmax', a, a), a), (('imin', a, a), a), (('imax', a, a), a), (('umin', a, a), a), @@ -355,26 +515,46 @@ optimizations = [ (('fmin', ('fmin', a, b), b), ('fmin', a, b)), (('umin', ('umin', a, b), b), ('umin', a, b)), (('imin', ('imin', a, b), b), ('imin', a, b)), - (('fmax', a, ('fneg', a)), ('fabs', a)), - (('imax', a, ('ineg', a)), ('iabs', a)), + (('iand@32', a, ('inot', ('ishr', a, 31))), ('imax', a, 0)), + + # Simplify logic to detect sign of an integer. 
+ (('ieq', ('iand', a, 0x80000000), 0x00000000), ('ige', a, 0)), + (('ine', ('iand', a, 0x80000000), 0x80000000), ('ige', a, 0)), + (('ine', ('iand', a, 0x80000000), 0x00000000), ('ilt', a, 0)), + (('ieq', ('iand', a, 0x80000000), 0x80000000), ('ilt', a, 0)), + (('ine', ('ushr', 'a@32', 31), 0), ('ilt', a, 0)), + (('ieq', ('ushr', 'a@32', 31), 0), ('ige', a, 0)), + (('ieq', ('ushr', 'a@32', 31), 1), ('ilt', a, 0)), + (('ine', ('ushr', 'a@32', 31), 1), ('ige', a, 0)), + (('ine', ('ishr', 'a@32', 31), 0), ('ilt', a, 0)), + (('ieq', ('ishr', 'a@32', 31), 0), ('ige', a, 0)), + (('ieq', ('ishr', 'a@32', 31), -1), ('ilt', a, 0)), + (('ine', ('ishr', 'a@32', 31), -1), ('ige', a, 0)), + (('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))), (('imin', a, ('ineg', a)), ('ineg', ('iabs', a))), (('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))), (('imin', a, ('ineg', ('iabs', a))), ('ineg', ('iabs', a))), - (('fmin', a, ('fabs', a)), a), + (('~fmin', a, ('fabs', a)), a), (('imin', a, ('iabs', a)), a), - (('fmax', a, ('fneg', ('fabs', a))), a), + (('~fmax', a, ('fneg', ('fabs', a))), a), (('imax', a, ('ineg', ('iabs', a))), a), (('fmax', a, ('fabs', a)), ('fabs', a)), (('imax', a, ('iabs', a)), ('iabs', a)), (('fmax', a, ('fneg', a)), ('fabs', a)), (('imax', a, ('ineg', a)), ('iabs', a)), + (('~fmax', ('fabs', a), 0.0), ('fabs', a)), (('~fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'), (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'), + (('~fmin', ('fmax', a, -1.0), 0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'), + (('~fmax', ('fmin', a, 0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'), (('fsat', ('fsign', a)), ('b2f', ('flt', 0.0, a))), (('fsat', ('b2f', a)), ('b2f', a)), (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'), (('fsat', ('fsat', a)), ('fsat', a)), + (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_fsat'), + (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_fsat'), + (('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'), (('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)), (('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)), (('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)), @@ -415,6 +595,20 @@ optimizations = [ (('iand', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umax', b, c))), (('iand', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umin', a, b), c)), + # These derive from the previous patterns with the application of b < 0 <=> + # 0 < -b. The transformation should be applied if either comparison is + # used once as this ensures that the number of comparisons will not + # increase. The sources to the ior and iand are not symmetric, so the + # rules have to be duplicated to get this behavior. 
+   (('~ior', ('flt(is_used_once)', 0.0, 'a@32'), ('flt', 'b@32', 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
+   (('~ior', ('flt', 0.0, 'a@32'), ('flt(is_used_once)', 'b@32', 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
+   (('~ior', ('fge(is_used_once)', 0.0, 'a@32'), ('fge', 'b@32', 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
+   (('~ior', ('fge', 0.0, 'a@32'), ('fge(is_used_once)', 'b@32', 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
+   (('~iand', ('flt(is_used_once)', 0.0, 'a@32'), ('flt', 'b@32', 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
+   (('~iand', ('flt', 0.0, 'a@32'), ('flt(is_used_once)', 'b@32', 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
+   (('~iand', ('fge(is_used_once)', 0.0, 'a@32'), ('fge', 'b@32', 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),
+   (('~iand', ('fge', 0.0, 'a@32'), ('fge(is_used_once)', 'b@32', 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),
 
    # Common pattern like 'if (i == 0 || i == 1 || ...)'
    (('ior', ('ieq', a, 0), ('ieq', a, 1)), ('uge', 1, a)),
    (('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)),
@@ -432,7 +626,19 @@ optimizations = [
    (('ine', ('ineg', ('b2i32', 'a@1')), ('ineg', ('b2i32', 'b@1'))), ('ine', a, b)),
    (('b2i32', ('ine', 'a@1', 'b@1')), ('b2i32', ('ixor', a, b))),
 
-   (('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('ior', 'a@32', 'b@32'), 0)),
+   (('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('ior', a, b), 0), '!options->lower_bitops'),
+   (('ior', ('ine', 'a@32', 0), ('ine', 'b@32', 0)), ('ine', ('ior', a, b), 0), '!options->lower_bitops'),
+
+   # This pattern occurs courtesy of __flt64_nonnan in the soft-fp64 code.
+   # The first part of the iand comes from the !__feq64_nonnan.
+   #
+   # The second pattern is a reformulation of the first based on the relation
+   # (a == 0 || y == 0) <=> umin(a, y) == 0, where b in the first equation
+   # happens to be y == 0.
+   (('iand', ('inot', ('iand', ('ior', ('ieq', a, 0), b), c)), ('ilt', a, 0)),
+    ('iand', ('inot', ('iand', b , c)), ('ilt', a, 0))),
+   (('iand', ('inot', ('iand', ('ieq', ('umin', a, b), 0), c)), ('ilt', a, 0)),
+    ('iand', ('inot', ('iand', ('ieq', b , 0), c)), ('ilt', a, 0))),
 
    # These patterns can result when (a < b || a < c) => (a < min(b, c))
    # transformations occur before constant propagation and loop-unrolling.
@@ -461,6 +667,10 @@ optimizations = [ (('ult', ('umax', a, b), a), False), (('uge', a, ('umax', b, a)), ('uge', a, b)), (('uge', ('umin', a, b), a), ('uge', b, a)), + (('ult', a, ('iand', b, a)), False), + (('ult', ('ior', a, b), a), False), + (('uge', a, ('iand', b, a)), True), + (('uge', ('ior', a, b), a), True), (('ilt', '#a', ('imax', '#b', c)), ('ior', ('ilt', a, b), ('ilt', a, c))), (('ilt', ('imin', '#a', b), '#c'), ('ior', ('ilt', a, c), ('ilt', b, c))), @@ -493,6 +703,28 @@ optimizations = [ (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'), (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'), (('sne', a, b), ('b2f', ('fne', a, b)), 'options->lower_scmp'), + (('seq', ('seq', a, b), 1.0), ('seq', a, b)), + (('seq', ('sne', a, b), 1.0), ('sne', a, b)), + (('seq', ('slt', a, b), 1.0), ('slt', a, b)), + (('seq', ('sge', a, b), 1.0), ('sge', a, b)), + (('sne', ('seq', a, b), 0.0), ('seq', a, b)), + (('sne', ('sne', a, b), 0.0), ('sne', a, b)), + (('sne', ('slt', a, b), 0.0), ('slt', a, b)), + (('sne', ('sge', a, b), 0.0), ('sge', a, b)), + (('seq', ('seq', a, b), 0.0), ('sne', a, b)), + (('seq', ('sne', a, b), 0.0), ('seq', a, b)), + (('seq', ('slt', a, b), 0.0), ('sge', a, b)), + (('seq', ('sge', a, b), 0.0), ('slt', a, b)), + (('sne', ('seq', a, b), 1.0), ('sne', a, b)), + (('sne', ('sne', a, b), 1.0), ('seq', a, b)), + (('sne', ('slt', a, b), 1.0), ('sge', a, b)), + (('sne', ('sge', a, b), 1.0), ('slt', a, b)), + (('fall_equal2', a, b), ('fmin', ('seq', 'a.x', 'b.x'), ('seq', 'a.y', 'b.y')), 'options->lower_vector_cmp'), + (('fall_equal3', a, b), ('seq', ('fany_nequal3', a, b), 0.0), 'options->lower_vector_cmp'), + (('fall_equal4', a, b), ('seq', ('fany_nequal4', a, b), 0.0), 'options->lower_vector_cmp'), + (('fany_nequal2', a, b), ('fmax', ('sne', 'a.x', 'b.x'), ('sne', 'a.y', 'b.y')), 'options->lower_vector_cmp'), + (('fany_nequal3', a, b), ('fsat', ('fdot3', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), + (('fany_nequal4', a, b), ('fsat', ('fdot4', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), (('fne', ('fneg', a), a), ('fne', a, 0.0)), (('feq', ('fneg', a), a), ('feq', a, 0.0)), # Emulating booleans @@ -503,7 +735,6 @@ optimizations = [ # True/False are ~0 and 0 in NIR. b2i of True is 1, and -1 is ~0 (True). (('ineg', ('b2i32', 'a@32')), a), (('flt', ('fneg', ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF. - (('flt', ('fsub', 0.0, ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF. # Comparison with the same args. Note that these are not done for # the float versions because NaN always returns false on float # inequalities. 
@@ -514,14 +745,12 @@ optimizations = [ (('ult', a, a), False), (('uge', a, a), True), # Logical and bit operations - (('fand', a, 0.0), 0.0), (('iand', a, a), a), (('iand', a, ~0), a), (('iand', a, 0), 0), (('ior', a, a), a), (('ior', a, 0), a), (('ior', a, True), True), - (('fxor', a, a), 0.0), (('ixor', a, a), 0), (('ixor', a, 0), a), (('inot', ('inot', a)), a), @@ -539,8 +768,18 @@ optimizations = [ (('ishr', a, 0), a), (('ushr', 0, a), 0), (('ushr', a, 0), a), - (('iand', 0xff, ('ushr@32', a, 24)), ('ushr', a, 24)), - (('iand', 0xffff, ('ushr@32', a, 16)), ('ushr', a, 16)), + (('ior', ('ishl@16', a, b), ('ushr@16', a, ('iadd', 16, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'), + (('ior', ('ishl@16', a, b), ('ushr@16', a, ('isub', 16, b))), ('urol', a, b), '!options->lower_rotate'), + (('ior', ('ishl@32', a, b), ('ushr@32', a, ('iadd', 32, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'), + (('ior', ('ishl@32', a, b), ('ushr@32', a, ('isub', 32, b))), ('urol', a, b), '!options->lower_rotate'), + (('ior', ('ushr@16', a, b), ('ishl@16', a, ('iadd', 16, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'), + (('ior', ('ushr@16', a, b), ('ishl@16', a, ('isub', 16, b))), ('uror', a, b), '!options->lower_rotate'), + (('ior', ('ushr@32', a, b), ('ishl@32', a, ('iadd', 32, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'), + (('ior', ('ushr@32', a, b), ('ishl@32', a, ('isub', 32, b))), ('uror', a, b), '!options->lower_rotate'), + (('urol@16', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 16, b))), 'options->lower_rotate'), + (('urol@32', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 32, b))), 'options->lower_rotate'), + (('uror@16', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 16, b))), 'options->lower_rotate'), + (('uror@32', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 32, b))), 'options->lower_rotate'), # Exponential/logarithmic identities (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a @@ -548,6 +787,7 @@ optimizations = [ (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))), ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) + d) = a^b * c^d + (('~fexp2', ('fmul', ('flog2', a), 0.5)), ('fsqrt', a)), (('~fexp2', ('fmul', ('flog2', a), 2.0)), ('fmul', a, a)), (('~fexp2', ('fmul', ('flog2', a), 4.0)), ('fmul', ('fmul', a, a), ('fmul', a, a))), (('~fpow', a, 1.0), a), @@ -564,6 +804,8 @@ optimizations = [ (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))), (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))), (('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))), + (('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))), + (('~fmul', ('fsqrt', a), ('fsqrt', a)), ('fabs',a)), # Division and reciprocal (('~fdiv', 1.0, a), ('frcp', a)), (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'), @@ -571,6 +813,9 @@ optimizations = [ (('~frcp', ('fsqrt', a)), ('frsq', a)), (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'), (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'), + # Trig + (('fsin', a), lowered_sincos(0.5), 'options->lower_sincos'), + (('fcos', a), lowered_sincos(0.75), 'options->lower_sincos'), # Boolean simplifications (('i2b32(is_used_by_if)', a), ('ine32', a, 0)), (('i2b1(is_used_by_if)', a), ('ine', a, 0)), @@ 
-587,12 +832,9 @@ optimizations = [ (('bcsel', True, b, c), b), (('bcsel', False, b, c), c), (('bcsel', a, ('b2f(is_used_once)', 'b@32'), ('b2f', 'c@32')), ('b2f', ('bcsel', a, b, c))), - # The result of this should be hit by constant propagation and, in the - # next round of opt_algebraic, get picked up by one of the above two. - (('bcsel', '#a', b, c), ('bcsel', ('ine', 'a', 0), b, c)), (('bcsel', a, b, b), b), - (('fcsel', a, b, b), b), + (('~fcsel', a, b, b), b), # D3D Boolean emulation (('bcsel', a, -1, 0), ('ineg', ('b2i', 'a@1'))), @@ -606,6 +848,7 @@ optimizations = [ (('ine', ('ineg', ('b2i', 'a@1')), 0), a), (('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)), (('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)), + (('iand', ('ineg', ('b2i', a)), 1), ('b2i', a)), # SM5 32-bit shifts are defined to use the 5 least significant bits (('ishl', 'a@32', ('iand', 31, b)), ('ishl', a, b)), @@ -618,10 +861,13 @@ optimizations = [ (('f2u', ('ftrunc', a)), ('f2u', a)), (('i2b', ('ineg', a)), ('i2b', a)), (('i2b', ('iabs', a)), ('i2b', a)), - (('fabs', ('b2f', a)), ('b2f', a)), - (('iabs', ('b2i', a)), ('b2i', a)), (('inot', ('f2b1', a)), ('feq', a, 0.0)), + # The C spec says, "If the value of the integral part cannot be represented + # by the integer type, the behavior is undefined." "Undefined" can mean + # "the conversion doesn't happen at all." + (('~i2f32', ('f2i32', 'a@32')), ('ftrunc', a)), + # Ironically, mark these as imprecise because removing the conversions may # preserve more precision than doing the conversions (e.g., # uint(float(0x81818181u)) == 0x81818200). @@ -630,6 +876,22 @@ optimizations = [ (('~f2u32', ('i2f', 'a@32')), a), (('~f2u32', ('u2f', 'a@32')), a), + # Conversions from float16 to float32 and back can always be removed + (('f2f16', ('f2f32', 'a@16')), a), + (('f2fmp', ('f2f32', 'a@16')), a), + # Conversions to float16 would be lossy so they should only be removed if + # the instruction was generated by the precision lowering pass. + (('f2f32', ('f2fmp', 'a@32')), a), + + (('ffloor', 'a(is_integral)'), a), + (('fceil', 'a(is_integral)'), a), + (('ftrunc', 'a(is_integral)'), a), + # fract(x) = x - floor(x), so fract(NaN) = NaN + (('~ffract', 'a(is_integral)'), 0.0), + (('fabs', 'a(is_not_negative)'), a), + (('iabs', 'a(is_not_negative)'), a), + (('fsat', 'a(is_not_positive)'), 0.0), + # Section 5.4.1 (Conversion and Scalar Constructors) of the GLSL 4.60 spec # says: # @@ -643,12 +905,71 @@ optimizations = [ (('ilt', ('f2u', a), b), ('ilt', ('f2i', a), b)), (('ilt', b, ('f2u', a)), ('ilt', b, ('f2i', a))), + (('~fmin', 'a(is_not_negative)', 1.0), ('fsat', a), '!options->lower_fsat'), + + # The result of the multiply must be in [-1, 0], so the result of the ffma + # must be in [0, 1]. + (('flt', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), False), + (('flt', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), False), + (('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)), + (('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)), + + (('fne', 'a(is_not_zero)', 0.0), True), + (('feq', 'a(is_not_zero)', 0.0), False), + + # In this chart, + means value > 0 and - means value < 0. 
+ # + # + >= + -> unknown 0 >= + -> false - >= + -> false + # + >= 0 -> true 0 >= 0 -> true - >= 0 -> false + # + >= - -> true 0 >= - -> true - >= - -> unknown + # + # Using grouping conceptually similar to a Karnaugh map... + # + # (+ >= 0, + >= -, 0 >= 0, 0 >= -) == (is_not_negative >= is_not_positive) -> true + # (0 >= +, - >= +) == (is_not_positive >= gt_zero) -> false + # (- >= +, - >= 0) == (lt_zero >= is_not_negative) -> false + # + # The flt / ilt cases just invert the expected result. + # + # The results expecting true, must be marked imprecise. The results + # expecting false are fine because NaN compared >= or < anything is false. + + (('~fge', 'a(is_not_negative)', 'b(is_not_positive)'), True), + (('fge', 'a(is_not_positive)', 'b(is_gt_zero)'), False), + (('fge', 'a(is_lt_zero)', 'b(is_not_negative)'), False), + + (('flt', 'a(is_not_negative)', 'b(is_not_positive)'), False), + (('~flt', 'a(is_not_positive)', 'b(is_gt_zero)'), True), + (('~flt', 'a(is_lt_zero)', 'b(is_not_negative)'), True), + + (('ine', 'a(is_not_zero)', 0), True), + (('ieq', 'a(is_not_zero)', 0), False), + + (('ige', 'a(is_not_negative)', 'b(is_not_positive)'), True), + (('ige', 'a(is_not_positive)', 'b(is_gt_zero)'), False), + (('ige', 'a(is_lt_zero)', 'b(is_not_negative)'), False), + + (('ilt', 'a(is_not_negative)', 'b(is_not_positive)'), False), + (('ilt', 'a(is_not_positive)', 'b(is_gt_zero)'), True), + (('ilt', 'a(is_lt_zero)', 'b(is_not_negative)'), True), + + (('ult', 0, 'a(is_gt_zero)'), True), + (('ult', a, 0), False), + # Packing and then unpacking does nothing (('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a), (('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b), (('pack_64_2x32_split', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), a), + # Comparing two halves of an unpack separately. While this optimization + # should be correct for non-constant values, it's less obvious that it's + # useful in that case. For constant values, the pack will fold and we're + # guaranteed to reduce the whole tree to one instruction. 
+ (('iand', ('ieq', ('unpack_32_2x16_split_x', a), '#b'), + ('ieq', ('unpack_32_2x16_split_y', a), '#c')), + ('ieq', a, ('pack_32_2x16_split', b, c))), + # Byte extraction (('ushr', 'a@16', 8), ('extract_u8', a, 1), '!options->lower_extract_byte'), (('ushr', 'a@32', 24), ('extract_u8', a, 3), '!options->lower_extract_byte'), @@ -656,8 +977,25 @@ optimizations = [ (('ishr', 'a@16', 8), ('extract_i8', a, 1), '!options->lower_extract_byte'), (('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'), (('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'), - (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte') -] + (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'), + + # Useless masking before unpacking + (('unpack_half_2x16_split_x', ('iand', a, 0xffff)), ('unpack_half_2x16_split_x', a)), + (('unpack_32_2x16_split_x', ('iand', a, 0xffff)), ('unpack_32_2x16_split_x', a)), + (('unpack_64_2x32_split_x', ('iand', a, 0xffffffff)), ('unpack_64_2x32_split_x', a)), + (('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)), + (('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)), + (('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)), + + # Optimize half packing + (('ishl', ('pack_half_2x16', ('vec2', a, 0)), 16), ('pack_half_2x16', ('vec2', 0, a))), + (('ishr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))), + + (('iadd', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))), + ('pack_half_2x16', ('vec2', a, b))), + (('ior', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))), + ('pack_half_2x16', ('vec2', a, b))), +]) # After the ('extract_u8', a, 0) pattern, above, triggers, there will be # patterns like those below. @@ -684,21 +1022,17 @@ optimizations.extend([ (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'), # Subtracts - (('~fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)), - (('isub', a, ('isub', 0, b)), ('iadd', a, b)), (('ussub_4x8', a, 0), a), (('ussub_4x8', a, ~0), 0), - (('fsub', a, b), ('fadd', a, ('fneg', b)), 'options->lower_sub'), - (('isub', a, b), ('iadd', a, ('ineg', b)), 'options->lower_sub'), - (('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'), - (('ineg', a), ('isub', 0, a), 'options->lower_negate'), - (('~fadd', a, ('fsub', 0.0, b)), ('fsub', a, b)), - (('iadd', a, ('isub', 0, b)), ('isub', a, b)), - (('fabs', ('fsub', 0.0, a)), ('fabs', a)), - (('iabs', ('isub', 0, a)), ('iabs', a)), + # Lower all Subtractions first - they can get recombined later + (('fsub', a, b), ('fadd', a, ('fneg', b))), + (('isub', a, b), ('iadd', a, ('ineg', b))), + (('uabs_usub', a, b), ('bcsel', ('ult', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))), + # This is correct. We don't need isub_sat because the result type is unsigned, so it cannot overflow. 
+ (('uabs_isub', a, b), ('bcsel', ('ilt', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))), # Propagate negation up multiplication chains - (('fmul', ('fneg', a), b), ('fneg', ('fmul', a, b))), + (('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))), (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))), # Propagate constants up multiplication chains @@ -715,6 +1049,12 @@ optimizations.extend([ (('~fadd', '#a', ('fadd', 'b(is_not_const)', '#c')), ('fadd', ('fadd', a, c), b)), (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))), (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)), + (('iand', '#a', ('iand', 'b(is_not_const)', '#c')), ('iand', ('iand', a, c), b)), + (('ior', '#a', ('ior', 'b(is_not_const)', '#c')), ('ior', ('ior', a, c), b)), + (('ixor', '#a', ('ixor', 'b(is_not_const)', '#c')), ('ixor', ('ixor', a, c), b)), + + # Drop mul-div by the same value when there's no wrapping. + (('idiv', ('imul(no_signed_wrap)', a, b), b), a), # By definition... (('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)), @@ -728,40 +1068,114 @@ optimizations.extend([ (('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)), # Misc. lowering - (('fmod@16', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod16'), - (('fmod@32', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod32'), - (('fmod@64', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod64'), - (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod32'), + (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'), + (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'), (('uadd_carry@32', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'), (('usub_borrow@32', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'), (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), - ('bcsel', ('ilt', 31, 'bits'), 'insert', + ('bcsel', ('ult', 31, 'bits'), 'insert', ('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')), 'options->lower_bitfield_insert'), (('ihadd', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'), (('uhadd', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'), (('irhadd', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'), (('urhadd', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'), + (('ihadd@64', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), + (('uhadd@64', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), + (('irhadd@64', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), + (('urhadd@64', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), + + (('uadd_sat@64', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_add_sat || (options->lower_int64_options & nir_lower_iadd64) != 0'), (('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', 
a, b), a), -1, ('iadd', a, b)), 'options->lower_add_sat'), (('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_add_sat'), + (('usub_sat@64', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_usub_sat64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), + + # int64_t sum = a + b; + # + # if (a < 0 && b < 0 && a < sum) + # sum = INT64_MIN; + # } else if (a >= 0 && b >= 0 && sum < a) + # sum = INT64_MAX; + # } + # + # A couple optimizations are applied. + # + # 1. a < sum => sum >= 0. This replacement works because it is known that + # a < 0 and b < 0, so sum should also be < 0 unless there was + # underflow. + # + # 2. sum < a => sum < 0. This replacement works because it is known that + # a >= 0 and b >= 0, so sum should also be >= 0 unless there was + # overflow. + # + # 3. Invert the second if-condition and swap the order of parameters for + # the bcsel. !(a >= 0 && b >= 0 && sum < 0) becomes !(a >= 0) || !(b >= + # 0) || !(sum < 0), and that becomes (a < 0) || (b < 0) || (sum >= 0) + # + # On Intel Gen11, this saves ~11 instructions. + (('iadd_sat@64', a, b), ('bcsel', + ('iand', ('iand', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)), + 0x8000000000000000, + ('bcsel', + ('ior', ('ior', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)), + ('iadd', a, b), + 0x7fffffffffffffff)), + '(options->lower_int64_options & nir_lower_iadd64) != 0'), + + # int64_t sum = a - b; + # + # if (a < 0 && b >= 0 && a < sum) + # sum = INT64_MIN; + # } else if (a >= 0 && b < 0 && a >= sum) + # sum = INT64_MAX; + # } + # + # Optimizations similar to the iadd_sat case are applied here. + (('isub_sat@64', a, b), ('bcsel', + ('iand', ('iand', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)), + 0x8000000000000000, + ('bcsel', + ('ior', ('ior', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)), + ('isub', a, b), + 0x7fffffffffffffff)), + '(options->lower_int64_options & nir_lower_iadd64) != 0'), + + # These are done here instead of in the backend because the int64 lowering + # pass will make a mess of the patterns. The first patterns are + # conditioned on nir_lower_minmax64 because it was not clear that it was + # always an improvement on platforms that have real int64 support. No + # shaders in shader-db hit this, so it was hard to say one way or the + # other. 
+ (('ilt', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), + (('ilt', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), + (('ige', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), + (('ige', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), + (('ilt', 'a@64', 0), ('ilt', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), + (('ige', 'a@64', 0), ('ige', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), + + (('ine', 'a@64', 0), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), + (('ieq', 'a@64', 0), ('ieq', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), + # 0u < uint(a) <=> uint(a) != 0u + (('ult', 0, 'a@64'), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), # Alternative lowering that doesn't rely on bfi. (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), - ('bcsel', ('ilt', 31, 'bits'), + ('bcsel', ('ult', 31, 'bits'), 'insert', - ('ior', - ('iand', 'base', ('inot', ('bfm', 'bits', 'offset'))), - ('iand', ('ishl', 'insert', 'offset'), ('bfm', 'bits', 'offset')))), + (('ior', + ('iand', 'base', ('inot', ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))), + ('iand', ('ishl', 'insert', 'offset'), ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))))), 'options->lower_bitfield_insert_to_shifts'), - # bfm lowering -- note that the NIR opcode is undefined if either arg is 32. - (('bfm', 'bits', 'offset'), - ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'), - 'options->lower_bfm'), + # Alternative lowering that uses bitfield_select. 
+   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
+    ('bcsel', ('ult', 31, 'bits'), 'insert',
+              ('bitfield_select', ('bfm', 'bits', 'offset'), ('ishl', 'insert', 'offset'), 'base')),
+    'options->lower_bitfield_insert_to_bitfield_select'),
 
    (('ibitfield_extract', 'value', 'offset', 'bits'),
-    ('bcsel', ('ilt', 31, 'bits'), 'value',
+    ('bcsel', ('ult', 31, 'bits'), 'value',
              ('ibfe', 'value', 'offset', 'bits')),
    'options->lower_bitfield_extract'),
 
@@ -770,6 +1184,14 @@ optimizations.extend([
             ('ubfe', 'value', 'offset', 'bits')),
    'options->lower_bitfield_extract'),
 
+   # Note that these opcodes are defined to only use the five least significant bits of 'offset' and 'bits'
+   (('ubfe', 'value', 'offset', ('iand', 31, 'bits')), ('ubfe', 'value', 'offset', 'bits')),
+   (('ubfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ubfe', 'value', 'offset', 'bits')),
+   (('ibfe', 'value', 'offset', ('iand', 31, 'bits')), ('ibfe', 'value', 'offset', 'bits')),
+   (('ibfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ibfe', 'value', 'offset', 'bits')),
+   (('bfm', 'bits', ('iand', 31, 'offset')), ('bfm', 'bits', 'offset')),
+   (('bfm', ('iand', 31, 'bits'), 'offset'), ('bfm', 'bits', 'offset')),
+
    (('ibitfield_extract', 'value', 'offset', 'bits'),
    ('bcsel', ('ieq', 0, 'bits'),
     0,
@@ -783,7 +1205,7 @@ optimizations.extend([
     ('ushr', 'value', 'offset'),
     ('bcsel', ('ieq', 'bits', 32),
      0xffffffff,
-      ('bfm', 'bits', 0))),
+      ('isub', ('ishl', 1, 'bits'), 1))),
    'options->lower_bitfield_extract_to_shifts'),
 
    (('ifind_msb', 'value'),
@@ -858,8 +1280,41 @@ optimizations.extend([
                            127.0))),
     'options->lower_unpack_snorm_4x8'),
 
+   (('pack_half_2x16_split', 'a@32', 'b@32'),
+    ('ior', ('ishl', ('u2u32', ('f2f16', b)), 16), ('u2u32', ('f2f16', a))),
+    'options->lower_pack_half_2x16_split'),
+
+   (('unpack_half_2x16_split_x', 'a@32'),
+    ('f2f32', ('u2u16', a)),
+    'options->lower_unpack_half_2x16_split'),
+
+   (('unpack_half_2x16_split_y', 'a@32'),
+    ('f2f32', ('u2u16', ('ushr', a, 16))),
+    'options->lower_unpack_half_2x16_split'),
+
    (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
    (('fsign', a), ('fsub', ('b2f', ('flt', 0.0, a)), ('b2f', ('flt', a, 0.0))), 'options->lower_fsign'),
+
+   # Address/offset calculations:
+   # Drivers supporting imul24 should use the nir_lower_amul() pass, this
+   # rule converts everyone else to imul:
+   (('amul', a, b), ('imul', a, b), '!options->has_imul24'),
+
+   (('imad24_ir3', a, b, 0), ('imul24', a, b)),
+   (('imad24_ir3', a, 0, c), (c)),
+   (('imad24_ir3', a, 1, c), ('iadd', a, c)),
+
+   # if first two srcs are const, crack apart the imad so constant folding
+   # can clean up the imul:
+   # TODO ffma should probably get a similar rule:
+   (('imad24_ir3', '#a', '#b', c), ('iadd', ('imul', a, b), c)),
+
+   # These will turn 24b address/offset calc back into 32b shifts, but
+   # it should be safe to get back some of the bits of precision that we
+   # already decided were not necessary:
+   (('imul24', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
+   (('imul24', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
+   (('imul24', a, 0), (0)),
 ])
 
 # bit_size dependent lowerings
@@ -877,7 +1332,7 @@ for bit_size in [8, 16, 32, 64]:
       ('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))),
      'options->lower_add_sat'),
   ]
 
-invert = OrderedDict([('feq', 'fne'), ('fne', 'feq'), ('fge', 'flt'), ('flt', 'fge')])
+invert = OrderedDict([('feq', 'fne'), ('fne', 'feq')])
 
 for left, right in 
itertools.combinations_with_replacement(invert.keys(), 2):
    optimizations.append((('inot', ('ior(is_used_once)', (left, a, b), (right, c, d))),
@@ -904,7 +1359,7 @@ for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']):
       optimizations.append(((x2yN, (b2x, a)), (b2y, a)))
 
 # Optimize away x2xN(a@N)
-for t in ['int', 'uint', 'float']:
+for t in ['int', 'uint', 'float', 'bool']:
    for N in type_sizes(t):
       x2xN = '{0}2{0}{1}'.format(t[0], N)
       aN = 'a@{0}'.format(N)
@@ -945,8 +1400,93 @@ for N, M in itertools.product(type_sizes('uint'), type_sizes('uint')):
       # The N == M case is handled by other optimizations
       pass
 
+# Downcast operations should be able to see through pack
+for t in ['i', 'u']:
+   for N in [8, 16, 32]:
+      x2xN = '{0}2{0}{1}'.format(t, N)
+      optimizations += [
+         ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)),
+         ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)),
+      ]
+
+# Optimize comparisons with up-casts
+for t in ['int', 'uint', 'float']:
+   for N, M in itertools.product(type_sizes(t), repeat=2):
+      if N == 1 or N >= M:
+         continue
+
+      x2xM = '{0}2{0}{1}'.format(t[0], M)
+      x2xN = '{0}2{0}{1}'.format(t[0], N)
+      aN = 'a@' + str(N)
+      bN = 'b@' + str(N)
+      xeq = 'feq' if t == 'float' else 'ieq'
+      xne = 'fne' if t == 'float' else 'ine'
+      xge = '{0}ge'.format(t[0])
+      xlt = '{0}lt'.format(t[0])
+
+      # Up-casts are lossless so for correctly signed comparisons of
+      # up-casted values we can do the comparison at the largest of the two
+      # original sizes and drop one or both of the casts. (We have
+      # optimizations to drop the no-op casts which this may generate.)
+      for P in type_sizes(t):
+         if P == 1 or P > N:
+            continue
+
+         bP = 'b@' + str(P)
+         optimizations += [
+            ((xeq, (x2xM, aN), (x2xM, bP)), (xeq, a, (x2xN, b))),
+            ((xne, (x2xM, aN), (x2xM, bP)), (xne, a, (x2xN, b))),
+            ((xge, (x2xM, aN), (x2xM, bP)), (xge, a, (x2xN, b))),
+            ((xlt, (x2xM, aN), (x2xM, bP)), (xlt, a, (x2xN, b))),
+            ((xge, (x2xM, bP), (x2xM, aN)), (xge, (x2xN, b), a)),
+            ((xlt, (x2xM, bP), (x2xM, aN)), (xlt, (x2xN, b), a)),
+         ]
+
+      # The next bit doesn't work on floats because the range checks would
+      # get way too complicated.
+      if t in ['int', 'uint']:
+         if t == 'int':
+            xN_min = -(1 << (N - 1))
+            xN_max = (1 << (N - 1)) - 1
+         elif t == 'uint':
+            xN_min = 0
+            xN_max = (1 << N) - 1
+         else:
+            assert False
+
+         # If we're up-casting and comparing to a constant, we can unfold
+         # the comparison into a comparison with the shrunk down constant
+         # and a check that the constant fits in the smaller bit size.
+         optimizations += [
+            ((xeq, (x2xM, aN), '#b'),
+             ('iand', (xeq, a, (x2xN, b)), (xeq, (x2xM, (x2xN, b)), b))),
+            ((xne, (x2xM, aN), '#b'),
+             ('ior', (xne, a, (x2xN, b)), (xne, (x2xM, (x2xN, b)), b))),
+            ((xlt, (x2xM, aN), '#b'),
+             ('iand', (xlt, xN_min, b),
+                      ('ior', (xlt, xN_max, b), (xlt, a, (x2xN, b))))),
+            ((xlt, '#a', (x2xM, bN)),
+             ('iand', (xlt, a, xN_max),
+                      ('ior', (xlt, a, xN_min), (xlt, (x2xN, a), b)))),
+            ((xge, (x2xM, aN), '#b'),
+             ('iand', (xge, xN_max, b),
+                      ('ior', (xge, xN_min, b), (xge, a, (x2xN, b))))),
+            ((xge, '#a', (x2xM, bN)),
+             ('iand', (xge, a, xN_min),
+                      ('ior', (xge, a, xN_max), (xge, (x2xN, a), b)))),
+         ]
+
 def fexp2i(exp, bits):
-   # We assume that exp is already in the right range.
+   # Generate an expression which constructs value 2.0^exp or 0.0.
+   #
+   # We assume that exp is already in a valid range:
+   #
+   #   * [-15, 15] for 16-bit float
+   #   * [-127, 127] for 32-bit float
+   #   * [-1023, 1023] for 64-bit float
+   #
+   # If exp is the lowest value in the valid range, a value of 0.0 is
+   # constructed. 
Otherwise, the value 2.0^exp is constructed. if bits == 16: return ('i2i16', ('ishl', ('iadd', exp, 15), 10)) elif bits == 32: @@ -957,22 +1497,40 @@ def fexp2i(exp, bits): assert False def ldexp(f, exp, bits): - # First, we clamp exp to a reasonable range. The maximum possible range - # for a normal exponent is [-126, 127] and, throwing in denormals, you get - # a maximum range of [-149, 127]. This means that we can potentially have - # a swing of +-276. If you start with FLT_MAX, you actually have to do - # ldexp(FLT_MAX, -278) to get it to flush all the way to zero. The GLSL - # spec, on the other hand, only requires that we handle an exponent value - # in the range [-126, 128]. This implementation is *mostly* correct; it - # handles a range on exp of [-252, 254] which allows you to create any - # value (including denorms if the hardware supports it) and to adjust the - # exponent of any normal value to anything you want. + # The maximum possible range for a normal exponent is [-126, 127] and, + # throwing in denormals, you get a maximum range of [-149, 127]. This + # means that we can potentially have a swing of +-276. If you start with + # FLT_MAX, you actually have to do ldexp(FLT_MAX, -278) to get it to flush + # all the way to zero. The GLSL spec only requires that we handle a subset + # of this range. From version 4.60 of the spec: + # + # "If exp is greater than +128 (single-precision) or +1024 + # (double-precision), the value returned is undefined. If exp is less + # than -126 (single-precision) or -1022 (double-precision), the value + # returned may be flushed to zero. Additionally, splitting the value + # into a significand and exponent using frexp() and then reconstructing + # a floating-point value using ldexp() should yield the original input + # for zero and all finite non-denormalized values." + # + # The SPIR-V spec has similar language. + # + # In order to handle the maximum value +128 using the fexp2i() helper + # above, we have to split the exponent in half and do two multiply + # operations. + # + # First, we clamp exp to a reasonable range. Specifically, we clamp to + # twice the full range that is valid for the fexp2i() function above. If + # exp/2 is the bottom value of that range, the fexp2i() expression will + # yield 0.0f which, when multiplied by f, will flush it to zero which is + # allowed by the GLSL and SPIR-V specs for low exponent values. If the + # value is clamped from above, then it must have been above the supported + # range of the GLSL built-in and therefore any return value is acceptable. 
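#
# Illustrative sketch of that split: for 32-bit floats the spec's maximum
# exponent of +128 has no single finite 2.0^exp representation (the largest
# finite exponent fexp2i() can build is 127), but computing
#
#    ldexp(f, exp) = f * 2.0^(exp / 2) * 2.0^(exp - exp / 2)
#
# keeps both constructed powers of two in range, e.g. +128 splits into
# 64 + 64 and the clamped extreme +254 splits into 127 + 127.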
if bits == 16: - exp = ('imin', ('imax', exp, -28), 30) + exp = ('imin', ('imax', exp, -30), 30) elif bits == 32: - exp = ('imin', ('imax', exp, -252), 254) + exp = ('imin', ('imax', exp, -254), 254) elif bits == 64: - exp = ('imin', ('imax', exp, -2044), 2046) + exp = ('imin', ('imax', exp, -2046), 2046) else: assert False @@ -1000,19 +1558,19 @@ def bitfield_reverse(u): step2 = ('ior', ('ishl', ('iand', step1, 0x00ff00ff), 8), ('ushr', ('iand', step1, 0xff00ff00), 8)) step3 = ('ior', ('ishl', ('iand', step2, 0x0f0f0f0f), 4), ('ushr', ('iand', step2, 0xf0f0f0f0), 4)) step4 = ('ior', ('ishl', ('iand', step3, 0x33333333), 2), ('ushr', ('iand', step3, 0xcccccccc), 2)) - step5 = ('ior', ('ishl', ('iand', step4, 0x55555555), 1), ('ushr', ('iand', step4, 0xaaaaaaaa), 1)) + step5 = ('ior(many-comm-expr)', ('ishl', ('iand', step4, 0x55555555), 1), ('ushr', ('iand', step4, 0xaaaaaaaa), 1)) return step5 -optimizations += [(bitfield_reverse('x@32'), ('bitfield_reverse', 'x'))] +optimizations += [(bitfield_reverse('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')] # For any float comparison operation, "cmp", if you have "a == a && a cmp b" # then the "a == a" is redundant because it's equivalent to "a is not NaN" # and, if a is a NaN then the second comparison will fail anyway. for op in ['flt', 'fge', 'feq']: optimizations += [ - (('iand', ('feq', a, a), (op, a, b)), (op, a, b)), - (('iand', ('feq', a, a), (op, b, a)), (op, b, a)), + (('iand', ('feq', a, a), (op, a, b)), ('!' + op, a, b)), + (('iand', ('feq', a, a), (op, b, a)), ('!' + op, b, a)), ] # Add optimizations to handle the case where the result of a ternary is @@ -1063,6 +1621,53 @@ for op in ['fddx', 'fddx_fine', 'fddx_coarse', ((op, 'a'), 0.0, 'info->stage == MESA_SHADER_COMPUTE && info->cs.derivative_group == DERIVATIVE_GROUP_NONE') ] +# Some optimizations for ir3-specific instructions. +optimizations += [ + # 'al * bl': If either 'al' or 'bl' is zero, return zero. + (('umul_low', '#a(is_lower_half_zero)', 'b'), (0)), + # '(ah * bl) << 16 + c': If either 'ah' or 'bl' is zero, return 'c'. + (('imadsh_mix16', '#a@32(is_lower_half_zero)', 'b@32', 'c@32'), ('c')), + (('imadsh_mix16', 'a@32', '#b@32(is_upper_half_zero)', 'c@32'), ('c')), +] + +# These kinds of sequences can occur after nir_opt_peephole_select. +# +# NOTE: fadd is not handled here because that gets in the way of ffma +# generation in the i965 driver. Instead, fadd and ffma are handled in +# late_optimizations. 
+
+for op in ['flrp']:
+   optimizations += [
+      (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
+      (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),
+      (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),
+      (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
+      (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, e, c, d)), (op, ('bcsel', a, b, e), c, d)),
+      (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', e, c, d)), (op, ('bcsel', a, b, e), c, d)),
+   ]
+
+for op in ['fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor', 'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax']:
+   optimizations += [
+      (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
+      (('bcsel', a, (op + '(is_used_once)', b, 'c(is_not_const)'), (op, b, d)), (op, b, ('bcsel', a, c, d))),
+      (('bcsel', a, (op, b, 'c(is_not_const)'), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
+      (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
+   ]
+
+for op in ['fpow']:
+   optimizations += [
+      (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),
+      (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
+      (('bcsel', a, (op + '(is_used_once)', b, c), (op, d, c)), (op, ('bcsel', a, b, d), c)),
+      (('bcsel', a, (op, b, c), (op + '(is_used_once)', d, c)), (op, ('bcsel', a, b, d), c)),
+   ]
+
+for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos']:
+   optimizations += [
+      (('bcsel', a, (op + '(is_used_once)', b), (op, c)), (op, ('bcsel', a, b, c))),
+      (('bcsel', a, (op, b), (op + '(is_used_once)', c)), (op, ('bcsel', a, b, c))),
+   ]
+
 # This section contains "late" optimizations that should be run before
 # creating ffmas and calling regular optimizations for the final time.
 # Optimizations should go here if they help code generation and conflict
@@ -1102,8 +1707,52 @@ late_optimizations = [
   (('~feq', ('fadd', a, b), 0.0), ('feq', a, ('fneg', b))),
   (('~fne', ('fadd', a, b), 0.0), ('fne', a, ('fneg', b))),
 
+   # nir_lower_to_source_mods will collapse this, but its existence during the
+   # optimization loop can prevent other optimizations.
+   (('fneg', ('fneg', a)), a),
+
+   # Subtractions get lowered during optimization, so we need to recombine them
+   (('fadd', 'a', ('fneg', 'b')), ('fsub', 'a', 'b'), '!options->lower_sub'),
+   (('iadd', 'a', ('ineg', 'b')), ('isub', 'a', 'b'), '!options->lower_sub'),
+   (('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'),
+   (('ineg', a), ('isub', 0, a), 'options->lower_negate'),
+
+   # These are duplicated from the main optimizations table. The late
+   # patterns that rearrange expressions like x - .5 < 0 to x < .5 can create
+   # new patterns like these. The patterns that compare with zero are removed
+   # because they are unlikely to be created by anything in
+   # late_optimizations.
+   (('flt', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('flt', a, b)),
+   (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),
+   (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),
+   (('fge', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('fge', b, a)),
+   (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),
+   (('fne', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fne', a, b)),
+
+   (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),
+   (('flt', ('fsat(is_used_once)', a), 1.0), ('flt', a, 1.0)),
+
+   (('~fge', ('fmin(is_used_once)', ('fadd(is_used_once)', a, b), ('fadd', c, d)), 0.0), ('iand', ('fge', a, ('fneg', b)), ('fge', c, ('fneg', d)))),
+
+   (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
+   (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
+   (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
+   (('fne', ('fneg', a), ('fneg', b)), ('fne', b, a)),
+   (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
+   (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
+   (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
+   (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
+   (('fne', ('fneg', a), -1.0), ('fne', 1.0, a)),
+   (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),
+
+   (('ior', a, a), a),
+   (('iand', a, a), a),
+
+   (('iand', ('ine(is_used_once)', 'a@32', 0), ('ine', 'b@32', 0)), ('ine', ('umin', a, b), 0)),
+   (('ior', ('ieq(is_used_once)', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('umin', a, b), 0)),
+
+   (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),
+
    (('fdot2', a, b), ('fdot_replicated2', a, b), 'options->fdot_replicates'),
    (('fdot3', a, b), ('fdot_replicated3', a, b), 'options->fdot_replicates'),
    (('fdot4', a, b), ('fdot_replicated4', a, b), 'options->fdot_replicates'),
@@ -1112,19 +1761,98 @@ late_optimizations = [
    (('~flrp@32', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),
    (('~flrp@64', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),

-   (('b2f(is_used_more_than_once)', ('inot', 'a@1')), ('bcsel', a, 0.0, 1.0)),
-   (('fneg(is_used_more_than_once)', ('b2f', ('inot', 'a@1'))), ('bcsel', a, -0.0, -1.0)),
-
    (('~fadd@32', 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp32'),
    (('~fadd@64', 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp64'),

+   # A similar operation could apply to any ffma(#a, b, #(-a/2)), but this
+   # particular operation is common for expanding values stored in a texture
+   # from [0,1] to [-1,1].
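A quick plain-Python spot check (illustrative only, using non-negative ints to stand in for unsigned 32-bit values) of the umin-based combination of the two zero tests above:

import itertools

for x, y in itertools.product([0, 1, 5, 0xffffffff], repeat=2):
    # (x != 0) && (y != 0) is the same as umin(x, y) != 0 for unsigned values.
    assert ((x != 0) and (y != 0)) == (min(x, y) != 0)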
+   (('~ffma@32', a,  2.0, -1.0), ('flrp', -1.0,  1.0,          a ), '!options->lower_flrp32'),
+   (('~ffma@32', a, -2.0, -1.0), ('flrp', -1.0,  1.0, ('fneg', a)), '!options->lower_flrp32'),
+   (('~ffma@32', a, -2.0,  1.0), ('flrp',  1.0, -1.0,          a ), '!options->lower_flrp32'),
+   (('~ffma@32', a,  2.0,  1.0), ('flrp',  1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),
+   (('~fadd@32', ('fmul(is_used_once)',  2.0, a), -1.0), ('flrp', -1.0,  1.0,          a ), '!options->lower_flrp32'),
+   (('~fadd@32', ('fmul(is_used_once)', -2.0, a), -1.0), ('flrp', -1.0,  1.0, ('fneg', a)), '!options->lower_flrp32'),
+   (('~fadd@32', ('fmul(is_used_once)', -2.0, a),  1.0), ('flrp',  1.0, -1.0,          a ), '!options->lower_flrp32'),
+   (('~fadd@32', ('fmul(is_used_once)',  2.0, a),  1.0), ('flrp',  1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),
+
+   # flrp(a, b, a)
+   # a*(1-a) + b*a
+   # a + -a*a + a*b    (1)
+   # a + a*(b - a)
+   # Option 1: ffma(a, (b-a), a)
+   #
+   # Alternately, after (1):
+   # a*(1+b) + -a*a
+   # a*((1+b) + -a)
+   #
+   # Let b=1
+   #
+   # Option 2: ffma(a, 2, -(a*a))
+   # Option 3: ffma(a, 2, (-a)*a)
+   # Option 4: ffma(a, -a, (2*a))
+   # Option 5: a * (2 - a)
+   #
+   # There are a lot of other possible combinations.
+   (('~ffma@32', ('fadd', b, ('fneg', a)), a, a), ('flrp', a, b, a), '!options->lower_flrp32'),
+   (('~ffma@32', a, 2.0, ('fneg', ('fmul', a, a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
+   (('~ffma@32', a, 2.0, ('fmul', ('fneg', a), a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
+   (('~ffma@32', a, ('fneg', a), ('fmul', 2.0, a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
+   (('~fmul@32', a, ('fadd', 2.0, ('fneg', a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
+
    # we do these late so that we don't get in the way of creating ffmas
    (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))),
    (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))),

    (('bcsel', a, 0, ('b2f32', ('inot', 'b@bool'))), ('b2f32', ('inot', ('ior', a, b)))),
+
+   # Putting this in 'optimizations' interferes with the bcsel(a, op(b, c),
+   # op(b, d)) => op(b, bcsel(a, c, d)) transformations. I do not know why.
+   (('bcsel', ('feq', ('fsqrt', 'a(is_not_negative)'), 0.0), intBitsToFloat(0x7f7fffff), ('frsq', a)),
+    ('fmin', ('frsq', a), intBitsToFloat(0x7f7fffff))),
+
+   # Things that look like DPH in the source shader may get expanded to
+   # something that looks like dot(v1.xyz, v2.xyz) + v1.w by the time it gets
+   # to NIR. After FFMA is generated, this can look like:
+   #
+   #    fadd(ffma(v1.z, v2.z, ffma(v1.y, v2.y, fmul(v1.x, v2.x))), v1.w)
+   #
+   # Reassociate the last addition into the first multiplication.
+   #
+   # Some shaders do not use 'invariant' in vertex and (possibly) geometry
+   # shader stages on some outputs that are intended to be invariant. For
+   # various reasons, this optimization may not be fully applied in all
+   # shaders used for different rendering passes of the same geometry. This
+   # can result in Z-fighting artifacts (at best). For now, disable this
+   # optimization in these stages. See bugzilla #111490. In tessellation
+   # stages applications seem to use 'precise' when necessary, so allow the
+   # optimization in those stages.
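The flrp(a, b, a) derivation above can be spot-checked numerically with a throwaway Python snippet (illustration only; flrp here is just the textbook a*(1-c) + b*c definition):

def flrp(x, y, t):
    return x * (1.0 - t) + y * t

for a_val in [0.0, 0.25, 0.5, 1.0, 2.0, -3.0]:
    # Option 2: with b = 1, flrp(a, 1, a) == 2*a - a*a.
    assert abs(flrp(a_val, 1.0, a_val) - (2.0 * a_val - a_val * a_val)) < 1e-9
    for b_val in [0.0, 0.5, 1.0, -2.0]:
        # Option 1: flrp(a, b, a) == ffma(a, b - a, a).
        assert abs(flrp(a_val, b_val, a_val) - ((b_val - a_val) * a_val + a_val)) < 1e-9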
+   (('~fadd', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'),
+    ('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
+   (('~fadd', ('ffma(is_used_once)', a, b, ('fmul', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),
+    ('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
+
+   # Convert f2fmp instructions to concrete f2f16 instructions. At this point
+   # any conversions that could have been removed will have been removed in
+   # nir_opt_algebraic so any remaining ones are required.
+   (('f2fmp', a), ('f2f16', a)),
 ]

+for op in ['fadd']:
+   late_optimizations += [
+      (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),
+      (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
+   ]
+
+for op in ['ffma']:
+   late_optimizations += [
+      (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
+      (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),
+
+      (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),
+      (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
+   ]
+
 print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())
 print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma",
                                   before_ffma_optimizations).render())
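Finally, a small numeric illustration (plain Python, no NIR involved) of the DPH-style reassociation above; the two groupings agree only up to rounding, which is why the pattern is marked inexact with '~':

v1 = (0.25, -1.5, 3.0, 2.0)
v2 = (4.0, 0.5, -2.0, 1.0)

# fadd(ffma(v1.z, v2.z, ffma(v1.y, v2.y, fmul(v1.x, v2.x))), v1.w)
before = (v1[2] * v2[2] + (v1[1] * v2[1] + v1[0] * v2[0])) + v1[3]
# ffma(v1.z, v2.z, ffma(v1.y, v2.y, ffma(v1.x, v2.x, v1.w)))
after = v1[2] * v2[2] + (v1[1] * v2[1] + (v1[0] * v2[0] + v1[3]))

assert abs(before - after) < 1e-9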