X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fcompiler%2Fnir%2Fnir_opt_algebraic.py;h=6379a3994318fa2888b282415221a1894a233f70;hb=d41cdef2a591930ad848b27f936aafcafc93b308;hp=789a57e38b63944a0811e2db3cb926a897023a92;hpb=d40640efe8a6f3cc1d86870678ae1f89264354ec;p=mesa.git

diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 789a57e38b6..6379a399431 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -35,6 +35,7 @@ a = 'a'
 b = 'b'
 c = 'c'
 d = 'd'
+e = 'e'
 
 # Written in the form (<search>, <replace>) where <search> is an expression
 # and <replace> is either an expression or a value.  An expression is
@@ -70,6 +71,10 @@ optimizations = [
    (('imul', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b))),
    (('imul', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b))))),
+   (('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
+   (('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
+   (('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'),
+   (('umul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('umul_high', a, b)), 'options->lower_mul_2x32_64'),
 
    (('udiv', a, 1), a),
    (('idiv', a, 1), a),
    (('umod', a, 1), 0),
@@ -122,11 +127,25 @@ optimizations = [
    (('~flrp', a, b, 1.0), b),
    (('~flrp', a, a, b), a),
    (('~flrp', 0.0, a, b), ('fmul', a, b)),
+
+   # flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c)
+   (('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)),
+   (('~flrp@32', a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp32'),
+   (('~flrp@64', a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp64'),
+
+   (('~flrp@32', ('fadd', a, b), ('fadd', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp32'),
+   (('~flrp@64', ('fadd', a, b), ('fadd', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp64'),
+
+   (('~flrp@32', a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp32'),
+   (('~flrp@64', a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp64'),
+
+   (('~flrp', ('fmul(is_used_once)', a, b), ('fmul(is_used_once)', a, c), d), ('fmul', ('flrp', b, c, d), a)),
+
    (('~flrp', a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp32'),
    (('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)),
-   (('flrp@32', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp32'),
-   (('flrp@64', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp64'),
+
    (('ftrunc', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),
    (('ffloor', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
+   (('fadd', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
    (('ffract', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
    (('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'),
    (('~fadd', ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1')))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp32'),
@@ -138,6 +157,9 @@ optimizations = [
    (('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
    (('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma'),
 
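The flrp rules added in the hunk above all follow from flrp(a, b, c) = a*(1 - c) + b*c = a + c*(b - a): substituting b = a + d collapses the expression to a + c*d, which is what the rewrites produce (the '~' marks each rule as inexact, since floating-point reassociation is involved). A standalone sanity check in plain Python; the helper and its names are illustrative, not part of the patch:

    def flrp(lo, hi, t):
        # NIR's flrp: lo*(1 - t) + hi*t.
        return lo * (1.0 - t) + hi * t

    # flrp(a, a + d, t) == a + d*t; exact for these exactly-representable inputs.
    for base, delta, t in [(1.0, 2.0, 0.25), (-3.5, 0.5, 0.75), (10.0, -4.0, 0.5)]:
        assert flrp(base, base + delta, t) == base + delta * t
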
+   (('~fmul', ('fadd', ('iand', ('ineg', ('b2i32', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
+    ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),
+
    (('fdot4', ('vec4', a, b, c, 1.0), d), ('fdph', ('vec3', a, b, c), d)),
    (('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),
    (('fdot4', ('vec4', a, b, 0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),
@@ -211,6 +233,9 @@ optimizations = [
    (('fge', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('inot', ('ior', a, b))),
    (('fge', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('inot', ('ior', a, b))),
 
+   (('flt', a, ('fneg', a)), ('flt', a, 0.0)),
+   (('fge', a, ('fneg', a)), ('fge', a, 0.0)),
+
    # Some optimizations (below) convert things like (a < b || c < b) into
    # (min(a, c) < b).  However, this interferes with the previous optimizations
    # that try to remove comparisons with negated sums of b2f.  This just
@@ -306,7 +331,10 @@ optimizations = [
    (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
    (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
    (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
-   (('bcsel', a, True, 'b@bool'), ('ior', a, b)),
+   (('bcsel', a, True, b), ('ior', a, b)),
+   (('bcsel', a, a, b), ('ior', a, b)),
+   (('bcsel', a, b, False), ('iand', a, b)),
+   (('bcsel', a, b, a), ('iand', a, b)),
    (('fmin', a, a), a),
    (('fmax', a, a), a),
    (('imin', a, a), a),
@@ -336,6 +364,7 @@ optimizations = [
    (('~fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
    (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
    (('fsat', ('fsign', a)), ('b2f', ('flt', 0.0, a))),
+   (('fsat', ('b2f', a)), ('b2f', a)),
    (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
    (('fsat', ('fsat', a)), ('fsat', a)),
    (('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),
@@ -383,9 +412,18 @@ optimizations = [
    (('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)),
    (('ior', ('uge', 2, a), ('ieq', a, 3)), ('uge', 3, a)),
 
-   (('ior', 'a@bool', ('ieq', a, False)), True),
+   # The (i2f32, ...) part is an open-coded fsign.  When that is combined with
+   # the bcsel, it's basically copysign(1.0, a).  There is no copysign in NIR,
+   # so emit an open-coded version of that.
+   (('bcsel@32', ('feq', a, 0.0), 1.0, ('i2f32', ('iadd', ('b2i32', ('flt', 0.0, 'a@32')), ('ineg', ('b2i32', ('flt', 'a@32', 0.0)))))),
+    ('ior', 0x3f800000, ('iand', a, 0x80000000))),
+
+   (('ior', a, ('ieq', a, False)), True),
    (('ior', a, ('inot', a)), -1),
 
+   (('ine', ('ineg', ('b2i32', 'a@1')), ('ineg', ('b2i32', 'b@1'))), ('ine', a, b)),
+   (('b2i32', ('ine', 'a@1', 'b@1')), ('b2i32', ('ixor', a, b))),
+
    (('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('ior', 'a@32', 'b@32'), 0)),
 
    # These patterns can result when (a < b || a < c) => (a < min(b, c))
@@ -433,6 +471,12 @@ optimizations = [
    (('uge', '#a', ('umax', '#b', c)), ('iand', ('uge', a, b), ('uge', a, c))),
    (('uge', ('umin', '#a', b), '#c'), ('iand', ('uge', a, c), ('uge', b, c))),
 
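The open-coded copysign rule above (the bcsel@32/feq pattern) works directly on the IEEE-754 bit pattern: 0x3f800000 is 1.0f, and masking with 0x80000000 keeps only a's sign bit. A quick bit-level check of that claim in plain Python, with struct round-trips standing in for NIR's reinterpreting moves (helper names are illustrative, not from the patch):

    import math
    import struct

    def f2u(f):
        # Bits of a 32-bit float, as an unsigned int.
        return struct.unpack('<I', struct.pack('<f', f))[0]

    def u2f(u):
        # The 32-bit float whose bits are u.
        return struct.unpack('<f', struct.pack('<I', u))[0]

    # ior(0x3f800000, iand(a, 0x80000000)) == copysign(1.0, a),
    # including for -0.0 and the infinities.
    for x in [2.5, -7.25, 0.0, -0.0, float('inf'), float('-inf')]:
        assert u2f(0x3f800000 | (f2u(x) & 0x80000000)) == math.copysign(1.0, x)
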
+   # Thanks to sign extension, the ishr(a, b) is negative if and only if a is
+   # negative.
+   (('bcsel', ('ilt', a, 0), ('ineg', ('ishr', a, b)), ('ishr', a, b)),
+    ('iabs', ('ishr', a, b))),
+   (('iabs', ('ishr', ('iabs', a), b)), ('ishr', ('iabs', a), b)),
+
    (('fabs', ('slt', a, b)), ('slt', a, b)),
    (('fabs', ('sge', a, b)), ('sge', a, b)),
    (('fabs', ('seq', a, b)), ('seq', a, b)),
@@ -522,10 +566,10 @@ optimizations = [
    # Boolean simplifications
    (('i2b32(is_used_by_if)', a), ('ine32', a, 0)),
    (('i2b1(is_used_by_if)', a), ('ine', a, 0)),
-   (('ieq', 'a@bool', True), a),
-   (('ine(is_not_used_by_if)', 'a@bool', True), ('inot', a)),
-   (('ine', 'a@bool', False), a),
-   (('ieq(is_not_used_by_if)', 'a@bool', False), ('inot', 'a')),
+   (('ieq', a, True), a),
+   (('ine(is_not_used_by_if)', a, True), ('inot', a)),
+   (('ine', a, False), a),
+   (('ieq(is_not_used_by_if)', a, False), ('inot', 'a')),
    (('bcsel', a, True, False), a),
    (('bcsel', a, False, True), ('inot', a)),
    (('bcsel@32', a, 1.0, 0.0), ('b2f', a)),
@@ -598,18 +642,32 @@ optimizations = [
     ('unpack_64_2x32_split_y', a)), a),
 
    # Byte extraction
-   (('ushr', ('ishl', 'a@32', 24), 24), ('extract_u8', a, 0), '!options->lower_extract_byte'),
-   (('ushr', ('ishl', 'a@32', 16), 24), ('extract_u8', a, 1), '!options->lower_extract_byte'),
-   (('ushr', ('ishl', 'a@32', 8), 24), ('extract_u8', a, 2), '!options->lower_extract_byte'),
+   (('ushr', 'a@16', 8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
    (('ushr', 'a@32', 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),
-   (('ishr', ('ishl', 'a@32', 24), 24), ('extract_i8', a, 0), '!options->lower_extract_byte'),
-   (('ishr', ('ishl', 'a@32', 16), 24), ('extract_i8', a, 1), '!options->lower_extract_byte'),
-   (('ishr', ('ishl', 'a@32', 8), 24), ('extract_i8', a, 2), '!options->lower_extract_byte'),
+   (('ushr', 'a@64', 56), ('extract_u8', a, 7), '!options->lower_extract_byte'),
+   (('ishr', 'a@16', 8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
    (('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'),
-   (('iand', 0xff, ('ushr', a, 16)), ('extract_u8', a, 2), '!options->lower_extract_byte'),
-   (('iand', 0xff, ('ushr', a, 8)), ('extract_u8', a, 1), '!options->lower_extract_byte'),
-   (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'),
+   (('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'),
+   (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte')
+]
+
+# After the ('extract_u8', a, 0) pattern, above, triggers, there will be
+# patterns like those below.
+for op in ('ushr', 'ishr'):
+   optimizations.extend([(('extract_u8', (op, 'a@16', 8), 0), ('extract_u8', a, 1))])
+   optimizations.extend([(('extract_u8', (op, 'a@32', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 4)])
+   optimizations.extend([(('extract_u8', (op, 'a@64', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 8)])
+
+optimizations.extend([(('extract_u8', ('extract_u16', a, 1), 0), ('extract_u8', a, 2))])
+
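All of the generated patterns above reduce to the fact that extract_u8(a, i) selects byte i of a, i.e. (a >> (8*i)) & 0xff, so shifting first and then taking byte 0 is the same as taking a higher byte directly. For example, the 32-bit loop emits (('extract_u8', ('ushr', 'a@32', 16), 0), ('extract_u8', a, 2)). A standalone illustration in plain Python (the helper is illustrative, not part of the patch):

    def extract_u8(value, byte):
        # Unsigned byte extraction: byte 0 is the least significant.
        return (value >> (8 * byte)) & 0xff

    word = 0xDDCCBBAA
    assert extract_u8(word >> 16, 0) == extract_u8(word, 2) == 0xCC
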
+# After the ('extract_[iu]8', a, 3) patterns, above, trigger, there will be
+# patterns like those below.
+for op in ('extract_u8', 'extract_i8'):
+   optimizations.extend([((op, ('ishl', 'a@16', 8), 1), (op, a, 0))])
+   optimizations.extend([((op, ('ishl', 'a@32', 24 - 8 * i), 3), (op, a, i)) for i in range(2, -1, -1)])
+   optimizations.extend([((op, ('ishl', 'a@64', 56 - 8 * i), 7), (op, a, i)) for i in range(6, -1, -1)])
+
+optimizations.extend([
    # Word extraction
    (('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0), '!options->lower_extract_word'),
    (('ushr', 'a@32', 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
@@ -662,6 +720,7 @@ optimizations = [
    (('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)),
 
    # Misc. lowering
+   (('fmod@16', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod16'),
    (('fmod@32', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod32'),
    (('fmod@64', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod64'),
    (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod32'),
@@ -672,6 +731,12 @@ optimizations = [
     ('bcsel', ('ilt', 31, 'bits'), 'insert', ('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')),
    'options->lower_bitfield_insert'),
 
+   (('ihadd', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
+   (('uhadd', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
+   (('irhadd', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
+   (('urhadd', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
+   (('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_add_sat'),
+   (('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_add_sat'),
 
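The lower_hadd forms just above rest on the identity a + b == ((a & b) << 1) + (a ^ b): the AND captures the carries and the XOR the carry-less sum, so halving each term gives the average without ever forming the full-width a + b. The round-up variants use the complementary a + b == ((a | b) << 1) - (a ^ b). An exhaustive 8-bit check in plain Python (illustrative, not from the patch):

    for x in range(256):
        for y in range(256):
            assert (x & y) + ((x ^ y) >> 1) == (x + y) >> 1       # uhadd
            assert (x | y) - ((x ^ y) >> 1) == (x + y + 1) >> 1   # urhadd
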
    # Alternative lowering that doesn't rely on bfi.
    (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
@@ -786,7 +851,23 @@ optimizations = [
    'options->lower_unpack_snorm_4x8'),
 
    (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
-]
+   (('fsign', a), ('fsub', ('b2f', ('flt', 0.0, a)), ('b2f', ('flt', a, 0.0))), 'options->lower_fsign'),
+])
+
+# bit_size dependent lowerings
+for bit_size in [8, 16, 32, 64]:
+   # convenience constants
+   intmax = (1 << (bit_size - 1)) - 1
+   intmin = 1 << (bit_size - 1)
+
+   optimizations += [
+      (('iadd_sat@' + str(bit_size), a, b),
+       ('bcsel', ('ige', b, 1), ('bcsel', ('ilt', ('iadd', a, b), a), intmax, ('iadd', a, b)),
+                                ('bcsel', ('ilt', a, ('iadd', a, b)), intmin, ('iadd', a, b))), 'options->lower_add_sat'),
+      (('isub_sat@' + str(bit_size), a, b),
+       ('bcsel', ('ilt', b, 0), ('bcsel', ('ilt', ('isub', a, b), a), intmax, ('isub', a, b)),
+                                ('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_add_sat'),
+   ]
 
 invert = OrderedDict([('feq', 'fne'), ('fne', 'feq'), ('fge', 'flt'), ('flt', 'fge')])
 
@@ -814,9 +895,53 @@ for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']):
    x2yN = '{}2{}'.format(x, y)
    optimizations.append(((x2yN, (b2x, a)), (b2y, a)))
 
+# Optimize away x2xN(a@N)
+for t in ['int', 'uint', 'float']:
+   for N in type_sizes(t):
+      x2xN = '{0}2{0}{1}'.format(t[0], N)
+      aN = 'a@{0}'.format(N)
+      optimizations.append(((x2xN, aN), a))
+
+# Optimize x2xN(y2yM(a@P)) -> y2yN(a) for integers
+# In particular, we can optimize away everything except upcast of downcast and
+# upcasts where the type differs from the other cast
+for N, M in itertools.product(type_sizes('uint'), type_sizes('uint')):
+   if N < M:
+      # The outer cast is a down-cast.  It doesn't matter what the size of the
+      # argument of the inner cast is because we'll never be in the upcast of
+      # downcast case.  Regardless of types, we'll always end up with y2yN in
+      # the end.
+      for x, y in itertools.product(['i', 'u'], ['i', 'u']):
+         x2xN = '{0}2{0}{1}'.format(x, N)
+         y2yM = '{0}2{0}{1}'.format(y, M)
+         y2yN = '{0}2{0}{1}'.format(y, N)
+         optimizations.append(((x2xN, (y2yM, a)), (y2yN, a)))
+   elif N > M:
+      # If the outer cast is an up-cast, we have to be more careful about the
+      # size of the argument of the inner cast and with types.  In this case,
+      # the type is always the type of the up-cast, which is given by the
+      # outer cast.
+      for P in type_sizes('uint'):
+         # We can't optimize away up-cast of down-cast.
+         if M < P:
+            continue
+
+         # Because we're doing up-cast of up-cast, the types always have to
+         # match between the two casts
+         for x in ['i', 'u']:
+            x2xN = '{0}2{0}{1}'.format(x, N)
+            x2xM = '{0}2{0}{1}'.format(x, M)
+            aP = 'a@{0}'.format(P)
+            optimizations.append(((x2xN, (x2xM, aP)), (x2xN, a)))
+   else:
+      # The N == M case is handled by other optimizations
+      pass
+
 def fexp2i(exp, bits):
    # We assume that exp is already in the right range.
-   if bits == 32:
+   if bits == 16:
+      return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
+   elif bits == 32:
       return ('ishl', ('iadd', exp, 127), 23)
    elif bits == 64:
       return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))
@@ -834,7 +959,9 @@ def ldexp(f, exp, bits):
    # handles a range on exp of [-252, 254] which allows you to create any
    # value (including denorms if the hardware supports it) and to adjust the
    # exponent of any normal value to anything you want.
- if bits == 32: + if bits == 16: + exp = ('imin', ('imax', exp, -28), 30) + elif bits == 32: exp = ('imin', ('imax', exp, -252), 254) elif bits == 64: exp = ('imin', ('imax', exp, -2044), 2046) @@ -854,6 +981,7 @@ def ldexp(f, exp, bits): return ('fmul', ('fmul', f, pow2_1), pow2_2) optimizations += [ + (('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'), (('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'), (('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'), ] @@ -915,6 +1043,18 @@ for op in ['fadd', 'fmul', 'iadd', 'imul']: ((op, ('bcsel(is_used_once)', a, '#b', c), '#d'), ('bcsel', a, (op, b, d), (op, c, d))) ] +# For derivatives in compute shaders, GLSL_NV_compute_shader_derivatives +# states: +# +# If neither layout qualifier is specified, derivatives in compute shaders +# return zero, which is consistent with the handling of built-in texture +# functions like texture() in GLSL 4.50 compute shaders. +for op in ['fddx', 'fddx_fine', 'fddx_coarse', + 'fddy', 'fddy_fine', 'fddy_coarse']: + optimizations += [ + ((op, 'a'), 0.0, 'info->stage == MESA_SHADER_COMPUTE && info->cs.derivative_group == DERIVATIVE_GROUP_NONE') +] + # This section contains "late" optimizations that should be run before # creating ffmas and calling regular optimizations for the final time. # Optimizations should go here if they help code generation and conflict @@ -934,6 +1074,10 @@ before_ffma_optimizations = [ (('iadd', a, ('iadd', ('ineg', a), b)), b), (('~fadd', ('fneg', a), ('fadd', a, b)), b), (('~fadd', a, ('fadd', ('fneg', a), b)), b), + + (('~flrp@32', ('fadd(is_used_once)', a, -1.0), ('fadd(is_used_once)', a, 1.0), d), ('fadd', ('flrp', -1.0, 1.0, d), a)), + (('~flrp@32', ('fadd(is_used_once)', a, 1.0), ('fadd(is_used_once)', a, -1.0), d), ('fadd', ('flrp', 1.0, -1.0, d), a)), + (('~flrp@32', ('fadd(is_used_once)', a, '#b'), ('fadd(is_used_once)', a, '#c'), d), ('fadd', ('fmul', d, ('fadd', c, ('fneg', b))), ('fadd', a, b))), ] # This section contains "late" optimizations that should be run after the @@ -957,6 +1101,9 @@ late_optimizations = [ (('fdot4', a, b), ('fdot_replicated4', a, b), 'options->fdot_replicates'), (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'), + (('~flrp@32', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)), + (('~flrp@64', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)), + (('b2f(is_used_more_than_once)', ('inot', 'a@1')), ('bcsel', a, 0.0, 1.0)), (('fneg(is_used_more_than_once)', ('b2f', ('inot', 'a@1'))), ('bcsel', a, -0.0, -1.0)), @@ -964,7 +1111,7 @@ late_optimizations = [ (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))), (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))), - (('bcsel', 'a@bool', 0, ('b2f32', ('inot', 'b@bool'))), ('b2f32', ('inot', ('ior', a, b)))), + (('bcsel', a, 0, ('b2f32', ('inot', 'b@bool'))), ('b2f32', ('inot', ('ior', a, b)))), ] print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())
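For reference, the fexp2i helper above builds 2**exp by writing the biased exponent straight into the float's exponent field (bias 15 for the new 16-bit case, 127 for 32-bit, 1023 for 64-bit), and ldexp then splits exp across two such factors so that each one stays in the representable range. A standalone check of the 32-bit claim in plain Python, with struct standing in for NIR's bit-level ops (the helper name is illustrative, not from the patch):

    import struct

    def fexp2i32(exp):
        # (exp + 127) << 23 places the biased exponent in bits 23..30 of an
        # IEEE-754 single with a zero mantissa, i.e. exactly 2.0**exp.
        return struct.unpack('<f', struct.pack('<I', (exp + 127) << 23))[0]

    # Exact for every normal-number exponent.
    for e in range(-126, 128):
        assert fexp2i32(e) == 2.0 ** e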