nir/algebraic: add 16-bit versions of a few 32-bit patterns

[mesa.git] / src / compiler / nir / nir_opt_algebraic.py
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py

index 65cff5c04eb6ff18852105d8aa2ca62fd2d2ec0e..5bede77c3b98e28b99118397766dff284cbf6e9e 100644 (file)
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -90,9 +90,9 @@ def intBitsToFloat(i):
  
  optimizations = [
  
-   (('imul', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
-   (('imul', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
-   (('ishl', a, '#b@32'), ('imul', a, ('ishl', 1, b)), 'options->lower_bitops'),
+   (('imul', a, '#b(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
+   (('imul', a, '#b(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
+   (('ishl', a, '#b'), ('imul', a, ('ishl', 1, b)), 'options->lower_bitops'),
  
     (('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
     (('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
@@ -102,9 +102,9 @@ optimizations = [
     (('idiv', a, 1), a),
     (('umod', a, 1), 0),
     (('imod', a, 1), 0),
-   (('udiv', a, '#b@32(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b)), '!options->lower_bitops'),
-   (('idiv', a, '#b@32(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), 'options->lower_idiv'),
-   (('idiv', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), 'options->lower_idiv'),
+   (('udiv', a, '#b(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b)), '!options->lower_bitops'),
+   (('idiv', a, '#b(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), 'options->lower_idiv'),
+   (('idiv', a, '#b(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), 'options->lower_idiv'),
     (('umod', a, '#b(is_pos_power_of_two)'),    ('iand', a, ('isub', b, 1))),
  
     (('~fneg', ('fneg', a)), a),
@@ -186,7 +186,7 @@ optimizations = [
     (('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
     (('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma'),
  
-   (('~fmul', ('fadd', ('iand', ('ineg', ('b2i32', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
+   (('~fmul', ('fadd', ('iand', ('ineg', ('b2i', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
      ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),
  
     (('fdph', a, b), ('fdot4', ('vec4', 'a.x', 'a.y', 'a.z', 1.0), b), 'options->lower_fdph'),
@@ -266,10 +266,10 @@ for log2 in range(1, 7): # powers of two from 2 to 64
  
     optimizations.extend([
         # 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)'
-       (('ishl@32', ('ushr@32', a, log2), log2), ('iand', a, mask)),
+       (('ishl', ('ushr', a, log2), log2), ('iand', a, mask)),
  
         # Reassociate for improved CSE
-       (('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)),
+       (('iand', ('iadd', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)),
     ])
  
  # To save space in the state tables, reduce to the set that is known to help.
@@ -329,10 +329,14 @@ optimizations.extend([
     (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)),
     (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),
  
-   (('flt', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('flt', a, b)),
+   # flt(fsat(a), b > 0 && b < 1) is inexact if a is NaN (fsat(NaN) is 0)
+   # because it returns True while flt(a, b) always returns False.
+   (('~flt', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('flt', a, b)),
     (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),
     (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),
-   (('fge', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('fge', b, a)),
+   # fge(b > 0 && b < 1, fsat(a)) is inexact if a is NaN (fsat(NaN) is 0)
+   # because it returns True while fge(b, a) always returns False.
+   (('~fge', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('fge', b, a)),
     (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),
     (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),
  
@@ -543,7 +547,7 @@ optimizations.extend([
     (('fmax', a, ('fabs', a)), ('fabs', a)),
     (('imax', a, ('iabs', a)), ('iabs', a)),
     (('fmax', a, ('fneg', a)), ('fabs', a)),
-   (('imax', a, ('ineg', a)), ('iabs', a)),
+   (('imax', a, ('ineg', a)), ('iabs', a), '!options->lower_iabs'),
     (('~fmax', ('fabs', a), 0.0), ('fabs', a)),
     (('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
     # fmax(fmin(a, 1.0), 0.0) is inexact because it returns 1.0 on NaN, while
@@ -568,10 +572,10 @@ optimizations.extend([
     (('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),
     (('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)),
     # Both the left and right patterns are "b" when isnan(a), so this is exact.
-   (('fmax', ('fsat', a), '#b@32(is_zero_to_one)'), ('fsat', ('fmax', a, b))),
+   (('fmax', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmax', a, b))),
     # The left pattern is 0.0 when isnan(a) (because fmin(fsat(NaN), b) ->
     # fmin(0.0, b)) while the right one is "b", so this optimization is inexact.
-   (('~fmin', ('fsat', a), '#b@32(is_zero_to_one)'), ('fsat', ('fmin', a, b))),
+   (('~fmin', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmin', a, b))),
  
     # If a in [0,b] then b-a is also in [0,b].  Since b in [0,1], max(b-a, 0) =
     # fsat(b-a).
@@ -579,7 +583,7 @@ optimizations.extend([
     # If a > b, then b-a < 0 and max(b-a, 0) = fsat(b-a) = 0
     #
     # This should be NaN safe since max(NaN, 0) = fsat(NaN) = 0.
-   (('fmax', ('fadd(is_used_once)', ('fneg', 'a(is_not_negative)'), '#b@32(is_zero_to_one)'), 0.0),
+   (('fmax', ('fadd(is_used_once)', ('fneg', 'a(is_not_negative)'), '#b(is_zero_to_one)'), 0.0),
      ('fsat', ('fadd', ('fneg',  a), b)), '!options->lower_fsat'),
  
     (('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)),
@@ -645,8 +649,8 @@ optimizations.extend([
     (('ior', a, ('ieq', a, False)), True),
     (('ior', a, ('inot', a)), -1),
  
-   (('ine', ('ineg', ('b2i32', 'a@1')), ('ineg', ('b2i32', 'b@1'))), ('ine', a, b)),
-   (('b2i32', ('ine', 'a@1', 'b@1')), ('b2i32', ('ixor', a, b))),
+   (('ine', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), ('ine', a, b)),
+   (('b2i', ('ine', 'a@1', 'b@1')), ('b2i', ('ixor', a, b))),
  
     (('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('umax', a, b), 0)),
     (('ior',  ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('umin', a, b), 0)),
@@ -749,6 +753,23 @@ optimizations.extend([
     (('fany_nequal2', a, b), ('fmax', ('sne', 'a.x', 'b.x'), ('sne', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
     (('fany_nequal3', a, b), ('fsat', ('fdot3', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
     (('fany_nequal4', a, b), ('fsat', ('fdot4', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
+
+   (('ball_iequal2', a, b), ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
+   (('ball_iequal3', a, b), ('iand', ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), ('ieq', 'a.z', 'b.z')), 'options->lower_vector_cmp'),
+   (('ball_iequal4', a, b), ('iand', ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), ('iand', ('ieq', 'a.z', 'b.z'), ('ieq', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),
+
+   (('bany_inequal2', a, b), ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
+   (('bany_inequal3', a, b), ('ior', ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), ('ine', 'a.z', 'b.z')), 'options->lower_vector_cmp'),
+   (('bany_inequal4', a, b), ('ior', ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), ('ior', ('ine', 'a.z', 'b.z'), ('ine', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),
+
+   (('ball_fequal2', a, b), ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
+   (('ball_fequal3', a, b), ('iand', ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), ('feq', 'a.z', 'b.z')), 'options->lower_vector_cmp'),
+   (('ball_fequal4', a, b), ('iand', ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), ('iand', ('feq', 'a.z', 'b.z'), ('feq', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),
+
+   (('bany_fnequal2', a, b), ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
+   (('bany_fnequal3', a, b), ('ior', ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), ('fneu', 'a.z', 'b.z')), 'options->lower_vector_cmp'),
+   (('bany_fnequal4', a, b), ('ior', ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), ('ior', ('fneu', 'a.z', 'b.z'), ('fneu', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),
+
     (('fneu', ('fneg', a), a), ('fneu', a, 0.0)),
     (('feq', ('fneg', a), a), ('feq', a, 0.0)),
     # Emulating booleans
@@ -757,6 +778,7 @@ optimizations.extend([
     (('ior', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior', a, b))),
     (('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))),
     (('fsat', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('b2f', ('ior', a, b))),
+   (('iand', 'a@bool16', 1.0), ('b2f', a)),
     (('iand', 'a@bool32', 1.0), ('b2f', a)),
     # True/False are ~0 and 0 in NIR.  b2i of True is 1, and -1 is ~0 (True).
     (('ineg', ('b2i32', 'a@32')), a),
@@ -843,6 +865,7 @@ optimizations.extend([
     (('fsin', a), lowered_sincos(0.5), 'options->lower_sincos'),
     (('fcos', a), lowered_sincos(0.75), 'options->lower_sincos'),
     # Boolean simplifications
+   (('i2b16(is_used_by_if)', a), ('ine16', a, 0)),
     (('i2b32(is_used_by_if)', a), ('ine32', a, 0)),
     (('i2b1(is_used_by_if)', a), ('ine', a, 0)),
     (('ieq', a, True), a),
@@ -851,10 +874,10 @@ optimizations.extend([
     (('ieq(is_not_used_by_if)', a, False), ('inot', 'a')),
     (('bcsel', a, True, False), a),
     (('bcsel', a, False, True), ('inot', a)),
-   (('bcsel@32', a, 1.0, 0.0), ('b2f', a)),
-   (('bcsel@32', a, 0.0, 1.0), ('b2f', ('inot', a))),
-   (('bcsel@32', a, -1.0, -0.0), ('fneg', ('b2f', a))),
-   (('bcsel@32', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))),
+   (('bcsel', a, 1.0, 0.0), ('b2f', a)),
+   (('bcsel', a, 0.0, 1.0), ('b2f', ('inot', a))),
+   (('bcsel', a, -1.0, -0.0), ('fneg', ('b2f', a))),
+   (('bcsel', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))),
     (('bcsel', True, b, c), b),
     (('bcsel', False, b, c), c),
     (('bcsel', a, ('b2f(is_used_once)', 'b@32'), ('b2f', 'c@32')), ('b2f', ('bcsel', a, b, c))),
@@ -884,6 +907,7 @@ optimizations.extend([
     (('ushr', 'a@32', ('iand', 31, b)), ('ushr', a, b)),
  
     # Conversions
+   (('i2b16', ('b2i', 'a@16')), a),
     (('i2b32', ('b2i', 'a@32')), a),
     (('f2i', ('ftrunc', a)), ('f2i', a)),
     (('f2u', ('ftrunc', a)), ('f2u', a)),
@@ -1153,14 +1177,10 @@ optimizations.extend([
     (('bcsel', a, ('bcsel', b, c, d), d), ('bcsel', ('iand', a, b), c, d)),
     (('bcsel', a, b, ('bcsel', c, b, d)), ('bcsel', ('ior', a, c), b, d)),
  
-   (('fmin3@64', a, b, c), ('fmin@64', a, ('fmin@64', b, c))),
-   (('fmax3@64', a, b, c), ('fmax@64', a, ('fmax@64', b, c))),
-   (('fmed3@64', a, b, c), ('fmax@64', ('fmin@64', ('fmax@64', a, b), c), ('fmin@64', a, b))),
-
     # Misc. lowering
     (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
     (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),
-   (('uadd_carry@32', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),
+   (('uadd_carry', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),
     (('usub_borrow@32', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'),
  
     (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
@@ -1874,9 +1894,9 @@ before_ffma_optimizations = [
     (('~fadd', ('fneg', a), ('fadd', a, b)), b),
     (('~fadd', a, ('fadd', ('fneg', a), b)), b),
  
-   (('~flrp@32', ('fadd(is_used_once)', a, -1.0), ('fadd(is_used_once)', a,  1.0), d), ('fadd', ('flrp', -1.0,  1.0, d), a)),
-   (('~flrp@32', ('fadd(is_used_once)', a,  1.0), ('fadd(is_used_once)', a, -1.0), d), ('fadd', ('flrp',  1.0, -1.0, d), a)),
-   (('~flrp@32', ('fadd(is_used_once)', a, '#b'), ('fadd(is_used_once)', a, '#c'), d), ('fadd', ('fmul', d, ('fadd', c, ('fneg', b))), ('fadd', a, b))),
+   (('~flrp', ('fadd(is_used_once)', a, -1.0), ('fadd(is_used_once)', a,  1.0), d), ('fadd', ('flrp', -1.0,  1.0, d), a)),
+   (('~flrp', ('fadd(is_used_once)', a,  1.0), ('fadd(is_used_once)', a, -1.0), d), ('fadd', ('flrp',  1.0, -1.0, d), a)),
+   (('~flrp', ('fadd(is_used_once)', a, '#b'), ('fadd(is_used_once)', a, '#c'), d), ('fadd', ('fmul', d, ('fadd', c, ('fneg', b))), ('fadd', a, b))),
  ]
  
  # This section contains "late" optimizations that should be run after the
@@ -1902,6 +1922,7 @@ late_optimizations = [
     (('iadd', 'a', ('ineg', 'b')), ('isub', 'a', 'b'), '!options->lower_sub'),
     (('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'),
     (('ineg', a), ('isub', 0, a), 'options->lower_negate'),
+   (('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'),
  
     # These are duplicated from the main optimizations table.  The late
     # patterns that rearrange expressions like x - .5 < 0 to x < .5 can create
@@ -1916,7 +1937,9 @@ late_optimizations = [
     (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),
  
     (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),
-   (('flt', ('fsat(is_used_once)', a), 1.0), ('flt', a, 1.0)),
+   # flt(fsat(a), 1.0) is inexact because it returns True if a is NaN
+   # (fsat(NaN) is 0), while flt(a, 1.0) always returns False.
+   (('~flt', ('fsat(is_used_once)', a), 1.0), ('flt', a, 1.0)),
  
     (('~fge', ('fmin(is_used_once)', ('fadd(is_used_once)', a, b), ('fadd', c, d)), 0.0), ('iand', ('fge', a, ('fneg', b)), ('fge', c, ('fneg', d)))),
  
@@ -1944,8 +1967,7 @@ late_optimizations = [
     (('fdot4', a, b), ('fdot_replicated4', a, b), 'options->fdot_replicates'),
     (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'),
  
-   (('~flrp@32', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),
-   (('~flrp@64', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),
+   (('~flrp', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),
  
     (('~fadd@32', 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp32'),
     (('~fadd@64', 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp64'),