nir/algebraic: Reassociate open-coded flrp(1, b, c)

[mesa.git] / src / compiler / nir / nir_opt_algebraic.py
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py

index e8696abaf528f1d61c4ff8b27ec92a3c8c754d97..0d708d4fe1af7e636176eb3231e5f6c9dce51873 100644 (file)
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -35,6 +35,7 @@ a = 'a'
  b = 'b'
  c = 'c'
  d = 'd'
+e = 'e'
  
  # Written in the form (<search>, <replace>) where <search> is an expression
  # and <replace> is either an expression or a value.  An expression is
@@ -126,11 +127,22 @@ optimizations = [
     (('~flrp', a, b, 1.0), b),
     (('~flrp', a, a, b), a),
     (('~flrp', 0.0, a, b), ('fmul', a, b)),
+
+   # flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c)
+   (('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)),
+   (('~flrp@32', a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp32'),
+   (('~flrp@64', a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp64'),
+
+   (('~flrp@32', ('fadd', a, b), ('fadd', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp32'),
+   (('~flrp@64', ('fadd', a, b), ('fadd', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp64'),
+
+   (('~flrp@32', a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp32'),
+   (('~flrp@64', a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp64'),
+
+   (('~flrp', ('fmul(is_used_once)', a, b), ('fmul(is_used_once)', a, c), d), ('fmul', ('flrp', b, c, d), a)),
+
     (('~flrp', a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp32'),
     (('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)),
-   (('flrp@16', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp16'),
-   (('flrp@32', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp32'),
-   (('flrp@64', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp64'),
     (('ftrunc', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),
     (('ffloor', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
     (('fadd', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
@@ -145,6 +157,9 @@ optimizations = [
     (('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
     (('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma'),
  
+   (('~fmul', ('fadd', ('iand', ('ineg', ('b2i32', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
+    ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),
+
     (('fdot4', ('vec4', a, b,   c,   1.0), d), ('fdph',  ('vec3', a, b, c), d)),
     (('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),
     (('fdot4', ('vec4', a, b,   0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),
@@ -218,6 +233,9 @@ optimizations = [
     (('fge', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('inot', ('ior', a, b))),
     (('fge', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('inot', ('ior', a, b))),
  
+   (('flt', a, ('fneg', a)), ('flt', a, 0.0)),
+   (('fge', a, ('fneg', a)), ('fge', a, 0.0)),
+
     # Some optimizations (below) convert things like (a < b || c < b) into
     # (min(a, c) < b).  However, this interfers with the previous optimizations
     # that try to remove comparisons with negated sums of b2f.  This just
@@ -394,9 +412,18 @@ optimizations = [
     (('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)),
     (('ior', ('uge', 2, a), ('ieq', a, 3)), ('uge', 3, a)),
  
+   # The (i2f32, ...) part is an open-coded fsign.  When that is combined with
+   # the bcsel, it's basically copysign(1.0, a).  There is no copysign in NIR,
+   # so emit an open-coded version of that.
+   (('bcsel@32', ('feq', a, 0.0), 1.0, ('i2f32', ('iadd', ('b2i32', ('flt', 0.0, 'a@32')), ('ineg', ('b2i32', ('flt', 'a@32', 0.0)))))),
+    ('ior', 0x3f800000, ('iand', a, 0x80000000))),
+
     (('ior', a, ('ieq', a, False)), True),
     (('ior', a, ('inot', a)), -1),
  
+   (('ine', ('ineg', ('b2i32', 'a@1')), ('ineg', ('b2i32', 'b@1'))), ('ine', a, b)),
+   (('b2i32', ('ine', 'a@1', 'b@1')), ('b2i32', ('ixor', a, b))),
+
     (('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('ior', 'a@32', 'b@32'), 0)),
  
     # These patterns can result when (a < b || a < c) => (a < min(b, c))
@@ -824,6 +851,7 @@ optimizations.extend([
       'options->lower_unpack_snorm_4x8'),
  
     (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
+   (('fsign', a), ('fsub', ('b2f', ('flt', 0.0, a)), ('b2f', ('flt', a, 0.0))), 'options->lower_fsign'),
  ])
  
  # bit_size dependent lowerings
@@ -867,6 +895,48 @@ for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']):
     x2yN = '{}2{}'.format(x, y)
     optimizations.append(((x2yN, (b2x, a)), (b2y, a)))
  
+# Optimize away x2xN(a@N)
+for t in ['int', 'uint', 'float']:
+   for N in type_sizes(t):
+      x2xN = '{0}2{0}{1}'.format(t[0], N)
+      aN = 'a@{0}'.format(N)
+      optimizations.append(((x2xN, aN), a))
+
+# Optimize x2xN(y2yM(a@P)) -> y2yN(a) for integers
+# In particular, we can optimize away everything except upcast of downcast and
+# upcasts where the type differs from the other cast
+for N, M in itertools.product(type_sizes('uint'), type_sizes('uint')):
+   if N < M:
+      # The outer cast is a down-cast.  It doesn't matter what the size of the
+      # argument of the inner cast is because we'll never been in the upcast
+      # of downcast case.  Regardless of types, we'll always end up with y2yN
+      # in the end.
+      for x, y in itertools.product(['i', 'u'], ['i', 'u']):
+         x2xN = '{0}2{0}{1}'.format(x, N)
+         y2yM = '{0}2{0}{1}'.format(y, M)
+         y2yN = '{0}2{0}{1}'.format(y, N)
+         optimizations.append(((x2xN, (y2yM, a)), (y2yN, a)))
+   elif N > M:
+      # If the outer cast is an up-cast, we have to be more careful about the
+      # size of the argument of the inner cast and with types.  In this case,
+      # the type is always the type of type up-cast which is given by the
+      # outer cast.
+      for P in type_sizes('uint'):
+         # We can't optimize away up-cast of down-cast.
+         if M < P:
+            continue
+
+         # Because we're doing down-cast of down-cast, the types always have
+         # to match between the two casts
+         for x in ['i', 'u']:
+            x2xN = '{0}2{0}{1}'.format(x, N)
+            x2xM = '{0}2{0}{1}'.format(x, M)
+            aP = 'a@{0}'.format(P)
+            optimizations.append(((x2xN, (x2xM, aP)), (x2xN, a)))
+   else:
+      # The N == M case is handled by other optimizations
+      pass
+
  def fexp2i(exp, bits):
     # We assume that exp is already in the right range.
     if bits == 16:
@@ -1004,6 +1074,10 @@ before_ffma_optimizations = [
     (('iadd', a, ('iadd', ('ineg', a), b)), b),
     (('~fadd', ('fneg', a), ('fadd', a, b)), b),
     (('~fadd', a, ('fadd', ('fneg', a), b)), b),
+
+   (('~flrp@32', ('fadd(is_used_once)', a, -1.0), ('fadd(is_used_once)', a,  1.0), d), ('fadd', ('flrp', -1.0,  1.0, d), a)),
+   (('~flrp@32', ('fadd(is_used_once)', a,  1.0), ('fadd(is_used_once)', a, -1.0), d), ('fadd', ('flrp',  1.0, -1.0, d), a)),
+   (('~flrp@32', ('fadd(is_used_once)', a, '#b'), ('fadd(is_used_once)', a, '#c'), d), ('fadd', ('fmul', d, ('fadd', c, ('fneg', b))), ('fadd', a, b))),
  ]
  
  # This section contains "late" optimizations that should be run after the
@@ -1027,9 +1101,15 @@ late_optimizations = [
     (('fdot4', a, b), ('fdot_replicated4', a, b), 'options->fdot_replicates'),
     (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'),
  
+   (('~flrp@32', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),
+   (('~flrp@64', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),
+
     (('b2f(is_used_more_than_once)', ('inot', 'a@1')), ('bcsel', a, 0.0, 1.0)),
     (('fneg(is_used_more_than_once)', ('b2f', ('inot', 'a@1'))), ('bcsel', a, -0.0, -1.0)),
  
+   (('~fadd@32', 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp32'),
+   (('~fadd@64', 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp64'),
+
     # we do these late so that we don't get in the way of creating ffmas
     (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))),
     (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))),