nir/algebraic: Mark some value range analysis-based optimizations imprecise

[mesa.git] / src / compiler / nir / nir_opt_algebraic.py
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py

index b26adc750f96b8ddab71c6cec4dc2e59e523e39d..3fc35995bcac1251dc882e7f184c215c258614bf 100644 (file)
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -204,6 +204,7 @@ optimizations = [
     # If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1
     # If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0
     (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),
+   (('~fsub', 1.0, ('fsat', a)), ('fsat', ('fsub', 1.0, a))),
  
     # 1 - ((1 - a) * (1 - b))
     # 1 - (1 - a - b + a*b)
@@ -224,6 +225,35 @@ optimizations = [
     # (a * #b) << #c
     # a * (#b << #c)
     (('ishl', ('imul', a, '#b'), '#c'), ('imul', a, ('ishl', b, c))),
+]
+
+# Care must be taken here.  Shifts in NIR use only the lower log2(bitsize)
+# bits of the second source.  These replacements must correctly handle the
+# case where (b % bitsize) + (c % bitsize) >= bitsize.
+for s in [8, 16, 32, 64]:
+   mask = s - 1  # selects the low log2(s) bits, i.e. (count % bitsize)
+
+   ishl = "ishl@{}".format(s)
+   ishr = "ishr@{}".format(s)
+   ushr = "ushr@{}".format(s)
+
+   in_bounds = ('ult', ('iadd', ('iand', b, mask), ('iand', c, mask)), s)
+
+   optimizations.extend([
+       ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds, (ishl, a, ('iadd', b, c)), 0)),
+       ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds, (ushr, a, ('iadd', b, c)), 0)),
+
+       # To get -1 for large shifts of negative values, ishr must instead
+       # clamp the shift count to the maximum value.
+       ((ishr, (ishr, a, '#b'), '#c'),
+        (ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))),
+   ])
+
+optimizations.extend([
+   # This is common for address calculations.  Reassociating may enable the
+   # 'a<<c' to be CSE'd.  It also helps architectures that have an ISHLADD
+   # instruction or a constant offset field in load / store instructions.
+   (('ishl', ('iadd', a, '#b'), '#c'), ('iadd', ('ishl', a, c), ('ishl', b, c))),
  
     # Comparison simplifications
     (('~inot', ('flt', a, b)), ('fge', a, b)),
@@ -326,6 +356,14 @@ optimizations = [
     (('~fge', ('fadd', a, b), a), ('fge', b, 0.0)),
     (('~feq', ('fadd', a, b), a), ('feq', b, 0.0)),
     (('~fne', ('fadd', a, b), a), ('fne', b, 0.0)),
+   (('~flt',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('flt', a, ('fadd', c, ('fneg', b)))),
+   (('~flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('flt', ('fneg', ('fadd', c, b)), a)),
+   (('~fge',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('fge', a, ('fadd', c, ('fneg', b)))),
+   (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fge', ('fneg', ('fadd', c, b)), a)),
+   (('~feq',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('feq', a, ('fadd', c, ('fneg', b)))),
+   (('~feq', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('feq', ('fneg', ('fadd', c, b)), a)),
+   (('~fne',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('fne', a, ('fadd', c, ('fneg', b)))),
+   (('~fne', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fne', ('fneg', ('fadd', c, b)), a)),
  
     # Cannot remove the addition from ilt or ige due to overflow.
     (('ieq', ('iadd', a, b), a), ('ieq', b, 0)),
@@ -756,8 +794,6 @@ optimizations = [
     (('f2u', ('ftrunc', a)), ('f2u', a)),
     (('i2b', ('ineg', a)), ('i2b', a)),
     (('i2b', ('iabs', a)), ('i2b', a)),
-   (('fabs', ('b2f', a)), ('b2f', a)),
-   (('iabs', ('b2i', a)), ('b2i', a)),
     (('inot', ('f2b1', a)), ('feq', a, 0.0)),
  
     # Ironically, mark these as imprecise because removing the conversions may
@@ -768,6 +804,15 @@ optimizations = [
     (('~f2u32', ('i2f', 'a@32')), a),
     (('~f2u32', ('u2f', 'a@32')), a),
  
+   (('ffloor', 'a(is_integral)'), a),
+   (('fceil', 'a(is_integral)'), a),
+   (('ftrunc', 'a(is_integral)'), a),
+   # fract(x) = x - floor(x), so fract(NaN) = NaN
+   (('~ffract', 'a(is_integral)'), 0.0),
+   (('fabs', 'a(is_not_negative)'), a),
+   (('iabs', 'a(is_not_negative)'), a),
+   (('fsat', 'a(is_not_positive)'), 0.0),
+
     # Section 5.4.1 (Conversion and Scalar Constructors) of the GLSL 4.60 spec
     # says:
     #
@@ -781,7 +826,7 @@ optimizations = [
     (('ilt', ('f2u', a), b), ('ilt', ('f2i', a), b)),
     (('ilt', b, ('f2u', a)), ('ilt', b, ('f2i', a))),
  
-   (('~fmin', ('fabs', a), 1.0), ('fsat', ('fabs', a)), '!options->lower_fsat'),
+   (('~fmin', 'a(is_not_negative)', 1.0), ('fsat', a), '!options->lower_fsat'),
  
     # The result of the multiply must be in [-1, 0], so the result of the ffma
     # must be in [0, 1].
@@ -790,6 +835,37 @@ optimizations = [
     (('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)),
     (('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)),
  
+   (('fne', 'a(is_not_zero)', 0.0), True),
+   (('feq', 'a(is_not_zero)', 0.0), False),
+
+   # The results expecting true must be marked imprecise.  The results
+   # expecting false are fine because NaN compared >= or < anything is false.
+
+   (('~fge', 'a(is_not_negative)', 'b(is_not_positive)'), True),
+   (('fge',  'b(is_not_positive)', 'a(is_gt_zero)'),      False),
+   (('fge',  'a(is_lt_zero)',      'b(is_not_negative)'), False),
+   (('~fge', 'b(is_not_negative)', 'a(is_not_positive)'), True),
+
+   (('flt',  'a(is_not_negative)', 'b(is_not_positive)'), False),
+   (('~flt', 'b(is_not_positive)', 'a(is_gt_zero)'),      True),
+   (('~flt', 'a(is_lt_zero)',      'b(is_not_negative)'), True),
+   (('flt',  'b(is_not_negative)', 'a(is_not_positive)'), False),
+
+   (('ine', 'a(is_not_zero)', 0), True),
+   (('ieq', 'a(is_not_zero)', 0), False),
+
+   (('ige', 'a(is_not_negative)', 'b(is_not_positive)'), True),
+   (('ige', 'b(is_not_positive)', 'a(is_gt_zero)'),      False),
+   (('ige', 'a(is_lt_zero)',      'b(is_not_negative)'), False),
+   (('ige', 'b(is_not_negative)', 'a(is_not_positive)'), True),
+
+   (('ilt', 'a(is_not_negative)', 'b(is_not_positive)'), False),
+   (('ilt', 'b(is_not_positive)', 'a(is_gt_zero)'),      True),
+   (('ilt', 'a(is_lt_zero)',      'b(is_not_negative)'), True),
+   (('ilt', 'b(is_not_negative)', 'a(is_not_positive)'), False),
+
+   (('ult', 0, 'a(is_gt_zero)'), True),
+
     # Packing and then unpacking does nothing
     (('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a),
     (('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b),
@@ -811,8 +887,16 @@ optimizations = [
     (('ishr', 'a@16',  8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
     (('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'),
     (('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'),
-   (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte')
-]
+   (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'),
+
+   # Useless masking before unpacking
+   (('unpack_half_2x16_split_x', ('iand', a, 0xffff)), ('unpack_half_2x16_split_x', a)),
+   (('unpack_32_2x16_split_x', ('iand', a, 0xffff)), ('unpack_32_2x16_split_x', a)),
+   (('unpack_64_2x32_split_x', ('iand', a, 0xffffffff)), ('unpack_64_2x32_split_x', a)),
+   (('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)),
+   (('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)),
+   (('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)),
+])
  
  # After the ('extract_u8', a, 0) pattern, above, triggers, there will be
  # patterns like those below.
@@ -841,6 +925,8 @@ optimizations.extend([
     # Subtracts
     (('~fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)),
     (('isub', a, ('isub', 0, b)), ('iadd', a, b)),
+   (('isub', ('iadd', a, b), b), a),
+   (('~fsub', ('fadd', a, b), b), a),
     (('ussub_4x8', a, 0), a),
     (('ussub_4x8', a, ~0), 0),
     (('fsub', a, b), ('fadd', a, ('fneg', b)), 'options->lower_sub'),
@@ -1237,7 +1323,7 @@ def bitfield_reverse(u):
  
      return step5
  
-optimizations += [(bitfield_reverse('x@32'), ('bitfield_reverse', 'x'))]
+optimizations += [(bitfield_reverse('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')]
  
  # For any float comparison operation, "cmp", if you have "a == a && a cmp b"
  # then the "a == a" is redundant because it's equivalent to "a is not NaN"