from collections import OrderedDict
import nir_algebraic
+from nir_opcodes import type_sizes
import itertools
# Convenience variables
b = 'b'
c = 'c'
d = 'd'
+e = 'e'
# Written in the form (<search>, <replace>) where <search> is an expression
# and <replace> is either an expression or a value. An expression is
# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>)
# where each source is either an expression or a value. A value can be
# either a numeric constant or a string representing a variable name.
#
# All expression types can have a bit-size specified. For opcodes, this
# looks like "op@32", for variables it is "a@32" or "a@uint32" to specify a
-# type and size, and for literals, you can write "2.0@32". In the search half
-# of the expression this indicates that it should only match that particular
-# bit-size. In the replace half of the expression this indicates that the
-# constructed value should have that bit-size.
+# type and size. In the search half of the expression this indicates that it
+# should only match that particular bit-size. In the replace half of the
+# expression this indicates that the constructed value should have that
+# bit-size.
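+#
+# For example, under the syntax above, a search pattern such as
+# ('fadd@32', a, b) should match only 32-bit fadd instructions, and a
+# variable written 'a@16' should match only 16-bit values.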
optimizations = [
- (('imul', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b))),
- (('imul', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b))))),
+ (('imul', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitshift'),
+ (('imul', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitshift'),
+ (('ishl', a, '#b@32'), ('imul', a, ('ishl', 1, b)), 'options->lower_bitshift'),
+
+ (('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
+ (('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
+ (('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'),
+ (('umul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('umul_high', a, b)), 'options->lower_mul_2x32_64'),
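+ # In the two lowerings above, the full 32x32->64 product is assembled from
+ # its low 32 bits (imul) and its high 32 bits (imul_high/umul_high) via
+ # pack_64_2x32_split.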
(('udiv', a, 1), a),
(('idiv', a, 1), a),
(('umod', a, 1), 0),
(('imod', a, 1), 0),
- (('udiv', a, '#b@32(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b))),
+ (('udiv', a, '#b@32(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b)), '!options->lower_bitshift'),
(('idiv', a, '#b@32(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), 'options->lower_idiv'),
(('idiv', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), 'options->lower_idiv'),
(('umod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1))),
(('ineg', ('ineg', a)), a),
(('fabs', ('fabs', a)), ('fabs', a)),
(('fabs', ('fneg', a)), ('fabs', a)),
- (('fabs', ('u2f32', a)), ('u2f32', a)),
+ (('fabs', ('u2f', a)), ('u2f', a)),
(('iabs', ('iabs', a)), ('iabs', a)),
(('iabs', ('ineg', a)), ('iabs', a)),
+ (('f2b', ('fneg', a)), ('f2b', a)),
+ (('i2b', ('ineg', a)), ('i2b', a)),
(('~fadd', a, 0.0), a),
(('iadd', a, 0), a),
(('usadd_4x8', a, 0), a),
(('iadd', a, ('iadd', ('ineg', a), b)), b),
(('~fadd', ('fneg', a), ('fadd', a, b)), b),
(('~fadd', a, ('fadd', ('fneg', a), b)), b),
+ (('fadd', ('fsat', a), ('fsat', ('fneg', a))), ('fsat', ('fabs', a))),
(('~fmul', a, 0.0), 0.0),
(('imul', a, 0), 0),
(('umul_unorm_4x8', a, 0), 0),
(('imul', a, 1), a),
(('fmul', a, -1.0), ('fneg', a)),
(('imul', a, -1), ('ineg', a)),
+ # If a < 0: fsign(a)*a*a => -1*a*a => -a*a => abs(a)*a
+ # If a > 0: fsign(a)*a*a => 1*a*a => a*a => abs(a)*a
+ # If a == 0: fsign(a)*a*a => 0*0*0 => abs(0)*0
+ (('fmul', ('fsign', a), ('fmul', a, a)), ('fmul', ('fabs', a), a)),
+ (('fmul', ('fmul', ('fsign', a), a), a), ('fmul', ('fabs', a), a)),
(('~ffma', 0.0, a, b), b),
- (('~ffma', a, 0.0, b), b),
(('~ffma', a, b, 0.0), ('fmul', a, b)),
- (('ffma', a, 1.0, b), ('fadd', a, b)),
(('ffma', 1.0, a, b), ('fadd', a, b)),
+ (('ffma', -1.0, a, b), ('fadd', ('fneg', a), b)),
(('~flrp', a, b, 0.0), a),
(('~flrp', a, b, 1.0), b),
(('~flrp', a, a, b), a),
(('~flrp', 0.0, a, b), ('fmul', a, b)),
- (('~flrp', a, b, ('b2f', c)), ('bcsel', c, b, a), 'options->lower_flrp32'),
+
+ # flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c)
+ (('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)),
+ (('~flrp@32', a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp32'),
+ (('~flrp@64', a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp64'),
+
+ (('~flrp@32', ('fadd', a, b), ('fadd', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp32'),
+ (('~flrp@64', ('fadd', a, b), ('fadd', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp64'),
+
+ (('~flrp@32', a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp32'),
+ (('~flrp@64', a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp64'),
+
+ (('~flrp', ('fmul(is_used_once)', a, b), ('fmul(is_used_once)', a, c), d), ('fmul', ('flrp', b, c, d), a)),
+
+ (('~flrp', a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp32'),
(('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)),
- (('flrp@32', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp32'),
- (('flrp@64', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp64'),
+ (('ftrunc', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),
+ (('ffloor', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
+ (('fadd', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
(('ffract', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
- (('~fadd', ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', c)))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp32'),
- (('~fadd@32', ('fmul', a, ('fadd', 1.0, ('fneg', c ))), ('fmul', b, c )), ('flrp', a, b, c), '!options->lower_flrp32'),
- (('~fadd@64', ('fmul', a, ('fadd', 1.0, ('fneg', c ))), ('fmul', b, c )), ('flrp', a, b, c), '!options->lower_flrp64'),
- (('~fadd', a, ('fmul', ('b2f', c), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp32'),
+ (('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'),
+ (('~fadd', ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1')))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp32'),
+ (('~fadd@32', ('fmul', a, ('fadd', 1.0, ('fneg', c ) )), ('fmul', b, c )), ('flrp', a, b, c), '!options->lower_flrp32'),
+ (('~fadd@64', ('fmul', a, ('fadd', 1.0, ('fneg', c ) )), ('fmul', b, c )), ('flrp', a, b, c), '!options->lower_flrp64'),
+ # These are the same as the previous three rules, but they depend on
+ # 1-fsat(x) <=> fsat(1-x). See below.
+ (('~fadd@32', ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg', c )))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp32'),
+ (('~fadd@64', ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg', c )))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp64'),
+
+ (('~fadd', a, ('fmul', ('b2f', 'c@1'), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp32'),
(('~fadd@32', a, ('fmul', c , ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp32'),
(('~fadd@64', a, ('fmul', c , ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp64'),
(('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
(('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma'),
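+ # In the pattern below, ('ineg', ('b2i32', a)) is 0 or ~0 (all bits set),
+ # so the iand keeps fmul(b, c) when a is true and yields 0.0 otherwise;
+ # the expression as a whole is a select between (b*c + d)*e and d*e.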
+ (('~fmul', ('fadd', ('iand', ('ineg', ('b2i32', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
+ ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),
+
(('fdot4', ('vec4', a, b, c, 1.0), d), ('fdph', ('vec3', a, b, c), d)),
(('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),
(('fdot4', ('vec4', a, b, 0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),
(('fdot3', ('vec3', a, 0.0, 0.0), b), ('fmul', a, b)),
(('fdot3', ('vec3', a, b, 0.0), c), ('fdot2', ('vec2', a, b), c)),
+ # If x >= 0 and x <= 1: fsat(1 - x) == 1 - fsat(x) trivially
+ # If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1
+ # If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0
+ (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),
+
# (a * #b + #c) << #d
# ((a * #b) << #d) + (#c << #d)
# (a * (#b << #d)) + (#c << #d)
(('inot', ('ieq', a, b)), ('ine', a, b)),
(('inot', ('ine', a, b)), ('ieq', a, b)),
+ # This helps some shaders because, after some optimizations, they end up
+ # with patterns like (-a < -b) || (b < a). In an ideal world, this sort of
+ # matching would be handled by CSE.
+ (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
+ (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
+ (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
+ (('fne', ('fneg', a), ('fneg', b)), ('fne', b, a)),
+ (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
+ (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
+ (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
+ (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
+ (('fne', ('fneg', a), -1.0), ('fne', 1.0, a)),
+ (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),
+
+ (('flt', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('flt', a, b)),
+ (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),
+ (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),
+ (('fge', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('fge', b, a)),
+ (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),
+ (('fne', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fne', a, b)),
+
+ (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),
+ (('flt', ('fsat(is_used_once)', a), 1.0), ('flt', a, 1.0)),
+ (('fge', 0.0, ('fsat(is_used_once)', a)), ('fge', 0.0, a)),
+ (('flt', 0.0, ('fsat(is_used_once)', a)), ('flt', 0.0, a)),
+
# 0.0 >= b2f(a)
# b2f(a) <= 0.0
# b2f(a) == 0.0 because b2f(a) can only be 0 or 1
# inot(a)
- (('fge', 0.0, ('b2f', a)), ('inot', a)),
-
- (('fge', ('fneg', ('b2f', a)), 0.0), ('inot', a)),
+ (('fge', 0.0, ('b2f', 'a@1')), ('inot', a)),
+
+ (('fge', ('fneg', ('b2f', 'a@1')), 0.0), ('inot', a)),
+
+ (('fne', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),
+ (('fne', ('fmax', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),
+ (('fne', ('bcsel', a, 1.0, ('b2f', 'b@1')) , 0.0), ('ior', a, b)),
+ (('fne', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('ior', a, b)),
+ (('fne', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),
+ (('fne', ('fmin', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),
+ (('fne', ('bcsel', a, ('b2f', 'b@1'), 0.0) , 0.0), ('iand', a, b)),
+ (('fne', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ixor', a, b)),
+ (('fne', ('b2f', 'a@1') , ('b2f', 'b@1') ), ('ixor', a, b)),
+ (('fne', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ixor', a, b)),
+ (('feq', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
+ (('feq', ('fmax', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
+ (('feq', ('bcsel', a, 1.0, ('b2f', 'b@1')) , 0.0), ('inot', ('ior', a, b))),
+ (('feq', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('inot', ('ior', a, b))),
+ (('feq', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))),
+ (('feq', ('fmin', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))),
+ (('feq', ('bcsel', a, ('b2f', 'b@1'), 0.0) , 0.0), ('inot', ('iand', a, b))),
+ (('feq', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ieq', a, b)),
+ (('feq', ('b2f', 'a@1') , ('b2f', 'b@1') ), ('ieq', a, b)),
+ (('feq', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ieq', a, b)),
+
+ # -(b2f(a) + b2f(b)) < 0
+ # 0 < b2f(a) + b2f(b)
+ # 0 != b2f(a) + b2f(b) b2f must be 0 or 1, so the sum is non-negative
+ # a || b
+ (('flt', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('ior', a, b)),
+ (('flt', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('ior', a, b)),
+
+ # -(b2f(a) + b2f(b)) >= 0
+ # 0 >= b2f(a) + b2f(b)
+ # 0 == b2f(a) + b2f(b) b2f must be 0 or 1, so the sum is non-negative
+ # !(a || b)
+ (('fge', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('inot', ('ior', a, b))),
+ (('fge', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('inot', ('ior', a, b))),
+
+ (('flt', a, ('fneg', a)), ('flt', a, 0.0)),
+ (('fge', a, ('fneg', a)), ('fge', a, 0.0)),
+
+ # Some optimizations (below) convert things like (a < b || c < b) into
+ # (min(a, c) < b). However, this interferes with the previous optimizations
+ # that try to remove comparisons with negated sums of b2f. This just
+ # breaks that apart.
+ (('flt', ('fmin', c, ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')))), 0.0),
+ ('ior', ('flt', c, 0.0), ('ior', a, b))),
(('~flt', ('fadd', a, b), a), ('flt', b, 0.0)),
(('~fge', ('fadd', a, b), a), ('fge', b, 0.0)),
# The fge in the second replacement is not a typo. I leave the proof that
# "fmin(-b2f(a), b) >= 0 <=> fmin(-b2f(a), b) == 0" as an exercise for the
# reader.
- (('fge', ('fmin', ('fneg', ('b2f', a)), b), 0.0), ('iand', ('inot', a), ('fge', b, 0.0))),
- (('feq', ('fmin', ('fneg', ('b2f', a)), b), 0.0), ('iand', ('inot', a), ('fge', b, 0.0))),
-
- (('feq', ('b2f', a), 0.0), ('inot', a)),
- (('fne', ('b2f', a), 0.0), a),
- (('ieq', ('b2i', a), 0), ('inot', a)),
- (('ine', ('b2i', a), 0), a),
+ (('fge', ('fmin', ('fneg', ('b2f', 'a@1')), 'b@1'), 0.0), ('iand', ('inot', a), ('fge', b, 0.0))),
+ (('feq', ('fmin', ('fneg', ('b2f', 'a@1')), 'b@1'), 0.0), ('iand', ('inot', a), ('fge', b, 0.0))),
+
+ (('feq', ('b2f', 'a@1'), 0.0), ('inot', a)),
+ (('fne', ('b2f', 'a@1'), 0.0), a),
+ (('ieq', ('b2i', 'a@1'), 0), ('inot', a)),
+ (('ine', ('b2i', 'a@1'), 0), a),
+
+ (('fne', ('u2f', a), 0.0), ('ine', a, 0)),
+ (('feq', ('u2f', a), 0.0), ('ieq', a, 0)),
+ (('fge', ('u2f', a), 0.0), True),
+ (('fge', 0.0, ('u2f', a)), ('uge', 0, a)), # ieq instead?
+ (('flt', ('u2f', a), 0.0), False),
+ (('flt', 0.0, ('u2f', a)), ('ult', 0, a)), # ine instead?
+ (('fne', ('i2f', a), 0.0), ('ine', a, 0)),
+ (('feq', ('i2f', a), 0.0), ('ieq', a, 0)),
+ (('fge', ('i2f', a), 0.0), ('ige', a, 0)),
+ (('fge', 0.0, ('i2f', a)), ('ige', 0, a)),
+ (('flt', ('i2f', a), 0.0), ('ilt', a, 0)),
+ (('flt', 0.0, ('i2f', a)), ('ilt', 0, a)),
# 0.0 < fabs(a)
# fabs(a) > 0.0
# fabs(a) != 0.0 because fabs(a) must be >= 0
# a != 0.0
- (('flt', 0.0, ('fabs', a)), ('fne', a, 0.0)),
+ (('~flt', 0.0, ('fabs', a)), ('fne', a, 0.0)),
+
+ # -fabs(a) < 0.0
+ # fabs(a) > 0.0
+ (('~flt', ('fneg', ('fabs', a)), 0.0), ('fne', a, 0.0)),
- (('fmax', ('b2f(is_used_once)', a), ('b2f', b)), ('b2f', ('ior', a, b))),
- (('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', a)), ('fneg', ('b2f', b))), ('fneg', ('b2f', ('ior', a, b)))),
- (('fmin', ('b2f(is_used_once)', a), ('b2f', b)), ('b2f', ('iand', a, b))),
- (('fmin', ('fneg(is_used_once)', ('b2f(is_used_once)', a)), ('fneg', ('b2f', b))), ('fneg', ('b2f', ('iand', a, b)))),
+ # 0.0 >= fabs(a)
+ # 0.0 == fabs(a) because fabs(a) must be >= 0
+ # 0.0 == a
+ (('fge', 0.0, ('fabs', a)), ('feq', a, 0.0)),
+
+ # -fabs(a) >= 0.0
+ # 0.0 >= fabs(a)
+ (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
+
+ (('fmax', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('ior', a, b))),
+ (('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('ior', a, b)))),
+ (('fmin', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))),
+ (('fmin', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('iand', a, b)))),
# fmin(b2f(a), b)
 # bcsel(a, fmin(b2f(True), b), fmin(b2f(False), b))
 # bcsel(a, fmin(1.0, b), fmin(0.0, b))
 # bcsel(a, fmin(b, 1.0), fmin(b, 0.0))
 #
 # Since b is a constant, constant folding will eliminate the fmins. If b
 # is > 1.0, the bcsel will be replaced with a b2f.
- (('fmin', ('b2f', a), '#b'), ('bcsel', a, ('fmin', b, 1.0), ('fmin', b, 0.0))),
+ (('fmin', ('b2f', 'a@1'), '#b'), ('bcsel', a, ('fmin', b, 1.0), ('fmin', b, 0.0))),
(('flt', ('fadd(is_used_once)', a, ('fneg', b)), 0.0), ('flt', a, b)),
(('~bcsel', ('flt', a, b), b, a), ('fmax', a, b)),
(('~bcsel', ('fge', a, b), b, a), ('fmin', a, b)),
(('~bcsel', ('fge', b, a), b, a), ('fmax', a, b)),
+ (('bcsel', ('i2b', a), b, c), ('bcsel', ('ine', a, 0), b, c)),
(('bcsel', ('inot', a), b, c), ('bcsel', a, c, b)),
(('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)),
- (('bcsel', a, True, 'b@bool'), ('ior', a, b)),
+ (('bcsel', a, b, ('bcsel', a, c, d)), ('bcsel', a, b, d)),
+ (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
+ (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
+ (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
+ (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
+ (('bcsel', a, True, b), ('ior', a, b)),
+ (('bcsel', a, a, b), ('ior', a, b)),
+ (('bcsel', a, b, False), ('iand', a, b)),
+ (('bcsel', a, b, a), ('iand', a, b)),
(('fmin', a, a), a),
(('fmax', a, a), a),
(('imin', a, a), a),
(('imax', a, a), a),
(('umin', a, a), a),
(('umax', a, a), a),
+ (('fmax', ('fmax', a, b), b), ('fmax', a, b)),
+ (('umax', ('umax', a, b), b), ('umax', a, b)),
+ (('imax', ('imax', a, b), b), ('imax', a, b)),
+ (('fmin', ('fmin', a, b), b), ('fmin', a, b)),
+ (('umin', ('umin', a, b), b), ('umin', a, b)),
+ (('imin', ('imin', a, b), b), ('imin', a, b)),
+ (('fmax', a, ('fneg', a)), ('fabs', a)),
+ (('imax', a, ('ineg', a)), ('iabs', a)),
(('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))),
(('imin', a, ('ineg', a)), ('ineg', ('iabs', a))),
(('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))),
(('imax', a, ('iabs', a)), ('iabs', a)),
(('fmax', a, ('fneg', a)), ('fabs', a)),
(('imax', a, ('ineg', a)), ('iabs', a)),
+ (('~fmax', ('fabs', a), 0.0), ('fabs', a)),
(('~fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
(('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
+ (('~fmin', ('fmax', a, -1.0), 0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_negate && !options->lower_fsat'),
+ (('~fmax', ('fmin', a, 0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_negate && !options->lower_fsat'),
+ (('fsat', ('fsign', a)), ('b2f', ('flt', 0.0, a))),
+ (('fsat', ('b2f', a)), ('b2f', a)),
(('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
(('fsat', ('fsat', a)), ('fsat', a)),
+ (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_negate && !options->lower_fsat'),
+ (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_negate && !options->lower_fsat'),
+ (('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'),
(('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),
(('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),
(('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)),
(('iand', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umax', b, c))),
(('iand', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umin', a, b), c)),
+ # Common pattern like 'if (i == 0 || i == 1 || ...)'
+ (('ior', ('ieq', a, 0), ('ieq', a, 1)), ('uge', 1, a)),
+ (('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)),
+ (('ior', ('uge', 2, a), ('ieq', a, 3)), ('uge', 3, a)),
+
+ # The (i2f32, ...) part is an open-coded fsign. When that is combined with
+ # the bcsel, it's basically copysign(1.0, a). There is no copysign in NIR,
+ # so emit an open-coded version of that.
+ (('bcsel@32', ('feq', a, 0.0), 1.0, ('i2f32', ('iadd', ('b2i32', ('flt', 0.0, 'a@32')), ('ineg', ('b2i32', ('flt', 'a@32', 0.0)))))),
+ ('ior', 0x3f800000, ('iand', a, 0x80000000))),
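+ # In the replacement above, 0x3f800000 is the bit pattern of 1.0f, so
+ # OR'ing it with the sign bit of 'a' (a & 0x80000000) produces the bit
+ # pattern of +1.0 or -1.0 directly.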
+
+ (('ior', a, ('ieq', a, False)), True),
+ (('ior', a, ('inot', a)), -1),
+
+ (('ine', ('ineg', ('b2i32', 'a@1')), ('ineg', ('b2i32', 'b@1'))), ('ine', a, b)),
+ (('b2i32', ('ine', 'a@1', 'b@1')), ('b2i32', ('ixor', a, b))),
+
(('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('ior', 'a@32', 'b@32'), 0)),
# These patterns can result when (a < b || a < c) => (a < min(b, c))
(('ult', ('umax', a, b), a), False),
(('uge', a, ('umax', b, a)), ('uge', a, b)),
(('uge', ('umin', a, b), a), ('uge', b, a)),
+ (('ult', a, ('iand', b, a)), False),
+ (('ult', ('ior', a, b), a), False),
+ (('uge', a, ('iand', b, a)), True),
+ (('uge', ('ior', a, b), a), True),
(('ilt', '#a', ('imax', '#b', c)), ('ior', ('ilt', a, b), ('ilt', a, c))),
(('ilt', ('imin', '#a', b), '#c'), ('ior', ('ilt', a, c), ('ilt', b, c))),
(('uge', '#a', ('umax', '#b', c)), ('iand', ('uge', a, b), ('uge', a, c))),
(('uge', ('umin', '#a', b), '#c'), ('iand', ('uge', a, c), ('uge', b, c))),
+ # Thanks to sign extension, the ishr(a, b) is negative if and only if a is
+ # negative.
+ (('bcsel', ('ilt', a, 0), ('ineg', ('ishr', a, b)), ('ishr', a, b)),
+ ('iabs', ('ishr', a, b))),
+ (('iabs', ('ishr', ('iabs', a), b)), ('ishr', ('iabs', a), b)),
+
(('fabs', ('slt', a, b)), ('slt', a, b)),
(('fabs', ('sge', a, b)), ('sge', a, b)),
(('fabs', ('seq', a, b)), ('seq', a, b)),
(('fne', ('fneg', a), a), ('fne', a, 0.0)),
(('feq', ('fneg', a), a), ('feq', a, 0.0)),
# Emulating booleans
- (('imul', ('b2i', a), ('b2i', b)), ('b2i', ('iand', a, b))),
- (('fmul', ('b2f', a), ('b2f', b)), ('b2f', ('iand', a, b))),
- (('fsat', ('fadd', ('b2f', a), ('b2f', b))), ('b2f', ('ior', a, b))),
- (('iand', 'a@bool', 1.0), ('b2f', a), '!options->lower_b2f'),
+ (('imul', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),
+ (('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))),
+ (('fsat', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('b2f', ('ior', a, b))),
+ (('iand', 'a@bool32', 1.0), ('b2f', a)),
# True/False are ~0 and 0 in NIR. b2i of True is 1, and -1 is ~0 (True).
- (('ineg', ('b2i@32', a)), a),
- (('flt', ('fneg', ('b2f', a)), 0), a), # Generated by TGSI KILL_IF.
- (('flt', ('fsub', 0.0, ('b2f', a)), 0), a), # Generated by TGSI KILL_IF.
+ (('ineg', ('b2i32', 'a@32')), a),
+ (('flt', ('fneg', ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF.
+ (('flt', ('fsub', 0.0, ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF.
# Comparison with the same args. Note that these are not done for
# the float versions because NaN always returns false on float
# inequalities.
(('ixor', a, a), 0),
(('ixor', a, 0), a),
(('inot', ('inot', a)), a),
+ (('ior', ('iand', a, b), b), b),
+ (('ior', ('ior', a, b), b), ('ior', a, b)),
+ (('iand', ('ior', a, b), b), b),
+ (('iand', ('iand', a, b), b), ('iand', a, b)),
# DeMorgan's Laws
(('iand', ('inot', a), ('inot', b)), ('inot', ('ior', a, b))),
(('ior', ('inot', a), ('inot', b)), ('inot', ('iand', a, b))),
(('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))),
(('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))),
(('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))),
+ (('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))),
# Division and reciprocal
(('~fdiv', 1.0, a), ('frcp', a)),
(('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
(('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'),
(('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'),
# Boolean simplifications
- (('ieq', 'a@bool', True), a),
- (('ine(is_not_used_by_if)', 'a@bool', True), ('inot', a)),
- (('ine', 'a@bool', False), a),
- (('ieq(is_not_used_by_if)', 'a@bool', False), ('inot', 'a')),
+ (('i2b32(is_used_by_if)', a), ('ine32', a, 0)),
+ (('i2b1(is_used_by_if)', a), ('ine', a, 0)),
+ (('ieq', a, True), a),
+ (('ine(is_not_used_by_if)', a, True), ('inot', a)),
+ (('ine', a, False), a),
+ (('ieq(is_not_used_by_if)', a, False), ('inot', 'a')),
(('bcsel', a, True, False), a),
(('bcsel', a, False, True), ('inot', a)),
(('bcsel@32', a, 1.0, 0.0), ('b2f', a)),
(('bcsel@32', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))),
(('bcsel', True, b, c), b),
(('bcsel', False, b, c), c),
- (('bcsel', a, ('b2f(is_used_once)', b), ('b2f', c)), ('b2f', ('bcsel', a, b, c))),
- # The result of this should be hit by constant propagation and, in the
- # next round of opt_algebraic, get picked up by one of the above two.
- (('bcsel', '#a', b, c), ('bcsel', ('ine', 'a', 0), b, c)),
+ (('bcsel', a, ('b2f(is_used_once)', 'b@32'), ('b2f', 'c@32')), ('b2f', ('bcsel', a, b, c))),
(('bcsel', a, b, b), b),
(('fcsel', a, b, b), b),
+ # D3D Boolean emulation
+ (('bcsel', a, -1, 0), ('ineg', ('b2i', 'a@1'))),
+ (('bcsel', a, 0, -1), ('ineg', ('b2i', ('inot', a)))),
+ (('iand', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
+ ('ineg', ('b2i', ('iand', a, b)))),
+ (('ior', ('ineg', ('b2i','a@1')), ('ineg', ('b2i', 'b@1'))),
+ ('ineg', ('b2i', ('ior', a, b)))),
+ (('ieq', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)),
+ (('ieq', ('ineg', ('b2i', 'a@1')), -1), a),
+ (('ine', ('ineg', ('b2i', 'a@1')), 0), a),
+ (('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)),
+ (('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)),
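+ # The iand-with-1.0 patterns here and in the "Emulating booleans" section
+ # above work because a true value is ~0: AND'ing all-ones with the bit
+ # pattern of 1.0 (0x3f800000) yields exactly 1.0, while AND'ing zero
+ # yields +0.0, which is b2f(a).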
+
+ # SM5 32-bit shifts are defined to use the 5 least significant bits
+ (('ishl', 'a@32', ('iand', 31, b)), ('ishl', a, b)),
+ (('ishr', 'a@32', ('iand', 31, b)), ('ishr', a, b)),
+ (('ushr', 'a@32', ('iand', 31, b)), ('ushr', a, b)),
+
# Conversions
- (('i2b', ('b2i', a)), a),
- (('i2b', 'a@bool'), a),
- (('f2i32', ('ftrunc', a)), ('f2i32', a)),
- (('f2u32', ('ftrunc', a)), ('f2u32', a)),
+ (('i2b32', ('b2i', 'a@32')), a),
+ (('f2i', ('ftrunc', a)), ('f2i', a)),
+ (('f2u', ('ftrunc', a)), ('f2u', a)),
(('i2b', ('ineg', a)), ('i2b', a)),
(('i2b', ('iabs', a)), ('i2b', a)),
(('fabs', ('b2f', a)), ('b2f', a)),
(('iabs', ('b2i', a)), ('b2i', a)),
- (('inot', ('f2b', a)), ('feq', a, 0.0)),
+ (('inot', ('f2b1', a)), ('feq', a, 0.0)),
+
+ # Ironically, mark these as imprecise because removing the conversions may
+ # preserve more precision than doing the conversions (e.g.,
+ # uint(float(0x81818181u)) == 0x81818200).
+ (('~f2i32', ('i2f', 'a@32')), a),
+ (('~f2i32', ('u2f', 'a@32')), a),
+ (('~f2u32', ('i2f', 'a@32')), a),
+ (('~f2u32', ('u2f', 'a@32')), a),
+
+ # Section 5.4.1 (Conversion and Scalar Constructors) of the GLSL 4.60 spec
+ # says:
+ #
+ # It is undefined to convert a negative floating-point value to an
+ # uint.
+ #
+ # Assuming that (uint)some_float behaves like (uint)(int)some_float allows
+ # some optimizations in the i965 backend to proceed.
+ (('ige', ('f2u', a), b), ('ige', ('f2i', a), b)),
+ (('ige', b, ('f2u', a)), ('ige', b, ('f2i', a))),
+ (('ilt', ('f2u', a), b), ('ilt', ('f2i', a), b)),
+ (('ilt', b, ('f2u', a)), ('ilt', b, ('f2i', a))),
+
+ (('~fmin', ('fabs', a), 1.0), ('fsat', ('fabs', a)), '!options->lower_fsat'),
+
+ # The result of the multiply must be in [-1, 0], so the result of the ffma
+ # must be in [0, 1].
+ (('flt', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), False),
+ (('flt', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), False),
+ (('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)),
+ (('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)),
# Packing and then unpacking does nothing
(('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a),
 (('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b),
 (('pack_64_2x32_split', ('unpack_64_2x32_split_x', a),
                         ('unpack_64_2x32_split_y', a)), a),
# Byte extraction
- (('ushr', a, 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),
- (('iand', 0xff, ('ushr', a, 16)), ('extract_u8', a, 2), '!options->lower_extract_byte'),
- (('iand', 0xff, ('ushr', a, 8)), ('extract_u8', a, 1), '!options->lower_extract_byte'),
- (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'),
+ (('ushr', 'a@16', 8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
+ (('ushr', 'a@32', 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),
+ (('ushr', 'a@64', 56), ('extract_u8', a, 7), '!options->lower_extract_byte'),
+ (('ishr', 'a@16', 8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
+ (('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'),
+ (('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'),
+ (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte')
+]
+# After the ('extract_u8', a, 0) pattern, above, triggers, there will be
+# patterns like those below.
+for op in ('ushr', 'ishr'):
+ optimizations.extend([(('extract_u8', (op, 'a@16', 8), 0), ('extract_u8', a, 1))])
+ optimizations.extend([(('extract_u8', (op, 'a@32', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 4)])
+ optimizations.extend([(('extract_u8', (op, 'a@64', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 8)])
+
+optimizations.extend([(('extract_u8', ('extract_u16', a, 1), 0), ('extract_u8', a, 2))])
+
+# After the ('extract_[iu]8', a, 3) patterns, above, trigger, there will be
+# patterns like those below.
+for op in ('extract_u8', 'extract_i8'):
+ optimizations.extend([((op, ('ishl', 'a@16', 8), 1), (op, a, 0))])
+ optimizations.extend([((op, ('ishl', 'a@32', 24 - 8 * i), 3), (op, a, i)) for i in range(2, -1, -1)])
+ optimizations.extend([((op, ('ishl', 'a@64', 56 - 8 * i), 7), (op, a, i)) for i in range(6, -1, -1)])
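+# For example, for op == 'extract_u8' and i == 1 the loop above generates
+# (('extract_u8', ('ishl', 'a@32', 16), 3), ('extract_u8', a, 1)):
+# byte 3 of (a << 16) is byte 1 of a.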
+
+optimizations.extend([
# Word extraction
- (('ushr', a, 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
+ (('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0), '!options->lower_extract_word'),
+ (('ushr', 'a@32', 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
+ (('ishr', ('ishl', 'a@32', 16), 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
+ (('ishr', 'a@32', 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
(('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'),
# Subtracts
(('iabs', ('isub', 0, a)), ('iabs', a)),
# Propagate negation up multiplication chains
- (('fmul', ('fneg', a), b), ('fneg', ('fmul', a, b))),
+ (('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))),
(('imul', ('ineg', a), b), ('ineg', ('imul', a, b))),
# Propagate constants up multiplication chains
# Reassociate constants in add/mul chains so they can be folded together.
# For now, we mostly only handle cases where the constants are separated by
# a single non-constant. We could do better eventually.
- (('~fmul', '#a', ('fmul', b, '#c')), ('fmul', ('fmul', a, c), b)),
- (('imul', '#a', ('imul', b, '#c')), ('imul', ('imul', a, c), b)),
- (('~fadd', '#a', ('fadd', b, '#c')), ('fadd', ('fadd', a, c), b)),
- (('~fadd', '#a', ('fneg', ('fadd', b, '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))),
- (('iadd', '#a', ('iadd', b, '#c')), ('iadd', ('iadd', a, c), b)),
+ (('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)),
+ (('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)),
+ (('~fadd', '#a', ('fadd', 'b(is_not_const)', '#c')), ('fadd', ('fadd', a, c), b)),
+ (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))),
+ (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)),
# By definition...
(('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)),
(('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)),
# Misc. lowering
- (('fmod@32', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod32'),
- (('fmod@64', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod64'),
- (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod32'),
+ (('fmod@16', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
+ (('fmod@32', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
+ (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),
(('uadd_carry@32', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),
(('usub_borrow@32', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'),
 (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
  ('bcsel', ('ilt', 31, 'bits'), 'insert',
('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')),
'options->lower_bitfield_insert'),
+ (('ihadd', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
+ (('uhadd', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
+ (('irhadd', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
+ (('urhadd', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
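+ # The four lowerings above rely on a + b == ((a & b) << 1) + (a ^ b) and
+ # a + b == ((a | b) << 1) - (a ^ b), so the halved sums can be computed
+ # without the intermediate addition overflowing.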
+ (('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_add_sat'),
+ (('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_add_sat'),
# Alternative lowering that doesn't rely on bfi.
(('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
(('unpack_unorm_2x16', 'v'),
('fdiv', ('u2f32', ('vec2', ('extract_u16', 'v', 0),
- ('extract_u16', 'v', 1))),
+ ('extract_u16', 'v', 1))),
65535.0),
'options->lower_unpack_unorm_2x16'),
(('unpack_unorm_4x8', 'v'),
('fdiv', ('u2f32', ('vec4', ('extract_u8', 'v', 0),
- ('extract_u8', 'v', 1),
- ('extract_u8', 'v', 2),
- ('extract_u8', 'v', 3))),
+ ('extract_u8', 'v', 1),
+ ('extract_u8', 'v', 2),
+ ('extract_u8', 'v', 3))),
255.0),
'options->lower_unpack_unorm_4x8'),
(('unpack_snorm_2x16', 'v'),
- ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f32', ('vec2', ('extract_i16', 'v', 0),
- ('extract_i16', 'v', 1))),
+ ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec2', ('extract_i16', 'v', 0),
+ ('extract_i16', 'v', 1))),
32767.0))),
'options->lower_unpack_snorm_2x16'),
(('unpack_snorm_4x8', 'v'),
- ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f32', ('vec4', ('extract_i8', 'v', 0),
- ('extract_i8', 'v', 1),
- ('extract_i8', 'v', 2),
- ('extract_i8', 'v', 3))),
+ ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_i8', 'v', 0),
+ ('extract_i8', 'v', 1),
+ ('extract_i8', 'v', 2),
+ ('extract_i8', 'v', 3))),
127.0))),
'options->lower_unpack_snorm_4x8'),
-]
+
+ (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
+ (('fsign', a), ('fsub', ('b2f', ('flt', 0.0, a)), ('b2f', ('flt', a, 0.0))), 'options->lower_fsign'),
+])
+
+# bit_size dependent lowerings
+for bit_size in [8, 16, 32, 64]:
+ # convenience constants
+ intmax = (1 << (bit_size - 1)) - 1
+ intmin = 1 << (bit_size - 1)
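+   # intmin is written as the unsigned bit pattern 1 << (bit_size - 1); as a
+   # signed constant of this bit size it is the minimum (most negative)
+   # value. The lowerings below detect wraparound by comparing the wrapped
+   # result against 'a' and clamp to intmax or intmin accordingly.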
+
+ optimizations += [
+ (('iadd_sat@' + str(bit_size), a, b),
+ ('bcsel', ('ige', b, 1), ('bcsel', ('ilt', ('iadd', a, b), a), intmax, ('iadd', a, b)),
+ ('bcsel', ('ilt', a, ('iadd', a, b)), intmin, ('iadd', a, b))), 'options->lower_add_sat'),
+ (('isub_sat@' + str(bit_size), a, b),
+ ('bcsel', ('ilt', b, 0), ('bcsel', ('ilt', ('isub', a, b), a), intmax, ('isub', a, b)),
+ ('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_add_sat'),
+ ]
invert = OrderedDict([('feq', 'fne'), ('fne', 'feq'), ('fge', 'flt'), ('flt', 'fge')])
-for left, right in list(itertools.combinations(invert.keys(), 2)) + zip(invert.keys(), invert.keys()):
+for left, right in itertools.combinations_with_replacement(invert.keys(), 2):
optimizations.append((('inot', ('ior(is_used_once)', (left, a, b), (right, c, d))),
('iand', (invert[left], a, b), (invert[right], c, d))))
optimizations.append((('inot', ('iand(is_used_once)', (left, a, b), (right, c, d))),
('ior', (invert[left], a, b), (invert[right], c, d))))
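+# For example, with left == 'feq' and right == 'flt' the loop above adds
+# (('inot', ('ior(is_used_once)', ('feq', a, b), ('flt', c, d))),
+#  ('iand', ('fne', a, b), ('fge', c, d))).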
+# Optimize x2bN(b2x(x)) -> x
+for size in type_sizes('bool'):
+ aN = 'a@' + str(size)
+ f2bN = 'f2b' + str(size)
+ i2bN = 'i2b' + str(size)
+ optimizations.append(((f2bN, ('b2f', aN)), a))
+ optimizations.append(((i2bN, ('b2i', aN)), a))
+
+# Optimize x2yN(b2x(x)) -> b2y
+for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']):
+ if x != 'f' and y != 'f' and x != y:
+ continue
+
+ b2x = 'b2f' if x == 'f' else 'b2i'
+ b2y = 'b2f' if y == 'f' else 'b2i'
+ x2yN = '{}2{}'.format(x, y)
+ optimizations.append(((x2yN, (b2x, a)), (b2y, a)))
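+# For example, with x == 'i' and y == 'f' the loop above adds
+# (('i2f', ('b2i', a)), ('b2f', a)).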
+
+# Optimize away x2xN(a@N)
+for t in ['int', 'uint', 'float']:
+ for N in type_sizes(t):
+ x2xN = '{0}2{0}{1}'.format(t[0], N)
+ aN = 'a@{0}'.format(N)
+ optimizations.append(((x2xN, aN), a))
+
+# Optimize x2xN(y2yM(a@P)) -> y2yN(a) for integers
+# In particular, we can optimize away everything except upcast of downcast and
+# upcasts where the type differs from the other cast
+for N, M in itertools.product(type_sizes('uint'), type_sizes('uint')):
+ if N < M:
+ # The outer cast is a down-cast. It doesn't matter what the size of the
+ # argument of the inner cast is because we'll never been in the upcast
+      # argument of the inner cast is because we'll never be in the upcast
+ # in the end.
+ for x, y in itertools.product(['i', 'u'], ['i', 'u']):
+ x2xN = '{0}2{0}{1}'.format(x, N)
+ y2yM = '{0}2{0}{1}'.format(y, M)
+ y2yN = '{0}2{0}{1}'.format(y, N)
+ optimizations.append(((x2xN, (y2yM, a)), (y2yN, a)))
+ elif N > M:
+ # If the outer cast is an up-cast, we have to be more careful about the
+ # size of the argument of the inner cast and with types. In this case,
+      # the type is always the type of the up-cast, which is given by the
+ # outer cast.
+ for P in type_sizes('uint'):
+ # We can't optimize away up-cast of down-cast.
+ if M < P:
+ continue
+
+            # Because we're doing an up-cast of an up-cast (the inner cast is
+            # an up-cast or a no-op once M < P is excluded), the types always
+            # have to match between the two casts.
+ for x in ['i', 'u']:
+ x2xN = '{0}2{0}{1}'.format(x, N)
+ x2xM = '{0}2{0}{1}'.format(x, M)
+ aP = 'a@{0}'.format(P)
+ optimizations.append(((x2xN, (x2xM, aP)), (x2xN, a)))
+ else:
+ # The N == M case is handled by other optimizations
+ pass
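+# For example, in the down-cast case (N == 8, M == 32, x == 'u', y == 'i')
+# the loop above adds (('u2u8', ('i2i32', a)), ('i2i8', a)).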
+
def fexp2i(exp, bits):
# We assume that exp is already in the right range.
- if bits == 32:
+ if bits == 16:
+ return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
+ elif bits == 32:
return ('ishl', ('iadd', exp, 127), 23)
elif bits == 64:
return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))
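+   # The returns above construct the IEEE bit pattern of 2^exp directly: the
+   # biased exponent (exp + 15, exp + 127, or exp + 1023) is shifted past the
+   # mantissa bits of the half, the float, or the high dword of the double
+   # (10, 23, and 20 bits respectively); the low dword of the double is zero.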
# handles a range on exp of [-252, 254] which allows you to create any
# value (including denorms if the hardware supports it) and to adjust the
# exponent of any normal value to anything you want.
- if bits == 32:
+ if bits == 16:
+ exp = ('imin', ('imax', exp, -28), 30)
+ elif bits == 32:
exp = ('imin', ('imax', exp, -252), 254)
elif bits == 64:
exp = ('imin', ('imax', exp, -2044), 2046)
return ('fmul', ('fmul', f, pow2_1), pow2_2)
optimizations += [
+ (('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'),
(('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'),
(('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'),
]
((op, ('bcsel(is_used_once)', a, '#b', c), '#d'), ('bcsel', a, (op, b, d), (op, c, d)))
]
+# For derivatives in compute shaders, GLSL_NV_compute_shader_derivatives
+# states:
+#
+# If neither layout qualifier is specified, derivatives in compute shaders
+# return zero, which is consistent with the handling of built-in texture
+# functions like texture() in GLSL 4.50 compute shaders.
+for op in ['fddx', 'fddx_fine', 'fddx_coarse',
+ 'fddy', 'fddy_fine', 'fddy_coarse']:
+ optimizations += [
+ ((op, 'a'), 0.0, 'info->stage == MESA_SHADER_COMPUTE && info->cs.derivative_group == DERIVATIVE_GROUP_NONE')
+]
+
+# Some optimizations for ir3-specific instructions.
+optimizations += [
+ # 'al * bl': If either 'al' or 'bl' is zero, return zero.
+ (('umul_low', '#a(is_lower_half_zero)', 'b'), (0)),
+ # '(ah * bl) << 16 + c': If either 'ah' or 'bl' is zero, return 'c'.
+ (('imadsh_mix16', '#a@32(is_lower_half_zero)', 'b@32', 'c@32'), ('c')),
+ (('imadsh_mix16', 'a@32', '#b@32(is_upper_half_zero)', 'c@32'), ('c')),
+]
+
# This section contains "late" optimizations that should be run before
# creating ffmas and calling regular optimizations for the final time.
# Optimizations should go here if they help code generation and conflict
(('iadd', a, ('iadd', ('ineg', a), b)), b),
(('~fadd', ('fneg', a), ('fadd', a, b)), b),
(('~fadd', a, ('fadd', ('fneg', a), b)), b),
+
+ (('~flrp@32', ('fadd(is_used_once)', a, -1.0), ('fadd(is_used_once)', a, 1.0), d), ('fadd', ('flrp', -1.0, 1.0, d), a)),
+ (('~flrp@32', ('fadd(is_used_once)', a, 1.0), ('fadd(is_used_once)', a, -1.0), d), ('fadd', ('flrp', 1.0, -1.0, d), a)),
+ (('~flrp@32', ('fadd(is_used_once)', a, '#b'), ('fadd(is_used_once)', a, '#c'), d), ('fadd', ('fmul', d, ('fadd', c, ('fneg', b))), ('fadd', a, b))),
]
# This section contains "late" optimizations that should be run after the
(('~feq', ('fadd', a, b), 0.0), ('feq', a, ('fneg', b))),
(('~fne', ('fadd', a, b), 0.0), ('fne', a, ('fneg', b))),
+ # nir_lower_to_source_mods will collapse this, but its existence during the
+ # optimization loop can prevent other optimizations.
+ (('fneg', ('fneg', a)), a),
+
+ # These are duplicated from the main optimizations table. The late
+ # patterns that rearrange expressions like x - .5 < 0 to x < .5 can create
+ # new patterns like these. The patterns that compare with zero are removed
+   # because they are unlikely to be created by anything in
+ # late_optimizations.
+ (('flt', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('flt', a, b)),
+ (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),
+ (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),
+ (('fge', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('fge', b, a)),
+ (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),
+ (('fne', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fne', a, b)),
+
+ (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),
+ (('flt', ('fsat(is_used_once)', a), 1.0), ('flt', a, 1.0)),
+
(('~fge', ('fmin(is_used_once)', ('fadd(is_used_once)', a, b), ('fadd', c, d)), 0.0), ('iand', ('fge', a, ('fneg', b)), ('fge', c, ('fneg', d)))),
+ (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
+ (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
+ (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
+ (('fne', ('fneg', a), ('fneg', b)), ('fne', b, a)),
+ (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
+ (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
+ (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
+ (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
+ (('fne', ('fneg', a), -1.0), ('fne', 1.0, a)),
+ (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),
+
+ (('ior', a, a), a),
+ (('iand', a, a), a),
+
+ (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),
+
(('fdot2', a, b), ('fdot_replicated2', a, b), 'options->fdot_replicates'),
(('fdot3', a, b), ('fdot_replicated3', a, b), 'options->fdot_replicates'),
(('fdot4', a, b), ('fdot_replicated4', a, b), 'options->fdot_replicates'),
(('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'),
- (('b2f(is_used_more_than_once)', ('inot', a)), ('bcsel', a, 0.0, 1.0)),
- (('fneg(is_used_more_than_once)', ('b2f', ('inot', a))), ('bcsel', a, -0.0, -1.0)),
+ (('~flrp@32', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),
+ (('~flrp@64', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),
+
+ (('~fadd@32', 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp32'),
+ (('~fadd@64', 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp64'),
# we do these late so that we don't get in the way of creating ffmas
(('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))),
(('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))),
- # Lowered for backends without a dedicated b2f instruction
- (('b2f@32', a), ('iand', a, 1.0), 'options->lower_b2f'),
+ (('bcsel', a, 0, ('b2f32', ('inot', 'b@bool'))), ('b2f32', ('inot', ('ior', a, b)))),
]
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())