From b421c0466d6ec28824b297d0545fca537c13a2b7 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Mon, 2 Mar 2020 18:57:44 -0800 Subject: [PATCH] soft-fp64/flt: Perform checks in a different order MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The change to nir_opt_algebraic cleans up a pattern that was never produced before the rest of this commit was added. Results on the 308 shaders extracted from the fp64 portion of the OpenGL CTS: Tiger Lake and Ice Lake had similar results. (Tiger Lake shown) total instructions in shared programs: 843005 -> 841666 (-0.16%) instructions in affected programs: 460655 -> 459316 (-0.29%) helped: 64 HURT: 17 helped stats (abs) min: 1 max: 72 x̄: 21.72 x̃: 20 helped stats (rel) min: 0.01% max: 28.07% x̄: 12.67% x̃: 16.07% HURT stats (abs) min: 1 max: 7 x̄: 3.00 x̃: 2 HURT stats (rel) min: 0.01% max: 0.04% x̄: 0.02% x̃: 0.02% 95% mean confidence interval for instructions value: -20.87 -12.19 95% mean confidence interval for instructions %-change: -12.35% -7.66% Instructions are helped. total cycles in shared programs: 6944998 -> 6927246 (-0.26%) cycles in affected programs: 3891872 -> 3874120 (-0.46%) helped: 71 HURT: 10 helped stats (abs) min: 2 max: 772 x̄: 254.21 x̃: 156 helped stats (rel) min: <.01% max: 66.44% x̄: 21.72% x̃: 18.40% HURT stats (abs) min: 18 max: 69 x̄: 29.70 x̃: 20 HURT stats (rel) min: 0.02% max: 0.04% x̄: 0.03% x̃: 0.03% 95% mean confidence interval for cycles value: -270.82 -167.50 95% mean confidence interval for cycles %-change: -24.41% -13.65% Cycles are helped. 
Reviewed-by: Matt Turner Part-of: --- src/compiler/glsl/float64.glsl | 72 +++++++++++++++++++++------ src/compiler/nir/nir_opt_algebraic.py | 11 ++++ 2 files changed, 67 insertions(+), 16 deletions(-) diff --git a/src/compiler/glsl/float64.glsl b/src/compiler/glsl/float64.glsl index 7f6d3a86e34..6dd85e5cc57 100644 --- a/src/compiler/glsl/float64.glsl +++ b/src/compiler/glsl/float64.glsl @@ -165,14 +165,14 @@ __extractFloat64Sign(uint64_t a) return unpackUint2x32(a).y & 0x80000000u; } -/* Returns true if the 64-bit value formed by concatenating `a0' and `a1' is less - * than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise, - * returns false. +/* Returns true if the signed 64-bit value formed by concatenating `a0' and + * `a1' is less than the signed 64-bit value formed by concatenating `b0' and + * `b1'. Otherwise, returns false. */ bool -lt64(uint a0, uint a1, uint b0, uint b1) +ilt64(uint a0, uint a1, uint b0, uint b1) { - return (a0 < b0) || ((a0 == b0) && (a1 < b1)); + return (int(a0) < int(b0)) || ((a0 == b0) && (a1 < b1)); } bool @@ -180,12 +180,42 @@ __flt64_nonnan(uint64_t __a, uint64_t __b) { uvec2 a = unpackUint2x32(__a); uvec2 b = unpackUint2x32(__b); - uint aSign = __extractFloat64Sign(__a); - uint bSign = __extractFloat64Sign(__b); - if (aSign != bSign) - return (aSign != 0u) && ((((a.y | b.y)<<1) | a.x | b.x) != 0u); - return mix(lt64(a.y, a.x, b.y, b.x), lt64(b.y, b.x, a.y, a.x), aSign != 0u); + /* IEEE 754 floating point numbers are specifically designed so that, with + * two exceptions, values can be compared by bit-casting to signed integers + * with the same number of bits. + * + * From https://en.wikipedia.org/wiki/IEEE_754-1985#Comparing_floating-point_numbers: + * + * When comparing as 2's-complement integers: If the sign bits differ, + * the negative number precedes the positive number, so 2's complement + * gives the correct result (except that negative zero and positive zero + * should be considered equal). 
If both values are positive, the 2's + * complement comparison again gives the correct result. Otherwise (two + * negative numbers), the correct FP ordering is the opposite of the 2's + * complement ordering. + * + * The logic implied by the above quotation is: + * + * !both_are_zero(a, b) && (both_negative(a, b) ? a > b : a < b) + * + * This is equivalent to + * + * fne(a, b) && (both_negative(a, b) ? a >= b : a < b) + * + * fne(a, b) && (both_negative(a, b) ? !(a < b) : a < b) + * + * fne(a, b) && ((both_negative(a, b) && !(a < b)) || + * (!both_negative(a, b) && (a < b))) + * + * (A && !B) || (!A && B) is (A xor B) which is implemented here using !=. + * + * fne(a, b) && (both_negative(a, b) != (a < b)) + */ + bool lt = ilt64(a.y, a.x, b.y, b.x); + bool both_negative = (a.y & b.y & 0x80000000u) != 0; + + return !__feq64_nonnan(__a, __b) && (lt != both_negative); } /* Returns true if the double-precision floating-point value `a' is less than @@ -195,10 +225,15 @@ __flt64_nonnan(uint64_t __a, uint64_t __b) bool __flt64(uint64_t a, uint64_t b) { - if (__is_nan(a) || __is_nan(b)) - return false; + /* This weird layout matters. Doing the "obvious" thing results in extra + * flow control being inserted to implement the short-circuit evaluation + * rules. Flow control is bad! + */ + bool x = !__is_nan(a); + bool y = !__is_nan(b); + bool z = __flt64_nonnan(a, b); - return __flt64_nonnan(a, b); + return (x && y && z); } /* Returns true if the double-precision floating-point value `a' is greater @@ -209,10 +244,15 @@ __flt64(uint64_t a, uint64_t b) bool __fge64(uint64_t a, uint64_t b) { - if (__is_nan(a) || __is_nan(b)) - return false; + /* This weird layout matters. Doing the "obvious" thing results in extra + * flow control being inserted to implement the short-circuit evaluation + * rules. Flow control is bad! 
+ */ + bool x = !__is_nan(a); + bool y = !__is_nan(b); + bool z = !__flt64_nonnan(a, b); - return !__flt64_nonnan(a, b); + return (x && y && z); } uint64_t diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index 7d9775950a4..3302cd8d9e1 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -629,6 +629,17 @@ optimizations.extend([ (('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('ior', a, b), 0), '!options->lower_bitops'), (('ior', ('ine', 'a@32', 0), ('ine', 'b@32', 0)), ('ine', ('ior', a, b), 0), '!options->lower_bitops'), + # This pattern occurs courtesy of __flt64_nonnan in the soft-fp64 code. + # The first part of the iand comes from the !__feq64_nonnan. + # + # The second pattern is a reformulation of the first based on the relation + # (a == 0 || y == 0) <=> umin(a, y) == 0, where b in the first equation + # happens to be y == 0. + (('iand', ('inot', ('iand', ('ior', ('ieq', a, 0), b), c)), ('ilt', a, 0)), + ('iand', ('inot', ('iand', b , c)), ('ilt', a, 0))), + (('iand', ('inot', ('iand', ('ieq', ('umin', a, b), 0), c)), ('ilt', a, 0)), + ('iand', ('inot', ('iand', ('ieq', b , 0), c)), ('ilt', a, 0))), + # These patterns can result when (a < b || a < c) => (a < min(b, c)) # transformations occur before constant propagation and loop-unrolling. (('~flt', a, ('fmax', b, a)), ('flt', a, b)), -- 2.30.2