From 7215375c445f533e3962a09b8e3b075880c1382f Mon Sep 17 00:00:00 2001 From: Francisco Jerez Date: Fri, 20 Jan 2017 15:24:30 -0800 Subject: [PATCH] nir/spirv/glsl450: Rewrite atan2 implementation to fix accuracy and handling of zero/infinity. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit See "glsl: Rewrite atan2 implementation to fix accuracy and handling of zero/infinity." for the rationale, but note that the instruction count benefit discussed there is somewhat less important for the SPIRV implementation, because the current code already emitted no control flow instructions -- Still this saves us one hardware instruction per scalar component on Intel SKL hardware. Fixes the following Vulkan CTS tests on Intel hardware: dEQP-VK.glsl.builtin.precision.atan2.highp_compute.scalar dEQP-VK.glsl.builtin.precision.atan2.highp_compute.vec2 dEQP-VK.glsl.builtin.precision.atan2.highp_compute.vec3 dEQP-VK.glsl.builtin.precision.atan2.highp_compute.vec4 dEQP-VK.glsl.builtin.precision.atan2.mediump_compute.vec2 dEQP-VK.glsl.builtin.precision.atan2.mediump_compute.vec4 Note that most of the test-cases above expect IEEE-compliant handling of atan2(±∞, ±∞), which this patch doesn't explicitly handle, so except for the last two the test-cases above weren't expected to pass yet. The reason they do is that the i965 back-end implementation of the NIR fmin and fmax instructions is not quite GLSL-compliant (it complies with IEEE 754 recommendations though), because fmin/fmax of a NaN and a non-NaN argument currently always return the non-NaN argument, which causes atan() to flush NaN to one and return the expected value. The front-end should probably not be relying on this behavior for correctness though because other back-ends are likely to behave differently -- A follow-up patch will handle the atan2(±∞, ±∞) corner cases explicitly. v2: Fix up argument scaling to take into account the range and precision of exotic FP24 hardware. Flip coordinate system for arguments along the vertical line as if they were on the left half-plane in order to avoid division by zero which may give unspecified results on non-GLSL 4.1-capable hardware. Sprinkle in some more comments. Reviewed-by: Ian Romanick --- src/compiler/spirv/vtn_glsl450.c | 77 +++++++++++++++++++++++--------- 1 file changed, 55 insertions(+), 22 deletions(-) diff --git a/src/compiler/spirv/vtn_glsl450.c b/src/compiler/spirv/vtn_glsl450.c index 0d32fddbef4..8509f641317 100644 --- a/src/compiler/spirv/vtn_glsl450.c +++ b/src/compiler/spirv/vtn_glsl450.c @@ -302,28 +302,61 @@ build_atan(nir_builder *b, nir_ssa_def *y_over_x) static nir_ssa_def * build_atan2(nir_builder *b, nir_ssa_def *y, nir_ssa_def *x) { - nir_ssa_def *zero = nir_imm_float(b, 0.0f); - - /* If |x| >= 1.0e-8 * |y|: */ - nir_ssa_def *condition = - nir_fge(b, nir_fabs(b, x), - nir_fmul(b, nir_imm_float(b, 1.0e-8f), nir_fabs(b, y))); - - /* Then...call atan(y/x) and fix it up: */ - nir_ssa_def *atan1 = build_atan(b, nir_fdiv(b, y, x)); - nir_ssa_def *r_then = - nir_bcsel(b, nir_flt(b, x, zero), - nir_fadd(b, atan1, - nir_bcsel(b, nir_fge(b, y, zero), - nir_imm_float(b, M_PIf), - nir_imm_float(b, -M_PIf))), - atan1); - - /* Else... */ - nir_ssa_def *r_else = - nir_fmul(b, nir_fsign(b, y), nir_imm_float(b, M_PI_2f)); - - return nir_bcsel(b, condition, r_then, r_else); + nir_ssa_def *zero = nir_imm_float(b, 0); + nir_ssa_def *one = nir_imm_float(b, 1); + + /* If we're on the left half-plane rotate the coordinates π/2 clock-wise + * for the y=0 discontinuity to end up aligned with the vertical + * discontinuity of atan(s/t) along t=0. This also makes sure that we + * don't attempt to divide by zero along the vertical line, which may give + * unspecified results on non-GLSL 4.1-capable hardware. + */ + nir_ssa_def *flip = nir_fge(b, zero, x); + nir_ssa_def *s = nir_bcsel(b, flip, nir_fabs(b, x), y); + nir_ssa_def *t = nir_bcsel(b, flip, y, nir_fabs(b, x)); + + /* If the magnitude of the denominator exceeds some huge value, scale down + * the arguments in order to prevent the reciprocal operation from flushing + * its result to zero, which would cause precision problems, and for s + * infinite would cause us to return a NaN instead of the correct finite + * value. + * + * If fmin and fmax are respectively the smallest and largest positive + * normalized floating point values representable by the implementation, + * the constants below should be in agreement with: + * + * huge <= 1 / fmin + * scale <= 1 / fmin / fmax (for |t| >= huge) + * + * In addition scale should be a negative power of two in order to avoid + * loss of precision. The values chosen below should work for most usual + * floating point representations with at least the dynamic range of ATI's + * 24-bit representation. + */ + nir_ssa_def *huge = nir_imm_float(b, 1e18f); + nir_ssa_def *scale = nir_bcsel(b, nir_fge(b, nir_fabs(b, t), huge), + nir_imm_float(b, 0.25), one); + nir_ssa_def *rcp_scaled_t = nir_frcp(b, nir_fmul(b, t, scale)); + nir_ssa_def *s_over_t = nir_fmul(b, nir_fmul(b, s, scale), rcp_scaled_t); + + /* Calculate the arctangent and fix up the result if we had flipped the + * coordinate system. + */ + nir_ssa_def *arc = nir_fadd(b, nir_fmul(b, nir_b2f(b, flip), + nir_imm_float(b, M_PI_2f)), + build_atan(b, nir_fabs(b, s_over_t))); + + /* Rather convoluted calculation of the sign of the result. When x < 0 we + * cannot use fsign because we need to be able to distinguish between + * negative and positive zero. We don't use bitwise arithmetic tricks for + * consistency with the GLSL front-end. When x >= 0 rcp_scaled_t will + * always be non-negative so this won't be able to distinguish between + * negative and positive zero, but we don't care because atan2 is + * continuous along the whole positive y = 0 half-line, so it won't affect + * the result significantly. + */ + return nir_bcsel(b, nir_flt(b, nir_fmin(b, y, rcp_scaled_t), zero), + nir_fneg(b, arc), arc); } static nir_ssa_def * -- 2.30.2