From 0d96cbe4a5e0a39c17db007f3815868c6a766382 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Jos=C3=A9=20Fonseca?= Date: Sat, 21 Aug 2010 21:58:22 +0100 Subject: [PATCH] gallivm: Emit DIVPS instead of RCPPS. See comments for detailed rationale. Thanks to Michal Krol and Zack Rusin for detecting and investigating this in detail. --- src/gallium/auxiliary/gallivm/lp_bld_arit.c | 36 ++++++++++++++------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index 7b35dd4bb49..bb30e6e9dfb 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -59,14 +59,6 @@ #include "lp_bld_arit.h" -/* - * XXX: Increasing eliminates some artifacts, but adds others, most - * noticeably corruption in the Earth halo in Google Earth. - */ -#define RCP_NEWTON_STEPS 0 - -#define RSQRT_NEWTON_STEPS 0 - #define EXP_POLY_DEGREE 3 #define LOG_POLY_DEGREE 5 @@ -1266,6 +1258,11 @@ lp_build_sqrt(struct lp_build_context *bld, * * x_{i+1} = x_i * (2 - a * x_i) * + * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or + * +/-Inf, giving NaN instead. Certain applications rely on this behavior, + * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's + * halo. It would be necessary to clamp the argument to prevent this. + * * See also: * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division * - http://softwarecommunity.intel.com/articles/eng/1818.htm @@ -1306,13 +1303,27 @@ lp_build_rcp(struct lp_build_context *bld, if(LLVMIsConstant(a)) return LLVMConstFDiv(bld->one, a); - if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { + /* + * We don't use RCPPS because: + * - it only has 10bits of precision + * - it doesn't even get the reciprocate of 1.0 exactly + * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf + * - for recent processors the benefit over DIVPS is marginal, a case + * depedent + * + * We could still use it on certain processors if benchmarks show that the + * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for + * particular uses that require less workarounds. + */ + + if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { + const unsigned num_iterations = 0; LLVMValueRef res; unsigned i; res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a); - for (i = 0; i < RCP_NEWTON_STEPS; ++i) { + for (i = 0; i < num_iterations; ++i) { res = lp_build_rcp_refine(bld, a, res); } @@ -1363,13 +1374,14 @@ lp_build_rsqrt(struct lp_build_context *bld, assert(type.floating); - if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { + if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { + const unsigned num_iterations = 0; LLVMValueRef res; unsigned i; res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a); - for (i = 0; i < RSQRT_NEWTON_STEPS; ++i) { + for (i = 0; i < num_iterations; ++i) { res = lp_build_rsqrt_refine(bld, a, res); } -- 2.30.2