gallivm: fix rsqrt failures

author Roland Scheidegger <sroland@vmware.com>

Fri, 21 Sep 2012 15:03:48 +0000 (17:03 +0200)

committer José Fonseca <jfonseca@vmware.com>

Fri, 12 Oct 2012 17:51:18 +0000 (18:51 +0100)
author Roland Scheidegger <sroland@vmware.com>
Fri, 21 Sep 2012 15:03:48 +0000 (17:03 +0200)
committer José Fonseca <jfonseca@vmware.com>
Fri, 12 Oct 2012 17:51:18 +0000 (18:51 +0100)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c

index 7878544436dfaeb93333cb55a7db07a038fd2b04..d23ff0bf996eb327eabe79f06847f47d0e3c5160 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -60,6 +60,7 @@
  #include "lp_bld_debug.h"
  #include "lp_bld_arit.h"
  
+#include "float.h"
  
  #define EXP_POLY_DEGREE 5
  
@@ -1953,8 +1954,7 @@ lp_build_rcp(struct lp_build_context *bld,
   *
   *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
   *
- * See also:
- * - http://softwarecommunity.intel.com/articles/eng/1818.htm
+ * See also Intel 64 and IA-32 Architectures Optimization Manual.
   */
  static INLINE LLVMValueRef
  lp_build_rsqrt_refine(struct lp_build_context *bld,
@@ -1977,7 +1977,8 @@ lp_build_rsqrt_refine(struct lp_build_context *bld,
  
  
  /**
- * Generate 1/sqrt(a)
+ * Generate 1/sqrt(a).
+ * Result is undefined for values < 0, infinity for +0.
   */
  LLVMValueRef
  lp_build_rsqrt(struct lp_build_context *bld,
@@ -1990,8 +1991,11 @@ lp_build_rsqrt(struct lp_build_context *bld,
  
     assert(type.floating);
  
-   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
-        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
+   /*
+    * This should be faster but all denormals will end up as infinity.
+    */
+   if (0 && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) {
        const unsigned num_iterations = 1;
        LLVMValueRef res;
        unsigned i;
@@ -2003,12 +2007,41 @@ lp_build_rsqrt(struct lp_build_context *bld,
        else {
           intrinsic = "llvm.x86.avx.rsqrt.ps.256";
        }
+      if (num_iterations) {
+         /*
+          * Newton-Raphson will result in NaN instead of infinity for zero,
+          * and NaN instead of zero for infinity.
+          * Also, need to ensure rsqrt(1.0) == 1.0.
+          * All numbers smaller than FLT_MIN will result in +infinity
+          * (rsqrtps treats all denormals as zero).
+          */
+         /*
+          * Certain non-C99 compilers don't know INFINITY and might not support
+          * hacks to evaluate it at compile time either.
+          */
+         const unsigned posinf_int = 0x7F800000;
+         LLVMValueRef cmp;
+         LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
+         LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
  
-      res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
+         inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
  
+         res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
+
+         for (i = 0; i < num_iterations; ++i) {
+            res = lp_build_rsqrt_refine(bld, a, res);
+         }
+         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
+         res = lp_build_select(bld, cmp, inf, res);
+         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
+         res = lp_build_select(bld, cmp, bld->zero, res);
+         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
+         res = lp_build_select(bld, cmp, bld->one, res);
+      }
+      else {
+         /* rsqrt(1.0) != 1.0 here */
+         res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
  
-      for (i = 0; i < num_iterations; ++i) {
-         res = lp_build_rsqrt_refine(bld, a, res);
        }
  
        return res;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_arit.c b/src/gallium/drivers/llvmpipe/lp_test_arit.c

index 6e09f7e67b0660e612b24391c91c722120fa8f38..99928b8ab6ea66a8d1c82f647d7367d28b23fec1 100644 (file)
--- a/src/gallium/drivers/llvmpipe/lp_test_arit.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_arit.c
@@ -150,19 +150,42 @@ const float log2_values[] = {
  };
  
  
+static float rcpf(float x)
+{
+   return 1.0/x;
+}
+
+
+const float rcp_values[] = {
+   -0.0, 0.0,
+   -1.0, 1.0,
+   -1e-007, 1e-007,
+   -4.0, 4.0,
+   -1e+035, -100000,
+   100000, 1e+035,
+   5.88e-39f, // denormal
+#if (__STDC_VERSION__ >= 199901L)
+   INFINITY, -INFINITY,
+#endif
+};
+
+
  static float rsqrtf(float x)
  {
-   return 1.0/sqrt(x);
+   return 1.0/(float)sqrt(x);
  }
  
  
  const float rsqrt_values[] = {
-   -1, -1e-007,
-   1e-007, 1,
-   -4, -1,
-   1, 4,
-   -1e+035, -100000,
+   // http://msdn.microsoft.com/en-us/library/windows/desktop/bb147346.aspx
+   0.0, // must yield infinity
+   1.0, // must yield 1.0
+   1e-007, 4.0,
     100000, 1e+035,
+   5.88e-39f, // denormal
+#if (__STDC_VERSION__ >= 199901L)
+   INFINITY,
+#endif
  };
  
  
@@ -231,6 +254,7 @@ unary_tests[] = {
     {"log2", &lp_build_log2, &log2f, log2_values, Elements(log2_values), 20.0 },
     {"exp", &lp_build_exp, &expf, exp2_values, Elements(exp2_values), 18.0 },
     {"log", &lp_build_log, &logf, log2_values, Elements(log2_values), 20.0 },
+   {"rcp", &lp_build_rcp, &rcpf, rcp_values, Elements(rcp_values), 20.0 },
     {"rsqrt", &lp_build_rsqrt, &rsqrtf, rsqrt_values, Elements(rsqrt_values), 20.0 },
     {"sin", &lp_build_sin, &sinf, sincos_values, Elements(sincos_values), 20.0 },
     {"cos", &lp_build_cos, &cosf, sincos_values, Elements(sincos_values), 20.0 },
@@ -330,7 +354,11 @@ test_unary(unsigned verbose, FILE *fp, const struct unary_test_t *test)
           double error, precision;
           bool pass;
  
-         error = fabs(out[i] - ref);
+         if (util_inf_sign(ref) && util_inf_sign(out[i]) == util_inf_sign(ref)) {
+            error = 0;
+         } else {
+            error = fabs(out[i] - ref);
+         }
           precision = error ? -log2(error/fabs(ref)) : FLT_MANT_DIG;
  
           pass = precision >= test->precision;
author	Roland Scheidegger <sroland@vmware.com>
	Fri, 21 Sep 2012 15:03:48 +0000 (17:03 +0200)
committer	José Fonseca <jfonseca@vmware.com>
	Fri, 12 Oct 2012 17:51:18 +0000 (18:51 +0100)
src/gallium/auxiliary/gallivm/lp_bld_arit.c		patch \| blob \| history
src/gallium/drivers/llvmpipe/lp_test_arit.c		patch \| blob \| history