gallivm: better support for fast rsqrt

author Roland Scheidegger <sroland@vmware.com>

Thu, 11 Jul 2013 21:15:44 +0000 (23:15 +0200)

committer Roland Scheidegger <sroland@vmware.com>

Sat, 13 Jul 2013 16:42:17 +0000 (18:42 +0200)
author Roland Scheidegger <sroland@vmware.com>
Thu, 11 Jul 2013 21:15:44 +0000 (23:15 +0200)
committer Roland Scheidegger <sroland@vmware.com>
Sat, 13 Jul 2013 16:42:17 +0000 (18:42 +0200)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c

index c006ac537c1853f98806156640549ebc52d22f8a..7d6fe04f50dc20dbd98e401bf4535d3908fb6228 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -2306,19 +2306,14 @@ lp_build_rsqrt(struct lp_build_context *bld,
     /*
      * This should be faster but all denormals will end up as infinity.
      */
-   if (0 && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
-        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) {
+   if (0 && lp_build_fast_rsqrt_available(type)) {
        const unsigned num_iterations = 1;
        LLVMValueRef res;
        unsigned i;
-      const char *intrinsic = NULL;
  
-      if (type.length == 4) {
-         intrinsic = "llvm.x86.sse.rsqrt.ps";
-      }
-      else {
-         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
-      }
+      /* rsqrt(1.0) != 1.0 here */
+      res = lp_build_fast_rsqrt(bld, a);
+
        if (num_iterations) {
           /*
            * Newton-Raphson will result in NaN instead of infinity for zero,
@@ -2338,8 +2333,6 @@ lp_build_rsqrt(struct lp_build_context *bld,
  
           inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
  
-         res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
-
           for (i = 0; i < num_iterations; ++i) {
              res = lp_build_rsqrt_refine(bld, a, res);
           }
@@ -2350,11 +2343,6 @@ lp_build_rsqrt(struct lp_build_context *bld,
           cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
           res = lp_build_select(bld, cmp, bld->one, res);
        }
-      else {
-         /* rsqrt(1.0) != 1.0 here */
-         res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
-
-      }
  
        return res;
     }
@@ -2362,6 +2350,58 @@ lp_build_rsqrt(struct lp_build_context *bld,
     return lp_build_rcp(bld, lp_build_sqrt(bld, a));
  }
  
+/**
+ * Report whether a fast (inaccurate) rsqrt instruction exists for this type:
+ * SSE with 4 x 32-bit or AVX with 8 x 32-bit elements; type must be floating.
+ * Callers may want to skip lp_build_fast_rsqrt() when this returns false,
+ * e.g. x^0.5 can be computed as fast_rsqrt(x) * x, but without the fast
+ * instruction that becomes sqrt/div/mul, so calling sqrt directly is better.
+ */
+boolean
+lp_build_fast_rsqrt_available(struct lp_type type)
+{
+   assert(type.floating);
+
+   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
+      return true;
+   }
+   return false;
+}
+
+
+/**
+ * Generate a fast approximation of 1/sqrt(a); undefined for values < 0,
+ * infinity for +0. Precision is limited, only ~10 bits guaranteed
+ * (rsqrt(1.0) may not be 1.0, denorms may be flushed to 0). Without a fast
+ * instruction this prints a debug message and emits rcp(sqrt(a)) instead.
+ */
+LLVMValueRef
+lp_build_fast_rsqrt(struct lp_build_context *bld,
+                    LLVMValueRef a)
+{
+   LLVMBuilderRef builder = bld->gallivm->builder;
+   const struct lp_type type = bld->type;
+
+   assert(lp_check_value(type, a));
+
+   if (lp_build_fast_rsqrt_available(type)) {
+      const char *intrinsic = NULL;
+
+      if (type.length == 4) {
+         intrinsic = "llvm.x86.sse.rsqrt.ps";
+      }
+      else {   /* only other case the availability check accepts: AVX, length 8 */
+         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
+      }
+      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
+   }
+   else {
+      debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
+   }
+   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
+}
+
  
  /**
   * Generate sin(a) using SSE2
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h

index 966796c3c4d190608519f8ddda203d85409d6235..920e339cda5cb3e925074245fe079c2487162b0b 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -231,6 +231,13 @@ LLVMValueRef
  lp_build_rsqrt(struct lp_build_context *bld,
                 LLVMValueRef a);
  
+boolean
+lp_build_fast_rsqrt_available(struct lp_type type);
+
+LLVMValueRef
+lp_build_fast_rsqrt(struct lp_build_context *bld,
+                    LLVMValueRef a);
+
  LLVMValueRef
  lp_build_cos(struct lp_build_context *bld,
               LLVMValueRef a);
author	Roland Scheidegger <sroland@vmware.com>
	Thu, 11 Jul 2013 21:15:44 +0000 (23:15 +0200)
committer	Roland Scheidegger <sroland@vmware.com>
	Sat, 13 Jul 2013 16:42:17 +0000 (18:42 +0200)
src/gallium/auxiliary/gallivm/lp_bld_arit.c		patch \| blob \| history
src/gallium/auxiliary/gallivm/lp_bld_arit.h		patch \| blob \| history