gallivm: Use a faster (and less accurate) log2 in lod computation.

[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c

index 7b35dd4bb49cead48526a54f198c3404d9f0c248..ff94f498acf7180776b88bd90666d12f21d71184 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -56,17 +56,10 @@
  #include "lp_bld_intr.h"
  #include "lp_bld_logic.h"
  #include "lp_bld_pack.h"
+#include "lp_bld_debug.h"
  #include "lp_bld_arit.h"
  
  
-/*
- * XXX: Increasing eliminates some artifacts, but adds others, most
- * noticeably corruption in the Earth halo in Google Earth.
- */
-#define RCP_NEWTON_STEPS 0
-
-#define RSQRT_NEWTON_STEPS 0
-
  #define EXP_POLY_DEGREE 3
  
  #define LOG_POLY_DEGREE 5
@@ -267,7 +260,7 @@ lp_build_add(struct lp_build_context *bld,
  }
  
  
-/** Return the sum of the elements of a */
+/** Return the scalar sum of the elements of a */
  LLVMValueRef
  lp_build_sum_vector(struct lp_build_context *bld,
                      LLVMValueRef a)
@@ -278,11 +271,9 @@ lp_build_sum_vector(struct lp_build_context *bld,
  
     assert(lp_check_value(type, a));
  
-   if (a == bld->zero)
-      return bld->zero;
-   if (a == bld->undef)
-      return bld->undef;
-   assert(type.length > 1);
+   if (type.length == 1) {
+      return a;
+   }
  
     assert(!bld->type.norm);
  
@@ -546,7 +537,7 @@ lp_build_mul_imm(struct lp_build_context *bld,
     if(b == 2 && bld->type.floating)
        return lp_build_add(bld, a, a);
  
-   if(util_is_pot(b)) {
+   if(util_is_power_of_two(b)) {
        unsigned shift = ffs(b) - 1;
  
        if(bld->type.floating) {
@@ -623,17 +614,15 @@ lp_build_div(struct lp_build_context *bld,
  
  
  /**
- * Linear interpolation.
- *
- * This also works for integer values with a few caveats.
+ * Linear interpolation -- without any checks.
   *
   * @sa http://www.stereopsis.com/doubleblend.html
   */
-LLVMValueRef
-lp_build_lerp(struct lp_build_context *bld,
-              LLVMValueRef x,
-              LLVMValueRef v0,
-              LLVMValueRef v1)
+static INLINE LLVMValueRef
+lp_build_lerp_simple(struct lp_build_context *bld,
+                     LLVMValueRef x,
+                     LLVMValueRef v0,
+                     LLVMValueRef v1)
  {
     LLVMValueRef delta;
     LLVMValueRef res;
@@ -648,12 +637,80 @@ lp_build_lerp(struct lp_build_context *bld,
  
     res = lp_build_add(bld, v0, res);
  
-   if(bld->type.fixed)
+   if (bld->type.fixed) {
        /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
         * but it will be wrong for other uses. Basically we need a more
         * powerful lp_type, capable of further distinguishing the values
         * interpretation from the value storage. */
        res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
+   }
+
+   return res;
+}
+
+
+/**
+ * Linear interpolation.
+ */
+LLVMValueRef
+lp_build_lerp(struct lp_build_context *bld,
+              LLVMValueRef x,
+              LLVMValueRef v0,
+              LLVMValueRef v1)
+{
+   const struct lp_type type = bld->type;
+   LLVMValueRef res;
+
+   assert(lp_check_value(type, x));
+   assert(lp_check_value(type, v0));
+   assert(lp_check_value(type, v1));
+
+   if (type.norm) {
+      struct lp_type wide_type;
+      struct lp_build_context wide_bld;
+      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
+      LLVMValueRef shift;
+
+      assert(type.length >= 2);
+      assert(!type.sign);
+
+      /*
+       * Create a wider type, enough to hold the intermediate result of the
+       * multiplication.
+       */
+      memset(&wide_type, 0, sizeof wide_type);
+      wide_type.fixed  = TRUE;
+      wide_type.width  = type.width*2;
+      wide_type.length = type.length/2;
+
+      lp_build_context_init(&wide_bld, bld->builder, wide_type);
+
+      lp_build_unpack2(bld->builder, type, wide_type, x,  &xl,  &xh);
+      lp_build_unpack2(bld->builder, type, wide_type, v0, &v0l, &v0h);
+      lp_build_unpack2(bld->builder, type, wide_type, v1, &v1l, &v1h);
+
+      /*
+       * Scale x from [0, 255] to [0, 256]
+       */
+
+      shift = lp_build_const_int_vec(wide_type, type.width - 1);
+
+      xl = lp_build_add(&wide_bld, xl,
+                        LLVMBuildAShr(bld->builder, xl, shift, ""));
+      xh = lp_build_add(&wide_bld, xh,
+                        LLVMBuildAShr(bld->builder, xh, shift, ""));
+
+      /*
+       * Lerp both halves.
+       */
+
+      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l);
+      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h);
+
+      res = lp_build_pack2(bld->builder, wide_type, type, resl, resh);
+   } else {
+      res = lp_build_lerp_simple(bld, x, v0, v1);
+   }
  
     return res;
  }
@@ -932,28 +989,67 @@ lp_build_round_sse41(struct lp_build_context *bld,
                       enum lp_build_round_sse41_mode mode)
  {
     const struct lp_type type = bld->type;
-   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   LLVMTypeRef i32t = LLVMInt32Type();
     const char *intrinsic;
+   LLVMValueRef res;
  
     assert(type.floating);
-   assert(type.width*type.length == 128);
+
     assert(lp_check_value(type, a));
     assert(util_cpu_caps.has_sse4_1);
  
-   switch(type.width) {
-   case 32:
-      intrinsic = "llvm.x86.sse41.round.ps";
-      break;
-   case 64:
-      intrinsic = "llvm.x86.sse41.round.pd";
-      break;
-   default:
-      assert(0);
-      return bld->undef;
+   if (type.length == 1) {
+      LLVMTypeRef vec_type;
+      LLVMValueRef undef;
+      LLVMValueRef args[3];
+      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
+
+      switch(type.width) {
+      case 32:
+         intrinsic = "llvm.x86.sse41.round.ss";
+         break;
+      case 64:
+         intrinsic = "llvm.x86.sse41.round.sd";
+         break;
+      default:
+         assert(0);
+         return bld->undef;
+      }
+
+      vec_type = LLVMVectorType(bld->elem_type, 4);
+
+      undef = LLVMGetUndef(vec_type);
+
+      args[0] = undef;
+      args[1] = LLVMBuildInsertElement(bld->builder, undef, a, index0, "");
+      args[2] = LLVMConstInt(i32t, mode, 0);
+
+      res = lp_build_intrinsic(bld->builder, intrinsic,
+                               vec_type, args, Elements(args));
+
+      res = LLVMBuildExtractElement(bld->builder, res, index0, "");
+   }
+   else {
+      assert(type.width*type.length == 128);
+
+      switch(type.width) {
+      case 32:
+         intrinsic = "llvm.x86.sse41.round.ps";
+         break;
+      case 64:
+         intrinsic = "llvm.x86.sse41.round.pd";
+         break;
+      default:
+         assert(0);
+         return bld->undef;
+      }
+
+      res = lp_build_intrinsic_binary(bld->builder, intrinsic,
+                                      bld->vec_type, a,
+                                      LLVMConstInt(i32t, mode, 0));
     }
  
-   return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
-                                    LLVMConstInt(LLVMInt32Type(), mode, 0));
+   return res;
  }
  
  
@@ -971,8 +1067,10 @@ lp_build_trunc(struct lp_build_context *bld,
     assert(type.floating);
     assert(lp_check_value(type, a));
  
-   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
+   if (util_cpu_caps.has_sse4_1 &&
+       (type.length == 1 || type.width*type.length == 128)) {
        return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
+   }
     else {
        LLVMTypeRef vec_type = lp_build_vec_type(type);
        LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
@@ -999,8 +1097,10 @@ lp_build_round(struct lp_build_context *bld,
     assert(type.floating);
     assert(lp_check_value(type, a));
  
-   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
+   if (util_cpu_caps.has_sse4_1 &&
+       (type.length == 1 || type.width*type.length == 128)) {
        return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
+   }
     else {
        LLVMTypeRef vec_type = lp_build_vec_type(type);
        LLVMValueRef res;
@@ -1025,8 +1125,10 @@ lp_build_floor(struct lp_build_context *bld,
     assert(type.floating);
     assert(lp_check_value(type, a));
  
-   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
+   if (util_cpu_caps.has_sse4_1 &&
+       (type.length == 1 || type.width*type.length == 128)) {
        return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
+   }
     else {
        LLVMTypeRef vec_type = lp_build_vec_type(type);
        LLVMValueRef res;
@@ -1051,8 +1153,10 @@ lp_build_ceil(struct lp_build_context *bld,
     assert(type.floating);
     assert(lp_check_value(type, a));
  
-   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
+   if (util_cpu_caps.has_sse4_1 &&
+       (type.length == 1 || type.width*type.length == 128)) {
        return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
+   }
     else {
        LLVMTypeRef vec_type = lp_build_vec_type(type);
        LLVMValueRef res;
@@ -1106,31 +1210,36 @@ lp_build_iround(struct lp_build_context *bld,
                  LLVMValueRef a)
  {
     const struct lp_type type = bld->type;
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMTypeRef int_vec_type = bld->int_vec_type;
     LLVMValueRef res;
  
     assert(type.floating);
  
     assert(lp_check_value(type, a));
  
-   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
+   if (util_cpu_caps.has_sse4_1 &&
+       (type.length == 1 || type.width*type.length == 128)) {
        res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
     }
     else {
-      LLVMTypeRef vec_type = lp_build_vec_type(type);
-      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
-      LLVMValueRef sign;
        LLVMValueRef half;
  
-      /* get sign bit */
-      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
-      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
-
-      /* sign * 0.5 */
        half = lp_build_const_vec(type, 0.5);
-      half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
-      half = LLVMBuildOr(bld->builder, sign, half, "");
-      half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
+
+      if (type.sign) {
+         LLVMTypeRef vec_type = bld->vec_type;
+         LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
+         LLVMValueRef sign;
+
+         /* get sign bit */
+         sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+         sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+
+         /* sign * 0.5 */
+         half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
+         half = LLVMBuildOr(bld->builder, sign, half, "");
+         half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
+      }
  
        res = LLVMBuildFAdd(bld->builder, a, half, "");
     }
@@ -1151,37 +1260,42 @@ lp_build_ifloor(struct lp_build_context *bld,
                  LLVMValueRef a)
  {
     const struct lp_type type = bld->type;
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMTypeRef int_vec_type = bld->int_vec_type;
     LLVMValueRef res;
  
     assert(type.floating);
     assert(lp_check_value(type, a));
  
-   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
+   if (util_cpu_caps.has_sse4_1 &&
+       (type.length == 1 || type.width*type.length == 128)) {
        res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
     }
     else {
-      /* Take the sign bit and add it to 1 constant */
-      LLVMTypeRef vec_type = lp_build_vec_type(type);
-      unsigned mantissa = lp_mantissa(type);
-      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
-      LLVMValueRef sign;
-      LLVMValueRef offset;
-
-      /* sign = a < 0 ? ~0 : 0 */
-      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
-      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
-      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
-
-      /* offset = -0.99999(9)f */
-      offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
-      offset = LLVMConstBitCast(offset, int_vec_type);
-
-      /* offset = a < 0 ? offset : 0.0f */
-      offset = LLVMBuildAnd(bld->builder, offset, sign, "");
-      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
-
-      res = LLVMBuildFAdd(bld->builder, a, offset, "ifloor.res");
+      res = a;
+
+      if (type.sign) {
+         /* Take the sign bit and add it to 1 constant */
+         LLVMTypeRef vec_type = bld->vec_type;
+         unsigned mantissa = lp_mantissa(type);
+         LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
+         LLVMValueRef sign;
+         LLVMValueRef offset;
+
+         /* sign = a < 0 ? ~0 : 0 */
+         sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+         sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+         sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
+
+         /* offset = -0.99999(9)f */
+         offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
+         offset = LLVMConstBitCast(offset, int_vec_type);
+
+         /* offset = a < 0 ? offset : 0.0f */
+         offset = LLVMBuildAnd(bld->builder, offset, sign, "");
+         offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
+
+         res = LLVMBuildFAdd(bld->builder, res, offset, "ifloor.res");
+      }
     }
  
     /* round to nearest (toward zero) */
@@ -1201,35 +1315,39 @@ lp_build_iceil(struct lp_build_context *bld,
                 LLVMValueRef a)
  {
     const struct lp_type type = bld->type;
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMTypeRef int_vec_type = bld->int_vec_type;
     LLVMValueRef res;
  
     assert(type.floating);
     assert(lp_check_value(type, a));
  
-   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
+   if (util_cpu_caps.has_sse4_1 &&
+       (type.length == 1 || type.width*type.length == 128)) {
        res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
     }
     else {
-      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      LLVMTypeRef vec_type = bld->vec_type;
        unsigned mantissa = lp_mantissa(type);
-      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
-      LLVMValueRef sign;
        LLVMValueRef offset;
  
-      /* sign = a < 0 ? 0 : ~0 */
-      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
-      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
-      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
-      sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
-
        /* offset = 0.99999(9)f */
        offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
-      offset = LLVMConstBitCast(offset, int_vec_type);
  
-      /* offset = a < 0 ? 0.0 : offset */
-      offset = LLVMBuildAnd(bld->builder, offset, sign, "");
-      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
+      if (type.sign) {
+         LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
+         LLVMValueRef sign;
+
+         /* sign = a < 0 ? 0 : ~0 */
+         sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+         sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+         sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
+         sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
+
+         /* offset = a < 0 ? 0.0 : offset */
+         offset = LLVMConstBitCast(offset, int_vec_type);
+         offset = LLVMBuildAnd(bld->builder, offset, sign, "");
+         offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
+      }
  
        res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res");
     }
@@ -1266,6 +1384,11 @@ lp_build_sqrt(struct lp_build_context *bld,
   *
   *   x_{i+1} = x_i * (2 - a * x_i)
   *
+ * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
+ * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
+ * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
+ * halo. It would be necessary to clamp the argument to prevent this.
+ *
   * See also:
   * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
   * - http://softwarecommunity.intel.com/articles/eng/1818.htm
@@ -1306,13 +1429,27 @@ lp_build_rcp(struct lp_build_context *bld,
     if(LLVMIsConstant(a))
        return LLVMConstFDiv(bld->one, a);
  
-   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+   /*
+    * We don't use RCPPS because:
+    * - it only has 10bits of precision
+    * - it doesn't even get the reciprocate of 1.0 exactly
+    * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
+    * - for recent processors the benefit over DIVPS is marginal, a case
+    *   depedent
+    *
+    * We could still use it on certain processors if benchmarks show that the
+    * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
+    * particular uses that require less workarounds.
+    */
+
+   if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+      const unsigned num_iterations = 0;
        LLVMValueRef res;
        unsigned i;
  
        res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
  
-      for (i = 0; i < RCP_NEWTON_STEPS; ++i) {
+      for (i = 0; i < num_iterations; ++i) {
           res = lp_build_rcp_refine(bld, a, res);
        }
  
@@ -1363,13 +1500,14 @@ lp_build_rsqrt(struct lp_build_context *bld,
  
     assert(type.floating);
  
-   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+   if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+      const unsigned num_iterations = 0;
        LLVMValueRef res;
        unsigned i;
  
        res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
  
-      for (i = 0; i < RSQRT_NEWTON_STEPS; ++i) {
+      for (i = 0; i < num_iterations; ++i) {
           res = lp_build_rsqrt_refine(bld, a, res);
        }
  
@@ -1840,9 +1978,11 @@ lp_build_pow(struct lp_build_context *bld,
               LLVMValueRef y)
  {
     /* TODO: optimize the constant case */
-   if(LLVMIsConstant(x) && LLVMIsConstant(y))
+   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
+       LLVMIsConstant(x) && LLVMIsConstant(y)) {
        debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                     __FUNCTION__);
+   }
  
     return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
  }
@@ -1897,9 +2037,11 @@ lp_build_polynomial(struct lp_build_context *bld,
     assert(lp_check_value(bld->type, x));
  
     /* TODO: optimize the constant case */
-   if(LLVMIsConstant(x))
+   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
+       LLVMIsConstant(x)) {
        debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                     __FUNCTION__);
+   }
  
     for (i = num_coeffs; i--; ) {
        LLVMValueRef coeff;
@@ -1971,9 +2113,11 @@ lp_build_exp2_approx(struct lp_build_context *bld,
  
     if(p_exp2_int_part || p_frac_part || p_exp2) {
        /* TODO: optimize the constant case */
-      if(LLVMIsConstant(x))
+      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
+          LLVMIsConstant(x)) {
           debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                        __FUNCTION__);
+      }
  
        assert(type.floating && type.width == 32);
  
@@ -2086,9 +2230,11 @@ lp_build_log2_approx(struct lp_build_context *bld,
  
     if(p_exp || p_floor_log2 || p_log2) {
        /* TODO: optimize the constant case */
-      if(LLVMIsConstant(x))
+      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
+          LLVMIsConstant(x)) {
           debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                        __FUNCTION__);
+      }
  
        assert(type.floating && type.width == 32);
  
@@ -2140,3 +2286,47 @@ lp_build_log2(struct lp_build_context *bld,
     lp_build_log2_approx(bld, x, NULL, NULL, &res);
     return res;
  }
+
+
+/**
+ * Faster (and less accurate) log2.
+ *
+ *    log2(x) = floor(log2(x)) + frac(x)
+ *
+ * See http://www.flipcode.com/archives/Fast_log_Function.shtml
+ */
+LLVMValueRef
+lp_build_fast_log2(struct lp_build_context *bld,
+                   LLVMValueRef x)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = bld->vec_type;
+   LLVMTypeRef int_vec_type = bld->int_vec_type;
+
+   unsigned mantissa = lp_mantissa(type);
+   LLVMValueRef mantmask = lp_build_const_int_vec(type, (1ULL << mantissa) - 1);
+   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
+
+   LLVMValueRef ipart;
+   LLVMValueRef fpart;
+
+   assert(lp_check_value(bld->type, x));
+
+   assert(type.floating);
+
+   x = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
+
+   /* ipart = floor(log2(x)) - 1 */
+   ipart = LLVMBuildLShr(bld->builder, x, lp_build_const_int_vec(type, mantissa), "");
+   ipart = LLVMBuildAnd(bld->builder, ipart, lp_build_const_int_vec(type, 255), "");
+   ipart = LLVMBuildSub(bld->builder, ipart, lp_build_const_int_vec(type, 128), "");
+   ipart = LLVMBuildSIToFP(bld->builder, ipart, vec_type, "");
+
+   /* fpart = 1.0 + frac(x) */
+   fpart = LLVMBuildAnd(bld->builder, x, mantmask, "");
+   fpart = LLVMBuildOr(bld->builder, fpart, one, "");
+   fpart = LLVMBuildBitCast(bld->builder, fpart, vec_type, "");
+
+   /* floor(log2(x)) + frac(x) */
+   return LLVMBuildFAdd(bld->builder, ipart, fpart, "");
+}