gallivm: fix indirect addressing of constant buffer

[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c

index f372a48846ff86986a543c38ecb09a1e3a0c93b3..f5f2623e467850de886d45cb699fb1d10ff6040c 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -56,7 +56,6 @@
  #include "lp_bld_intr.h"
  #include "lp_bld_logic.h"
  #include "lp_bld_pack.h"
-#include "lp_bld_debug.h"
  #include "lp_bld_arit.h"
  
  
@@ -239,7 +238,7 @@ lp_build_sum_vector(struct lp_build_context *bld,
  {
     const struct lp_type type = bld->type;
     LLVMValueRef index, res;
-   int i;
+   unsigned i;
  
     if (a == bld->zero)
        return bld->zero;
@@ -675,26 +674,13 @@ lp_build_abs(struct lp_build_context *bld,
  
     if(type.floating) {
        /* Mask out the sign bit */
-      if (type.length == 1) {
-         LLVMTypeRef int_type = LLVMIntType(type.width);
-         LLVMTypeRef float_type = LLVMFloatType();
-         unsigned long long absMask = ~(1ULL << (type.width - 1));
-         LLVMValueRef mask = LLVMConstInt(int_type, absMask, 0);
-         a = LLVMBuildBitCast(bld->builder, a, int_type, "");
-         a = LLVMBuildAnd(bld->builder, a, mask, "");
-         a = LLVMBuildBitCast(bld->builder, a, float_type, "");
-         return a;
-      }
-      else {
-         /* vector of floats */
-         LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
-         unsigned long long absMask = ~(1ULL << (type.width - 1));
-         LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
-         a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
-         a = LLVMBuildAnd(bld->builder, a, mask, "");
-         a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
-         return a;
-      }
+      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+      unsigned long long absMask = ~(1ULL << (type.width - 1));
+      LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
+      a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      a = LLVMBuildAnd(bld->builder, a, mask, "");
+      a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
+      return a;
     }
  
     if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
@@ -742,17 +728,9 @@ lp_build_sgn(struct lp_build_context *bld,
        LLVMValueRef one;
        unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
  
-      if (type.length == 1) {
-         int_type = lp_build_int_elem_type(type);
-         vec_type = lp_build_elem_type(type);
-         mask = LLVMConstInt(int_type, maskBit, 0);
-      }
-      else {
-         /* vector */
-         int_type = lp_build_int_vec_type(type);
-         vec_type = lp_build_vec_type(type);
-         mask = lp_build_const_int_vec(type, maskBit);
-      }
+      int_type = lp_build_int_vec_type(type);
+      vec_type = lp_build_vec_type(type);
+      mask = lp_build_const_int_vec(type, maskBit);
  
        /* Take the sign bit and add it to 1 constant */
        sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
@@ -819,21 +797,11 @@ lp_build_int_to_float(struct lp_build_context *bld,
                        LLVMValueRef a)
  {
     const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
  
     assert(type.floating);
-   /*assert(lp_check_value(type, a));*/
  
-   if (type.length == 1) {
-      LLVMTypeRef float_type = LLVMFloatType();
-      return LLVMBuildSIToFP(bld->builder, a, float_type, "");
-   }
-   else {
-      LLVMTypeRef vec_type = lp_build_vec_type(type);
-      /*LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);*/
-      LLVMValueRef res;
-      res = LLVMBuildSIToFP(bld->builder, a, vec_type, "");
-      return res;
-   }
+   return LLVMBuildSIToFP(bld->builder, a, vec_type, "");
  }
  
  
@@ -878,6 +846,11 @@ lp_build_round_sse41(struct lp_build_context *bld,
  }
  
  
+/**
+ * Return the integer part of a float (vector) value.  The returned value is
+ * a float (vector).
+ * Ex: trunc(-1.5) = -1.0
+ */
  LLVMValueRef
  lp_build_trunc(struct lp_build_context *bld,
                 LLVMValueRef a)
@@ -887,7 +860,7 @@ lp_build_trunc(struct lp_build_context *bld,
     assert(type.floating);
     assert(lp_check_value(type, a));
  
-   if(util_cpu_caps.has_sse4_1)
+   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
        return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
     else {
        LLVMTypeRef vec_type = lp_build_vec_type(type);
@@ -900,6 +873,12 @@ lp_build_trunc(struct lp_build_context *bld,
  }
  
  
+/**
+ * Return float (vector) rounded to nearest integer (vector).  The returned
+ * value is a float (vector).
+ * Ex: round(0.9) = 1.0
+ * Ex: round(-1.5) = -2.0
+ */
  LLVMValueRef
  lp_build_round(struct lp_build_context *bld,
                 LLVMValueRef a)
@@ -909,7 +888,7 @@ lp_build_round(struct lp_build_context *bld,
     assert(type.floating);
     assert(lp_check_value(type, a));
  
-   if(util_cpu_caps.has_sse4_1)
+   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
        return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
     else {
        LLVMTypeRef vec_type = lp_build_vec_type(type);
@@ -921,6 +900,11 @@ lp_build_round(struct lp_build_context *bld,
  }
  
  
+/**
+ * Return floor of float (vector), result is a float (vector)
+ * Ex: floor(1.1) = 1.0
+ * Ex: floor(-1.1) = -2.0
+ */
  LLVMValueRef
  lp_build_floor(struct lp_build_context *bld,
                 LLVMValueRef a)
@@ -928,15 +912,9 @@ lp_build_floor(struct lp_build_context *bld,
     const struct lp_type type = bld->type;
  
     assert(type.floating);
+   assert(lp_check_value(type, a));
  
-   if (type.length == 1) {
-      LLVMValueRef res;
-      res = lp_build_ifloor(bld, a);
-      res = LLVMBuildSIToFP(bld->builder, res, LLVMFloatType(), "");
-      return res;
-   }
-
-   if(util_cpu_caps.has_sse4_1)
+   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
        return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
     else {
        LLVMTypeRef vec_type = lp_build_vec_type(type);
@@ -948,6 +926,11 @@ lp_build_floor(struct lp_build_context *bld,
  }
  
  
+/**
+ * Return ceiling of float (vector), returning float (vector).
+ * Ex: ceil( 1.1) = 2.0
+ * Ex: ceil(-1.1) = -1.0
+ */
  LLVMValueRef
  lp_build_ceil(struct lp_build_context *bld,
                LLVMValueRef a)
@@ -957,7 +940,7 @@ lp_build_ceil(struct lp_build_context *bld,
     assert(type.floating);
     assert(lp_check_value(type, a));
  
-   if(util_cpu_caps.has_sse4_1)
+   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
        return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
     else {
        LLVMTypeRef vec_type = lp_build_vec_type(type);
@@ -970,7 +953,7 @@ lp_build_ceil(struct lp_build_context *bld,
  
  
  /**
- * Return fractional part of 'a' computed as a - floor(f)
+ * Return fractional part of 'a' computed as a - floor(a)
   * Typically used in texture coord arithmetic.
   */
  LLVMValueRef
@@ -983,31 +966,29 @@ lp_build_fract(struct lp_build_context *bld,
  
  
  /**
- * Convert to integer, through whichever rounding method that's fastest,
- * typically truncating toward zero.
+ * Return the integer part of a float (vector) value.  The returned value is
+ * an integer (vector).
+ * Ex: itrunc(-1.5) = -1
   */
  LLVMValueRef
  lp_build_itrunc(struct lp_build_context *bld,
                  LLVMValueRef a)
  {
     const struct lp_type type = bld->type;
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
  
     assert(type.floating);
+   assert(lp_check_value(type, a));
  
-   if (type.length == 1) {
-      LLVMTypeRef int_type = LLVMIntType(type.width);
-      return LLVMBuildFPToSI(bld->builder, a, int_type, "");
-   }
-   else {
-      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
-      assert(lp_check_value(type, a));
-      return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
-   }
+   return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
  }
  
  
  /**
- * Convert float[] to int[] with round().
+ * Return float (vector) rounded to nearest integer (vector).  The returned
+ * value is an integer (vector).
+ * Ex: iround(0.9) = 1
+ * Ex: iround(-1.5) = -2
   */
  LLVMValueRef
  lp_build_iround(struct lp_build_context *bld,
@@ -1019,17 +1000,9 @@ lp_build_iround(struct lp_build_context *bld,
  
     assert(type.floating);
  
-   if (type.length == 1) {
-      /* scalar float to int */
-      LLVMTypeRef int_type = LLVMIntType(type.width);
-      /* XXX we want rounding here! */
-      res = LLVMBuildFPToSI(bld->builder, a, int_type, "");
-      return res;
-   }
-
     assert(lp_check_value(type, a));
  
-   if(util_cpu_caps.has_sse4_1) {
+   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
        res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
     }
     else {
@@ -1058,7 +1031,9 @@ lp_build_iround(struct lp_build_context *bld,
  
  
  /**
- * Convert float[] to int[] with floor().
+ * Return floor of float (vector), result is an int (vector)
+ * Ex: ifloor(1.1) = 1
+ * Ex: ifloor(-1.1) = -2
   */
  LLVMValueRef
  lp_build_ifloor(struct lp_build_context *bld,
@@ -1069,17 +1044,9 @@ lp_build_ifloor(struct lp_build_context *bld,
     LLVMValueRef res;
  
     assert(type.floating);
-
-   if (type.length == 1) {
-      /* scalar float to int */
-      LLVMTypeRef int_type = LLVMIntType(type.width);
-      res = LLVMBuildFPToSI(bld->builder, a, int_type, "");
-      return res;
-   }
-
     assert(lp_check_value(type, a));
  
-   if(util_cpu_caps.has_sse4_1) {
+   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
        res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
     }
     else {
@@ -1093,29 +1060,31 @@ lp_build_ifloor(struct lp_build_context *bld,
        /* sign = a < 0 ? ~0 : 0 */
        sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
        sign = LLVMBuildAnd(bld->builder, sign, mask, "");
-      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "");
-      lp_build_name(sign, "floor.sign");
+      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
  
        /* offset = -0.99999(9)f */
-      offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
+      offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
        offset = LLVMConstBitCast(offset, int_vec_type);
  
-      /* offset = a < 0 ? -0.99999(9)f : 0.0f */
+      /* offset = a < 0 ? offset : 0.0f */
        offset = LLVMBuildAnd(bld->builder, offset, sign, "");
-      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "");
-      lp_build_name(offset, "floor.offset");
+      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
  
-      res = LLVMBuildAdd(bld->builder, a, offset, "");
-      lp_build_name(res, "floor.res");
+      res = LLVMBuildAdd(bld->builder, a, offset, "ifloor.res");
     }
  
-   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
-   lp_build_name(res, "floor");
+   /* convert to integer, truncating toward zero */
+   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "ifloor.res");
  
     return res;
  }
  
  
+/**
+ * Return ceiling of float (vector), returning int (vector).
+ * Ex: iceil( 1.1) = 2
+ * Ex: iceil(-1.1) = -1
+ */
  LLVMValueRef
  lp_build_iceil(struct lp_build_context *bld,
                 LLVMValueRef a)
@@ -1127,15 +1096,35 @@ lp_build_iceil(struct lp_build_context *bld,
     assert(type.floating);
     assert(lp_check_value(type, a));
  
-   if(util_cpu_caps.has_sse4_1) {
+   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
        res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
     }
     else {
-      assert(0);
-      res = bld->undef;
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      unsigned mantissa = lp_mantissa(type);
+      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
+      LLVMValueRef sign;
+      LLVMValueRef offset;
+
+      /* sign = a < 0 ? 0 : ~0 */
+      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
+      sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
+
+      /* offset = 0.99999(9)f */
+      offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
+      offset = LLVMConstBitCast(offset, int_vec_type);
+
+      /* offset = a < 0 ? 0.0 : offset */
+      offset = LLVMBuildAnd(bld->builder, offset, sign, "");
+      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
+
+      res = LLVMBuildAdd(bld->builder, a, offset, "iceil.res");
     }
  
-   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
+   /* convert to integer, truncating toward zero */
+   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "iceil.res");
  
     return res;
  }
@@ -1228,61 +1217,454 @@ lp_build_rsqrt(struct lp_build_context *bld,
  }
  
  
+static inline LLVMValueRef
+lp_build_const_v4si(unsigned long value)
+{
+   LLVMValueRef element = LLVMConstInt(LLVMInt32Type(), value, 0);
+   LLVMValueRef elements[4] = { element, element, element, element };
+   return LLVMConstVector(elements, 4);
+}
+
+static inline LLVMValueRef
+lp_build_const_v4sf(float value)
+{
+   LLVMValueRef element = LLVMConstReal(LLVMFloatType(), value);
+   LLVMValueRef elements[4] = { element, element, element, element };
+   return LLVMConstVector(elements, 4);
+}
+
+
  /**
- * Generate cos(a)
+ * Generate sin(a) using SSE2
   */
  LLVMValueRef
-lp_build_cos(struct lp_build_context *bld,
-              LLVMValueRef a)
+lp_build_sin(struct lp_build_context *bld,
+             LLVMValueRef a)
  {
-#ifdef PIPE_OS_WINDOWS
+   struct lp_type int_type = lp_int_type(bld->type);
+   LLVMBuilderRef b = bld->builder;
+   LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
+   LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
+
     /*
-    * FIXME: X86 backend translates llvm.cos.v4f32 to 4 calls to CRT's cosf()
-    * which is neither efficient nor does the CRT linkage work on Windows
-    * causing segmentation fault. So simply disable the code for now.
+    *  take the absolute value,
+    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
      */
-   return bld->one;
-#else
-   const struct lp_type type = bld->type;
-   LLVMTypeRef vec_type = lp_build_vec_type(type);
-   char intrinsic[32];
  
-   /* TODO: optimize the constant case */
+   LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
+   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
  
-   assert(type.floating);
-   util_snprintf(intrinsic, sizeof intrinsic, "llvm.cos.v%uf%u", type.length, type.width);
+   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
+   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
  
-   return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
-#endif
+   /*
+    * extract the sign bit (upper one)
+    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
+    */
+   LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000);
+   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
+
+   /*
+    * scale by 4/Pi
+    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+    */
+   
+   LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
+   LLVMValueRef scale_y = LLVMBuildMul(b, x_abs, FOPi, "scale_y");
+
+   /*
+    * store the integer part of y in mm0
+    * emm2 = _mm_cvttps_epi32(y);
+    */
+   
+   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
+
+   /*
+    * j=(j+1) & (~1) (see the cephes sources)
+    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+    */
+
+   LLVMValueRef all_one = lp_build_const_v4si(1);
+   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
+   /*
+    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+    */
+   LLVMValueRef inv_one = lp_build_const_v4si(~1);
+   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
+
+   /*
+    * y = _mm_cvtepi32_ps(emm2);
+    */
+   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
+
+   /* get the swap sign flag
+    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
+    */
+   LLVMValueRef pi32_4 = lp_build_const_v4si(4);
+   LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
+   
+   /*
+    * emm2 = _mm_slli_epi32(emm0, 29);
+    */  
+   LLVMValueRef const_29 = lp_build_const_v4si(29);
+   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
+
+   /*
+    * get the polynom selection mask 
+    * there is one polynom for 0 <= x <= Pi/4
+    * and another one for Pi/4<x<=Pi/2
+    * Both branches will be computed.
+    *  
+    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+    */
+
+   LLVMValueRef pi32_2 = lp_build_const_v4si(2);
+   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
+   LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
+                                             emm2_3, lp_build_const_v4si(0));
+   /*
+    *   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+    */
+   LLVMValueRef sign_bit_1 =  LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
+
+   /*
+    * _PS_CONST(minus_cephes_DP1, -0.78515625);
+    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
+    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
+    */
+   LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
+   LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
+   LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
+
+   /*
+    * The magic pass: "Extended precision modular arithmetic" 
+    * x = ((x - y * DP1) - y * DP2) - y * DP3; 
+    * xmm1 = _mm_mul_ps(y, xmm1);
+    * xmm2 = _mm_mul_ps(y, xmm2);
+    * xmm3 = _mm_mul_ps(y, xmm3);
+    */
+   LLVMValueRef xmm1 = LLVMBuildMul(b, y_2, DP1, "xmm1");
+   LLVMValueRef xmm2 = LLVMBuildMul(b, y_2, DP2, "xmm2");
+   LLVMValueRef xmm3 = LLVMBuildMul(b, y_2, DP3, "xmm3");
+
+   /*
+    * x = _mm_add_ps(x, xmm1);
+    * x = _mm_add_ps(x, xmm2);
+    * x = _mm_add_ps(x, xmm3);
+    */ 
+
+   LLVMValueRef x_1 = LLVMBuildAdd(b, x_abs, xmm1, "x_1");
+   LLVMValueRef x_2 = LLVMBuildAdd(b, x_1, xmm2, "x_2");
+   LLVMValueRef x_3 = LLVMBuildAdd(b, x_2, xmm3, "x_3");
+
+   /*
+    * Evaluate the first polynom  (0 <= x <= Pi/4)
+    *
+    * z = _mm_mul_ps(x,x);
+    */
+   LLVMValueRef z = LLVMBuildMul(b, x_3, x_3, "z");
+
+   /*
+    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
+    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
+    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
+    */
+   LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
+   LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
+   LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
+
+   /*
+    * y = *(v4sf*)_ps_coscof_p0;
+    * y = _mm_mul_ps(y, z);
+    */
+   LLVMValueRef y_3 = LLVMBuildMul(b, z, coscof_p0, "y_3");
+   LLVMValueRef y_4 = LLVMBuildAdd(b, y_3, coscof_p1, "y_4");
+   LLVMValueRef y_5 = LLVMBuildMul(b, y_4, z, "y_5");
+   LLVMValueRef y_6 = LLVMBuildAdd(b, y_5, coscof_p2, "y_6");
+   LLVMValueRef y_7 = LLVMBuildMul(b, y_6, z, "y_7");
+   LLVMValueRef y_8 = LLVMBuildMul(b, y_7, z, "y_8");
+
+
+   /*
+    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+    * y = _mm_sub_ps(y, tmp);
+    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
+    */ 
+   LLVMValueRef half = lp_build_const_v4sf(0.5);
+   LLVMValueRef tmp = LLVMBuildMul(b, z, half, "tmp");
+   LLVMValueRef y_9 = LLVMBuildSub(b, y_8, tmp, "y_8");
+   LLVMValueRef one = lp_build_const_v4sf(1.0);
+   LLVMValueRef y_10 = LLVMBuildAdd(b, y_9, one, "y_9");
+
+   /*
+    * _PS_CONST(sincof_p0, -1.9515295891E-4);
+    * _PS_CONST(sincof_p1,  8.3321608736E-3);
+    * _PS_CONST(sincof_p2, -1.6666654611E-1);
+    */
+   LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
+   LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
+   LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
+
+   /*
+    * Evaluate the second polynom  (Pi/4 <= x <= 0)
+    *
+    * y2 = *(v4sf*)_ps_sincof_p0;
+    * y2 = _mm_mul_ps(y2, z);
+    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+    * y2 = _mm_mul_ps(y2, z);
+    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+    * y2 = _mm_mul_ps(y2, z);
+    * y2 = _mm_mul_ps(y2, x);
+    * y2 = _mm_add_ps(y2, x);
+    */
+
+   LLVMValueRef y2_3 = LLVMBuildMul(b, z, sincof_p0, "y2_3");
+   LLVMValueRef y2_4 = LLVMBuildAdd(b, y2_3, sincof_p1, "y2_4");
+   LLVMValueRef y2_5 = LLVMBuildMul(b, y2_4, z, "y2_5");
+   LLVMValueRef y2_6 = LLVMBuildAdd(b, y2_5, sincof_p2, "y2_6");
+   LLVMValueRef y2_7 = LLVMBuildMul(b, y2_6, z, "y2_7");
+   LLVMValueRef y2_8 = LLVMBuildMul(b, y2_7, x_3, "y2_8");
+   LLVMValueRef y2_9 = LLVMBuildAdd(b, y2_8, x_3, "y2_9");
+
+   /*
+    * select the correct result from the two polynoms
+    * xmm3 = poly_mask;
+    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+    * y = _mm_andnot_ps(xmm3, y);
+    * y = _mm_add_ps(y,y2);
+    */
+   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
+   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
+   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
+   LLVMValueRef inv = lp_build_const_v4si(~0);
+   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
+   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
+   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
+
+   /*
+    * update the sign
+    * y = _mm_xor_ps(y, sign_bit);
+    */
+   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
+   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
+   return y_result;
  }
  
  
  /**
- * Generate sin(a)
+ * Generate cos(a) using SSE2
   */
  LLVMValueRef
-lp_build_sin(struct lp_build_context *bld,
-              LLVMValueRef a)
+lp_build_cos(struct lp_build_context *bld,
+             LLVMValueRef a)
  {
-#ifdef PIPE_OS_WINDOWS
+   struct lp_type int_type = lp_int_type(bld->type);
+   LLVMBuilderRef b = bld->builder;
+   LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
+   LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
+
     /*
-    * FIXME: X86 backend translates llvm.sin.v4f32 to 4 calls to CRT's sinf()
-    * which is neither efficient nor does the CRT linkage work on Windows
-    * causing segmentation fault. So simply disable the code for now.
+    *  take the absolute value,
+    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
      */
-   return bld->zero;
-#else
-   const struct lp_type type = bld->type;
-   LLVMTypeRef vec_type = lp_build_vec_type(type);
-   char intrinsic[32];
  
-   /* TODO: optimize the constant case */
+   LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
+   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
  
-   assert(type.floating);
-   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sin.v%uf%u", type.length, type.width);
+   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
+   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
  
-   return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
-#endif
+   /*
+    * scale by 4/Pi
+    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+    */
+   
+   LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
+   LLVMValueRef scale_y = LLVMBuildMul(b, x_abs, FOPi, "scale_y");
+
+   /*
+    * store the integer part of y in mm0
+    * emm2 = _mm_cvttps_epi32(y);
+    */
+   
+   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
+
+   /*
+    * j=(j+1) & (~1) (see the cephes sources)
+    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+    */
+
+   LLVMValueRef all_one = lp_build_const_v4si(1);
+   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
+   /*
+    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+    */
+   LLVMValueRef inv_one = lp_build_const_v4si(~1);
+   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
+
+   /*
+    * y = _mm_cvtepi32_ps(emm2);
+    */
+   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
+
+
+   /*
+    * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
+    */
+   LLVMValueRef const_2 = lp_build_const_v4si(2);
+   LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
+
+
+   /* get the swap sign flag
+    * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
+    */
+   LLVMValueRef inv = lp_build_const_v4si(~0);
+   LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
+   LLVMValueRef pi32_4 = lp_build_const_v4si(4);
+   LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
+   
+   /*
+    * emm2 = _mm_slli_epi32(emm0, 29);
+    */  
+   LLVMValueRef const_29 = lp_build_const_v4si(29);
+   LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
+
+   /*
+    * get the polynom selection mask 
+    * there is one polynom for 0 <= x <= Pi/4
+    * and another one for Pi/4<x<=Pi/2
+    * Both branches will be computed.
+    *  
+    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+    */
+
+   LLVMValueRef pi32_2 = lp_build_const_v4si(2);
+   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
+   LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
+                                            emm2_3, lp_build_const_v4si(0));
+
+   /*
+    * _PS_CONST(minus_cephes_DP1, -0.78515625);
+    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
+    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
+    */
+   LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
+   LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
+   LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
+
+   /*
+    * The magic pass: "Extended precision modular arithmetic" 
+    * x = ((x - y * DP1) - y * DP2) - y * DP3; 
+    * xmm1 = _mm_mul_ps(y, xmm1);
+    * xmm2 = _mm_mul_ps(y, xmm2);
+    * xmm3 = _mm_mul_ps(y, xmm3);
+    */
+   LLVMValueRef xmm1 = LLVMBuildMul(b, y_2, DP1, "xmm1");
+   LLVMValueRef xmm2 = LLVMBuildMul(b, y_2, DP2, "xmm2");
+   LLVMValueRef xmm3 = LLVMBuildMul(b, y_2, DP3, "xmm3");
+
+   /*
+    * x = _mm_add_ps(x, xmm1);
+    * x = _mm_add_ps(x, xmm2);
+    * x = _mm_add_ps(x, xmm3);
+    */ 
+
+   LLVMValueRef x_1 = LLVMBuildAdd(b, x_abs, xmm1, "x_1");
+   LLVMValueRef x_2 = LLVMBuildAdd(b, x_1, xmm2, "x_2");
+   LLVMValueRef x_3 = LLVMBuildAdd(b, x_2, xmm3, "x_3");
+
+   /*
+    * Evaluate the first polynom  (0 <= x <= Pi/4)
+    *
+    * z = _mm_mul_ps(x,x);
+    */
+   LLVMValueRef z = LLVMBuildMul(b, x_3, x_3, "z");
+
+   /*
+    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
+    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
+    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
+    */
+   LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
+   LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
+   LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
+
+   /*
+    * y = *(v4sf*)_ps_coscof_p0;
+    * y = _mm_mul_ps(y, z);
+    */
+   LLVMValueRef y_3 = LLVMBuildMul(b, z, coscof_p0, "y_3");
+   LLVMValueRef y_4 = LLVMBuildAdd(b, y_3, coscof_p1, "y_4");
+   LLVMValueRef y_5 = LLVMBuildMul(b, y_4, z, "y_5");
+   LLVMValueRef y_6 = LLVMBuildAdd(b, y_5, coscof_p2, "y_6");
+   LLVMValueRef y_7 = LLVMBuildMul(b, y_6, z, "y_7");
+   LLVMValueRef y_8 = LLVMBuildMul(b, y_7, z, "y_8");
+
+
+   /*
+    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+    * y = _mm_sub_ps(y, tmp);
+    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
+    */ 
+   LLVMValueRef half = lp_build_const_v4sf(0.5);
+   LLVMValueRef tmp = LLVMBuildMul(b, z, half, "tmp");
+   LLVMValueRef y_9 = LLVMBuildSub(b, y_8, tmp, "y_8");
+   LLVMValueRef one = lp_build_const_v4sf(1.0);
+   LLVMValueRef y_10 = LLVMBuildAdd(b, y_9, one, "y_9");
+
+   /*
+    * _PS_CONST(sincof_p0, -1.9515295891E-4);
+    * _PS_CONST(sincof_p1,  8.3321608736E-3);
+    * _PS_CONST(sincof_p2, -1.6666654611E-1);
+    */
+   LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
+   LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
+   LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
+
+   /*
+    * Evaluate the second polynom  (Pi/4 <= x <= 0)
+    *
+    * y2 = *(v4sf*)_ps_sincof_p0;
+    * y2 = _mm_mul_ps(y2, z);
+    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+    * y2 = _mm_mul_ps(y2, z);
+    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+    * y2 = _mm_mul_ps(y2, z);
+    * y2 = _mm_mul_ps(y2, x);
+    * y2 = _mm_add_ps(y2, x);
+    */
+
+   LLVMValueRef y2_3 = LLVMBuildMul(b, z, sincof_p0, "y2_3");
+   LLVMValueRef y2_4 = LLVMBuildAdd(b, y2_3, sincof_p1, "y2_4");
+   LLVMValueRef y2_5 = LLVMBuildMul(b, y2_4, z, "y2_5");
+   LLVMValueRef y2_6 = LLVMBuildAdd(b, y2_5, sincof_p2, "y2_6");
+   LLVMValueRef y2_7 = LLVMBuildMul(b, y2_6, z, "y2_7");
+   LLVMValueRef y2_8 = LLVMBuildMul(b, y2_7, x_3, "y2_8");
+   LLVMValueRef y2_9 = LLVMBuildAdd(b, y2_8, x_3, "y2_9");
+
+   /*
+    * select the correct result from the two polynoms
+    * xmm3 = poly_mask;
+    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+    * y = _mm_andnot_ps(xmm3, y);
+    * y = _mm_add_ps(y,y2);
+    */
+   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
+   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
+   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
+   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
+   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
+   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
+
+   /*
+    * update the sign
+    * y = _mm_xor_ps(y, sign_bit);
+    */
+   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
+   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
+   return y_result;
  }
  
  
@@ -1346,7 +1728,6 @@ lp_build_polynomial(struct lp_build_context *bld,
                      unsigned num_coeffs)
  {
     const struct lp_type type = bld->type;
-   LLVMTypeRef float_type = LLVMFloatType();
     LLVMValueRef res = NULL;
     unsigned i;
  
@@ -1358,10 +1739,7 @@ lp_build_polynomial(struct lp_build_context *bld,
     for (i = num_coeffs; i--; ) {
        LLVMValueRef coeff;
  
-      if (type.length == 1)
-         coeff = LLVMConstReal(float_type, coeffs[i]);
-      else
-         coeff = lp_build_const_vec(type, coeffs[i]);
+      coeff = lp_build_const_vec(type, coeffs[i]);
  
        if(res)
           res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
@@ -1377,17 +1755,31 @@ lp_build_polynomial(struct lp_build_context *bld,
  
  
  /**
- * Minimax polynomial fit of 2**x, in range [-0.5, 0.5[
+ * Minimax polynomial fit of 2**x, in range [0, 1[
   */
  const double lp_build_exp2_polynomial[] = {
  #if EXP_POLY_DEGREE == 5
-   9.9999994e-1, 6.9315308e-1, 2.4015361e-1, 5.5826318e-2, 8.9893397e-3, 1.8775767e-3
+   0.999999999690134838155,
+   0.583974334321735217258,
+   0.164553105719676828492,
+   0.0292811063701710962255,
+   0.00354944426657875141846,
+   0.000296253726543423377365
  #elif EXP_POLY_DEGREE == 4
-   1.0000026, 6.9300383e-1, 2.4144275e-1, 5.2011464e-2, 1.3534167e-2
+   1.00000001502262084505,
+   0.563586057338685991394,
+   0.150436017652442413623,
+   0.0243220604213317927308,
+   0.0025359088446580436489
  #elif EXP_POLY_DEGREE == 3
-   9.9992520e-1, 6.9583356e-1, 2.2606716e-1, 7.8024521e-2
+   0.999925218562710312959,
+   0.695833540494823811697,
+   0.226067155427249155588,
+   0.0780245226406372992967
  #elif EXP_POLY_DEGREE == 2
-   1.0017247, 6.5763628e-1, 3.3718944e-1
+   1.00172476321474503578,
+   0.657636275736077639316,
+   0.33718943461968720704
  #else
  #error
  #endif
@@ -1421,17 +1813,16 @@ lp_build_exp2_approx(struct lp_build_context *bld,
        x = lp_build_min(bld, x, lp_build_const_vec(type,  129.0));
        x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));
  
-      /* ipart = int(x - 0.5) */
-      ipart = LLVMBuildSub(bld->builder, x, lp_build_const_vec(type, 0.5f), "");
-      ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
+      /* ipart = floor(x) */
+      ipart = lp_build_floor(bld, x);
  
        /* fpart = x - ipart */
-      fpart = LLVMBuildSIToFP(bld->builder, ipart, vec_type, "");
-      fpart = LLVMBuildSub(bld->builder, x, fpart, "");
+      fpart = LLVMBuildSub(bld->builder, x, ipart, "");
     }
  
     if(p_exp2_int_part || p_exp2) {
        /* expipart = (float) (1 << ipart) */
+      ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
        expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
        expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
        expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
@@ -1472,13 +1863,27 @@ lp_build_exp2(struct lp_build_context *bld,
   */
  const double lp_build_log2_polynomial[] = {
  #if LOG_POLY_DEGREE == 6
-   3.11578814719469302614, -3.32419399085241980044, 2.59883907202499966007, -1.23152682416275988241, 0.318212422185251071475, -0.0344359067839062357313
+   3.11578814719469302614,
+   -3.32419399085241980044,
+   2.59883907202499966007,
+   -1.23152682416275988241,
+   0.318212422185251071475,
+   -0.0344359067839062357313
  #elif LOG_POLY_DEGREE == 5
-   2.8882704548164776201, -2.52074962577807006663, 1.48116647521213171641, -0.465725644288844778798, 0.0596515482674574969533
+   2.8882704548164776201,
+   -2.52074962577807006663,
+   1.48116647521213171641,
+   -0.465725644288844778798,
+   0.0596515482674574969533
  #elif LOG_POLY_DEGREE == 4
-   2.61761038894603480148, -1.75647175389045657003, 0.688243882994381274313, -0.107254423828329604454
+   2.61761038894603480148,
+   -1.75647175389045657003,
+   0.688243882994381274313,
+   -0.107254423828329604454
  #elif LOG_POLY_DEGREE == 3
-   2.28330284476918490682, -1.04913055217340124191, 0.204446009836232697516
+   2.28330284476918490682,
+   -1.04913055217340124191,
+   0.204446009836232697516
  #else
  #error
  #endif
@@ -1558,89 +1963,11 @@ lp_build_log2_approx(struct lp_build_context *bld,
  }
  
  
-/** scalar version of above function */
-static void
-lp_build_float_log2_approx(struct lp_build_context *bld,
-                           LLVMValueRef x,
-                           LLVMValueRef *p_exp,
-                           LLVMValueRef *p_floor_log2,
-                           LLVMValueRef *p_log2)
-{
-   const struct lp_type type = bld->type;
-   LLVMTypeRef float_type = LLVMFloatType();
-   LLVMTypeRef int_type = LLVMIntType(type.width);
-
-   LLVMValueRef expmask = LLVMConstInt(int_type, 0x7f800000, 0);
-   LLVMValueRef mantmask = LLVMConstInt(int_type, 0x007fffff, 0);
-   LLVMValueRef one = LLVMConstBitCast(bld->one, int_type);
-
-   LLVMValueRef i = NULL;
-   LLVMValueRef exp = NULL;
-   LLVMValueRef mant = NULL;
-   LLVMValueRef logexp = NULL;
-   LLVMValueRef logmant = NULL;
-   LLVMValueRef res = NULL;
-
-   if(p_exp || p_floor_log2 || p_log2) {
-      /* TODO: optimize the constant case */
-      if(LLVMIsConstant(x))
-         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
-                      __FUNCTION__);
-
-      assert(type.floating && type.width == 32);
-
-      i = LLVMBuildBitCast(bld->builder, x, int_type, "");
-
-      /* exp = (float) exponent(x) */
-      exp = LLVMBuildAnd(bld->builder, i, expmask, "");
-   }
-
-   if(p_floor_log2 || p_log2) {
-      LLVMValueRef c23 = LLVMConstInt(int_type, 23, 0);
-      LLVMValueRef c127 = LLVMConstInt(int_type, 127, 0);
-      logexp = LLVMBuildLShr(bld->builder, exp, c23, "");
-      logexp = LLVMBuildSub(bld->builder, logexp, c127, "");
-      logexp = LLVMBuildSIToFP(bld->builder, logexp, float_type, "");
-   }
-
-   if(p_log2) {
-      /* mant = (float) mantissa(x) */
-      mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
-      mant = LLVMBuildOr(bld->builder, mant, one, "");
-      mant = LLVMBuildBitCast(bld->builder, mant, float_type, "");
-
-      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
-                                    Elements(lp_build_log2_polynomial));
-
-      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
-      logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");
-
-      res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
-   }
-
-   if(p_exp) {
-      exp = LLVMBuildBitCast(bld->builder, exp, float_type, "");
-      *p_exp = exp;
-   }
-
-   if(p_floor_log2)
-      *p_floor_log2 = logexp;
-
-   if(p_log2)
-      *p_log2 = res;
-}
-
-
  LLVMValueRef
  lp_build_log2(struct lp_build_context *bld,
                LLVMValueRef x)
  {
     LLVMValueRef res;
-   if (bld->type.length == 1) {
-      lp_build_float_log2_approx(bld, x, NULL, NULL, &res);
-   }
-   else {
-      lp_build_log2_approx(bld, x, NULL, NULL, &res);
-   }
+   lp_build_log2_approx(bld, x, NULL, NULL, &res);
     return res;
  }