Merge remote branch 'origin/master' into lp-binning

[mesa.git] / src / gallium / drivers / llvmpipe / lp_bld_arit.c
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_arit.c b/src/gallium/drivers/llvmpipe/lp_bld_arit.c

index f8260938f5a337048a6fa1cc4afc94180813895b..2df86dd32e50e63476af819728907994ea7988a9 100644 (file)
--- a/src/gallium/drivers/llvmpipe/lp_bld_arit.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_arit.c
@@ -47,6 +47,7 @@
  
  #include "util/u_memory.h"
  #include "util/u_debug.h"
+#include "util/u_math.h"
  #include "util/u_string.h"
  #include "util/u_cpu_detect.h"
  
@@ -54,6 +55,7 @@
  #include "lp_bld_const.h"
  #include "lp_bld_intr.h"
  #include "lp_bld_logic.h"
+#include "lp_bld_pack.h"
  #include "lp_bld_debug.h"
  #include "lp_bld_arit.h"
  
@@ -279,45 +281,6 @@ lp_build_sub(struct lp_build_context *bld,
  }
  
  
-/**
- * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
- */
-static LLVMValueRef 
-lp_build_unpack_shuffle(unsigned n, unsigned lo_hi)
-{
-   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
-   unsigned i, j;
-
-   assert(n <= LP_MAX_VECTOR_LENGTH);
-   assert(lo_hi < 2);
-
-   for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
-      elems[i + 0] = LLVMConstInt(LLVMInt32Type(), 0 + j, 0);
-      elems[i + 1] = LLVMConstInt(LLVMInt32Type(), n + j, 0);
-   }
-
-   return LLVMConstVector(elems, n);
-}
-
-
-/**
- * Build constant int vector of width 'n' and value 'c'.
- */
-static LLVMValueRef 
-lp_build_const_vec(LLVMTypeRef type, unsigned n, long long c)
-{
-   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
-   unsigned i;
-
-   assert(n <= LP_MAX_VECTOR_LENGTH);
-
-   for(i = 0; i < n; ++i)
-      elems[i] = LLVMConstInt(type, c, 0);
-
-   return LLVMConstVector(elems, n);
-}
-
-
  /**
   * Normalized 8bit multiplication.
   *
@@ -361,33 +324,30 @@ lp_build_const_vec(LLVMTypeRef type, unsigned n, long long c)
   */
  static LLVMValueRef
  lp_build_mul_u8n(LLVMBuilderRef builder,
+                 struct lp_type i16_type,
                   LLVMValueRef a, LLVMValueRef b)
  {
-   static LLVMValueRef c01 = NULL;
-   static LLVMValueRef c08 = NULL;
-   static LLVMValueRef c80 = NULL;
+   LLVMValueRef c8;
     LLVMValueRef ab;
  
-   if(!c01) c01 = lp_build_const_vec(LLVMInt16Type(), 8, 0x01);
-   if(!c08) c08 = lp_build_const_vec(LLVMInt16Type(), 8, 0x08);
-   if(!c80) c80 = lp_build_const_vec(LLVMInt16Type(), 8, 0x80);
+   c8 = lp_build_int_const_scalar(i16_type, 8);
     
  #if 0
     
     /* a*b/255 ~= (a*(b + 1)) >> 256 */
-   b = LLVMBuildAdd(builder, b, c01, "");
+   b = LLVMBuildAdd(builder, b, lp_build_int_const_scalar(i16_type, 1), "");
     ab = LLVMBuildMul(builder, a, b, "");
  
  #else
     
-   /* t/255 ~= (t + (t >> 8) + 0x80) >> 8 */
+   /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
     ab = LLVMBuildMul(builder, a, b, "");
-   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c08, ""), "");
-   ab = LLVMBuildAdd(builder, ab, c80, "");
+   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
+   ab = LLVMBuildAdd(builder, ab, lp_build_int_const_scalar(i16_type, 0x80), "");
  
  #endif
     
-   ab = LLVMBuildLShr(builder, ab, c08, "");
+   ab = LLVMBuildLShr(builder, ab, c8, "");
  
     return ab;
  }
@@ -402,6 +362,8 @@ lp_build_mul(struct lp_build_context *bld,
               LLVMValueRef b)
  {
     const struct lp_type type = bld->type;
+   LLVMValueRef shift;
+   LLVMValueRef res;
  
     if(a == bld->zero)
        return bld->zero;
@@ -415,39 +377,18 @@ lp_build_mul(struct lp_build_context *bld,
        return bld->undef;
  
     if(!type.floating && !type.fixed && type.norm) {
-      if(util_cpu_caps.has_sse2 && type.width == 8 && type.length == 16) {
-         LLVMTypeRef i16x8 = LLVMVectorType(LLVMInt16Type(), 8);
-         LLVMTypeRef i8x16 = LLVMVectorType(LLVMInt8Type(), 16);
-         static LLVMValueRef ml = NULL;
-         static LLVMValueRef mh = NULL;
-         LLVMValueRef al, ah, bl, bh;
-         LLVMValueRef abl, abh;
-         LLVMValueRef ab;
-         
-         if(!ml) ml = lp_build_unpack_shuffle(16, 0);
-         if(!mh) mh = lp_build_unpack_shuffle(16, 1);
-
-         /*  PUNPCKLBW, PUNPCKHBW */
-         al = LLVMBuildShuffleVector(bld->builder, a, bld->zero, ml, "");
-         bl = LLVMBuildShuffleVector(bld->builder, b, bld->zero, ml, "");
-         ah = LLVMBuildShuffleVector(bld->builder, a, bld->zero, mh, "");
-         bh = LLVMBuildShuffleVector(bld->builder, b, bld->zero, mh, "");
+      if(type.width == 8) {
+         struct lp_type i16_type = lp_wider_type(type);
+         LLVMValueRef al, ah, bl, bh, abl, abh, ab;
  
-         /* NOP */
-         al = LLVMBuildBitCast(bld->builder, al, i16x8, "");
-         bl = LLVMBuildBitCast(bld->builder, bl, i16x8, "");
-         ah = LLVMBuildBitCast(bld->builder, ah, i16x8, "");
-         bh = LLVMBuildBitCast(bld->builder, bh, i16x8, "");
+         lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
+         lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
  
           /* PMULLW, PSRLW, PADDW */
-         abl = lp_build_mul_u8n(bld->builder, al, bl);
-         abh = lp_build_mul_u8n(bld->builder, ah, bh);
+         abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
+         abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
  
-         /* PACKUSWB */
-         ab = lp_build_intrinsic_binary(bld->builder, "llvm.x86.sse2.packuswb.128" , i16x8, abl, abh);
-
-         /* NOP */
-         ab = LLVMBuildBitCast(bld->builder, ab, i8x16, "");
+         ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
           
           return ab;
        }
@@ -456,10 +397,84 @@ lp_build_mul(struct lp_build_context *bld,
        assert(0);
     }
  
-   if(LLVMIsConstant(a) && LLVMIsConstant(b))
-      return LLVMConstMul(a, b);
+   if(type.fixed)
+      shift = lp_build_int_const_scalar(type, type.width/2);
+   else
+      shift = NULL;
+
+   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
+      res =  LLVMConstMul(a, b);
+      if(shift) {
+         if(type.sign)
+            res = LLVMConstAShr(res, shift);
+         else
+            res = LLVMConstLShr(res, shift);
+      }
+   }
+   else {
+      res = LLVMBuildMul(bld->builder, a, b, "");
+      if(shift) {
+         if(type.sign)
+            res = LLVMBuildAShr(bld->builder, res, shift, "");
+         else
+            res = LLVMBuildLShr(bld->builder, res, shift, "");
+      }
+   }
+
+   return res;
+}
+
+
+/**
+ * Small vector x scale multiplication optimization.
+ */
+LLVMValueRef
+lp_build_mul_imm(struct lp_build_context *bld,
+                 LLVMValueRef a,
+                 int b)
+{
+   LLVMValueRef factor;
+
+   if(b == 0)
+      return bld->zero;
  
-   return LLVMBuildMul(bld->builder, a, b, "");
+   if(b == 1)
+      return a;
+
+   if(b == -1)
+      return LLVMBuildNeg(bld->builder, a, "");
+
+   if(b == 2 && bld->type.floating)
+      return lp_build_add(bld, a, a);
+
+   if(util_is_pot(b)) {
+      unsigned shift = ffs(b) - 1;
+
+      if(bld->type.floating) {
+#if 0
+         /*
+          * Power of two multiplication by directly manipulating the mantissa.
+          *
+          * XXX: This might not be always faster, it will introduce a small error
+          * for multiplication by zero, and it will produce wrong results
+          * for Inf and NaN.
+          */
+         unsigned mantissa = lp_mantissa(bld->type);
+         factor = lp_build_int_const_scalar(bld->type, (unsigned long long)shift << mantissa);
+         a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
+         a = LLVMBuildAdd(bld->builder, a, factor, "");
+         a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
+         return a;
+#endif
+      }
+      else {
+         factor = lp_build_const_scalar(bld->type, shift);
+         return LLVMBuildShl(bld->builder, a, factor, "");
+      }
+   }
+
+   factor = lp_build_const_scalar(bld->type, (double)b);
+   return lp_build_mul(bld, a, factor);
  }
  
  
@@ -494,13 +509,36 @@ lp_build_div(struct lp_build_context *bld,
  }
  
  
+/**
+ * Linear interpolation.
+ *
+ * This also works for integer values with a few caveats.
+ *
+ * @sa http://www.stereopsis.com/doubleblend.html
+ */
  LLVMValueRef
  lp_build_lerp(struct lp_build_context *bld,
                LLVMValueRef x,
                LLVMValueRef v0,
                LLVMValueRef v1)
  {
-   return lp_build_add(bld, v0, lp_build_mul(bld, x, lp_build_sub(bld, v1, v0)));
+   LLVMValueRef delta;
+   LLVMValueRef res;
+
+   delta = lp_build_sub(bld, v1, v0);
+
+   res = lp_build_mul(bld, x, delta);
+
+   res = lp_build_add(bld, v0, res);
+
+   if(bld->type.fixed)
+      /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
+       * but it will be wrong for other uses. Basically we need a more
+       * powerful lp_type, capable of further distinguishing the values
+       * interpretation from the value storage. */
+      res = LLVMBuildAnd(bld->builder, res, lp_build_int_const_scalar(bld->type, (1 << bld->type.width/2) - 1), "");
+
+   return res;
  }
  
  
@@ -1046,7 +1084,7 @@ lp_build_log(struct lp_build_context *bld,
               LLVMValueRef x)
  {
     /* log(2) */
-   LLVMValueRef log2 = lp_build_const_scalar(bld->type, 1.4426950408889634);
+   LLVMValueRef log2 = lp_build_const_scalar(bld->type, 0.69314718055994529);
  
     return lp_build_mul(bld, log2, lp_build_exp2(bld, x));
  }
@@ -1058,7 +1096,7 @@ lp_build_log(struct lp_build_context *bld,
  
  /**
   * Generate polynomial.
- * Ex:  x^2 * coeffs[0] + x * coeffs[1] + coeffs[2].
+ * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
   */
  static LLVMValueRef
  lp_build_polynomial(struct lp_build_context *bld,
@@ -1248,13 +1286,13 @@ lp_build_log2_approx(struct lp_build_context *bld,
        /* mant = (float) mantissa(x) */
        mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
        mant = LLVMBuildOr(bld->builder, mant, one, "");
-      mant = LLVMBuildSIToFP(bld->builder, mant, vec_type, "");
+      mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
  
        logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
                                      Elements(lp_build_log2_polynomial));
  
        /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
-      logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildMul(bld->builder, mant, bld->one, ""), "");
+      logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");
  
        res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
     }