llvmpipe: Emit SSE intrinsics based on runtime cpu capability check.

[mesa.git] / src / gallium / drivers / llvmpipe / lp_bld_arit.c
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_arit.c b/src/gallium/drivers/llvmpipe/lp_bld_arit.c

index 09a57ff33d51e63ccfdeaf55fdc61bf2454fd093..e8c5fa3c2a13737783dead29346a4e0a4c28addc 100644 (file)
--- a/src/gallium/drivers/llvmpipe/lp_bld_arit.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_arit.c
@@ -48,6 +48,7 @@
  #include "util/u_memory.h"
  #include "util/u_debug.h"
  #include "util/u_string.h"
+#include "util/u_cpu_detect.h"
  
  #include "lp_bld_type.h"
  #include "lp_bld_const.h"
@@ -65,7 +66,7 @@ lp_build_min_simple(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef b)
  {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
     const char *intrinsic = NULL;
     LLVMValueRef cond;
  
@@ -113,36 +114,34 @@ lp_build_max_simple(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef b)
  {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
     const char *intrinsic = NULL;
     LLVMValueRef cond;
  
     /* TODO: optimize the constant case */
  
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
     if(type.width * type.length == 128) {
        if(type.floating) {
-         if(type.width == 32)
+         if(type.width == 32 && util_cpu_caps.has_sse)
              intrinsic = "llvm.x86.sse.max.ps";
-         if(type.width == 64)
+         if(type.width == 64 && util_cpu_caps.has_sse2)
              intrinsic = "llvm.x86.sse2.max.pd";
        }
        else {
-         if(type.width == 8 && !type.sign)
+         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
              intrinsic = "llvm.x86.sse2.pmaxu.b";
-         if(type.width == 8 && type.sign)
+         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
              intrinsic = "llvm.x86.sse41.pmaxsb";
-         if(type.width == 16 && !type.sign)
+         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
              intrinsic = "llvm.x86.sse41.pmaxuw";
-         if(type.width == 16 && type.sign)
+         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
              intrinsic = "llvm.x86.sse2.pmaxs.w";
-         if(type.width == 32 && !type.sign)
+         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
              intrinsic = "llvm.x86.sse41.pmaxud";
-         if(type.width == 32 && type.sign)
+         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
              intrinsic = "llvm.x86.sse41.pmaxsd";
        }
     }
-#endif
  
     if(intrinsic)
        return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
@@ -159,7 +158,7 @@ LLVMValueRef
  lp_build_comp(struct lp_build_context *bld,
                LLVMValueRef a)
  {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
  
     if(a == bld->one)
        return bld->zero;
@@ -188,7 +187,7 @@ lp_build_add(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef b)
  {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
     LLVMValueRef res;
  
     if(a == bld->zero)
@@ -204,15 +203,14 @@ lp_build_add(struct lp_build_context *bld,
        if(a == bld->one || b == bld->one)
          return bld->one;
  
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-      if(type.width * type.length == 128 &&
+      if(util_cpu_caps.has_sse2 &&
+         type.width * type.length == 128 &&
           !type.floating && !type.fixed) {
           if(type.width == 8)
              intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
           if(type.width == 16)
              intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
        }
-#endif
     
        if(intrinsic)
           return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
@@ -241,7 +239,7 @@ lp_build_sub(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef b)
  {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
     LLVMValueRef res;
  
     if(b == bld->zero)
@@ -257,15 +255,14 @@ lp_build_sub(struct lp_build_context *bld,
        if(b == bld->one)
          return bld->zero;
  
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-      if(type.width * type.length == 128 &&
+      if(util_cpu_caps.has_sse2 &&
+         type.width * type.length == 128 &&
           !type.floating && !type.fixed) {
           if(type.width == 8)
              intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
           if(type.width == 16)
              intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
        }
-#endif
     
        if(intrinsic)
           return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
@@ -405,7 +402,7 @@ lp_build_mul(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef b)
  {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
  
     if(a == bld->zero)
        return bld->zero;
@@ -419,8 +416,7 @@ lp_build_mul(struct lp_build_context *bld,
        return bld->undef;
  
     if(!type.floating && !type.fixed && type.norm) {
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-      if(type.width == 8 && type.length == 16) {
+      if(util_cpu_caps.has_sse2 && type.width == 8 && type.length == 16) {
           LLVMTypeRef i16x8 = LLVMVectorType(LLVMInt16Type(), 8);
           LLVMTypeRef i8x16 = LLVMVectorType(LLVMInt8Type(), 16);
           static LLVMValueRef ml = NULL;
@@ -456,7 +452,6 @@ lp_build_mul(struct lp_build_context *bld,
           
           return ab;
        }
-#endif
  
        /* FIXME */
        assert(0);
@@ -477,7 +472,7 @@ lp_build_div(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef b)
  {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
  
     if(a == bld->zero)
        return bld->zero;
@@ -493,15 +488,38 @@ lp_build_div(struct lp_build_context *bld,
     if(LLVMIsConstant(a) && LLVMIsConstant(b))
        return LLVMConstFDiv(a, b);
  
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(type.width == 32 && type.length == 4)
+   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
        return lp_build_mul(bld, a, lp_build_rcp(bld, b));
-#endif
  
     return LLVMBuildFDiv(bld->builder, a, b, "");
  }
  
  
+/**
+ * Linear interpolation between v0 and v1 with weight x.
+ *
+ * Emits v0 + x*(v1 - v0) through the context's sub/mul/add builders, so
+ * any special-casing those helpers perform applies here as well.
+ */
+LLVMValueRef
+lp_build_lerp(struct lp_build_context *bld,
+              LLVMValueRef x,
+              LLVMValueRef v0,
+              LLVMValueRef v1)
+{
+   LLVMValueRef delta = lp_build_sub(bld, v1, v0);
+   LLVMValueRef scaled = lp_build_mul(bld, x, delta);
+
+   return lp_build_add(bld, v0, scaled);
+}
+
+
+/**
+ * Bilinear interpolation.
+ *
+ * Interpolates with weight x between v00/v01 and between v10/v11, then
+ * interpolates those two results with weight y.
+ */
+LLVMValueRef
+lp_build_lerp_2d(struct lp_build_context *bld,
+                 LLVMValueRef x,
+                 LLVMValueRef y,
+                 LLVMValueRef v00,
+                 LLVMValueRef v01,
+                 LLVMValueRef v10,
+                 LLVMValueRef v11)
+{
+   LLVMValueRef lerp_x0;
+   LLVMValueRef lerp_x1;
+
+   /* Two interpolations along x, emitted in this order ... */
+   lerp_x0 = lp_build_lerp(bld, x, v00, v01);
+   lerp_x1 = lp_build_lerp(bld, x, v10, v11);
+
+   /* ... followed by one along y. */
+   return lp_build_lerp(bld, y, lerp_x0, lerp_x1);
+}
+
+
  /**
   * Generate min(a, b)
   * Do checks for special cases.
@@ -565,33 +583,216 @@ LLVMValueRef
  lp_build_abs(struct lp_build_context *bld,
               LLVMValueRef a)
  {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
  
     if(!type.sign)
        return a;
  
-   /* XXX: is this really necessary? */
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(!type.floating && type.width*type.length == 128) {
-      LLVMTypeRef vec_type = lp_build_vec_type(type);
-      if(type.width == 8)
+   if(type.floating) {
+      /* Mask out the sign bit */
+      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+      LLVMValueRef mask = lp_build_int_const_scalar(type, ((unsigned long long)1 << type.width) - 1);
+      a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      a = LLVMBuildAnd(bld->builder, a, mask, "");
+      a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
+      return a;
+   }
+
+   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
+      switch(type.width) {
+      case 8:
           return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
-      if(type.width == 16)
+      case 16:
           return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
-      if(type.width == 32)
+      case 32:
           return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
+      }
     }
-#endif
  
     return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
  }
  
  
+/**
+ * Generate sign(a).
+ *
+ * Per element the result is bld->zero where a equals zero; otherwise it
+ * is bld->one for unsigned types, +/-1 carrying the sign bit of a for
+ * floating point types, and one or minus one for signed integer types
+ * depending on whether a is greater than zero.
+ */
+LLVMValueRef
+lp_build_sgn(struct lp_build_context *bld,
+             LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   LLVMValueRef cond;
+   LLVMValueRef res;
+
+   /* Handle non-zero case */
+   if(!type.sign) {
+      /* if not zero then sign must be positive */
+      res = bld->one;
+   }
+   else if(type.floating) {
+      /* Take the sign bit and add it to 1 constant */
+      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+      LLVMValueRef sign;
+      LLVMValueRef one;
+      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+      one = LLVMConstBitCast(bld->one, int_vec_type);
+      res = LLVMBuildOr(bld->builder, sign, one, "");
+      res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
+   }
+   else
+   {
+      LLVMValueRef minus_one = lp_build_const_scalar(type, -1.0);
+      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
+      res = lp_build_select(bld, cond, bld->one, minus_one);
+   }
+
+   /* Handle zero: keep the sign computed above for every non-zero
+    * element instead of overwriting it with one. */
+   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
+   res = lp_build_select(bld, cond, bld->zero, res);
+
+   return res;
+}
+
+
+/**
+ * Rounding modes accepted by lp_build_round_sse41().
+ *
+ * The numeric value is passed unchanged as the 32-bit constant operand of
+ * the llvm.x86.sse41.round.* intrinsics.
+ * NOTE(review): the values look like the SSE4.1 ROUNDPS/ROUNDPD
+ * rounding-control encoding -- confirm against the Intel manual before
+ * renumbering.
+ */
+enum lp_build_round_sse41_mode
+{
+   LP_BUILD_ROUND_SSE41_NEAREST = 0,
+   LP_BUILD_ROUND_SSE41_FLOOR = 1,
+   LP_BUILD_ROUND_SSE41_CEIL = 2,
+   LP_BUILD_ROUND_SSE41_TRUNCATE = 3
+};
+
+
+/**
+ * Emit a call to the SSE4.1 packed round intrinsic.
+ *
+ * The context type must be floating point and exactly 128 bits wide;
+ * 32-bit elements use round.ps and 64-bit elements use round.pd.  Any
+ * other element width asserts and yields bld->undef.  The mode is passed
+ * to the intrinsic as a 32-bit integer constant.
+ */
+static INLINE LLVMValueRef
+lp_build_round_sse41(struct lp_build_context *bld,
+                     LLVMValueRef a,
+                     enum lp_build_round_sse41_mode mode)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   LLVMValueRef mode_const;
+   const char *intrinsic;
+
+   assert(type.floating);
+   assert(type.width*type.length == 128);
+
+   if(type.width == 32) {
+      intrinsic = "llvm.x86.sse41.round.ps";
+   }
+   else if(type.width == 64) {
+      intrinsic = "llvm.x86.sse41.round.pd";
+   }
+   else {
+      assert(0);
+      return bld->undef;
+   }
+
+   mode_const = LLVMConstInt(LLVMInt32Type(), mode, 0);
+
+   return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
+                                    mode_const);
+}
+
+
+/**
+ * Round using the SSE4.1 round intrinsic in NEAREST mode.
+ *
+ * Only floating point types are accepted.  Without SSE4.1 there is no
+ * implementation yet: the function asserts and returns bld->undef.
+ */
+LLVMValueRef
+lp_build_round(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   assert(bld->type.floating);
+
+   if(!util_cpu_caps.has_sse4_1) {
+      /* FIXME */
+      assert(0);
+      return bld->undef;
+   }
+
+   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
+}
+
+
+/**
+ * Round using the SSE4.1 round intrinsic in FLOOR mode.
+ *
+ * Only floating point types are accepted.  Without SSE4.1 there is no
+ * implementation yet: the function asserts and returns bld->undef.
+ */
+LLVMValueRef
+lp_build_floor(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   assert(bld->type.floating);
+
+   if(!util_cpu_caps.has_sse4_1) {
+      /* FIXME */
+      assert(0);
+      return bld->undef;
+   }
+
+   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
+}
+
+
+/**
+ * Round using the SSE4.1 round intrinsic in CEIL mode.
+ *
+ * Only floating point types are accepted.  Without SSE4.1 there is no
+ * implementation yet: the function asserts and returns bld->undef.
+ */
+LLVMValueRef
+lp_build_ceil(struct lp_build_context *bld,
+              LLVMValueRef a)
+{
+   assert(bld->type.floating);
+
+   if(!util_cpu_caps.has_sse4_1) {
+      /* FIXME */
+      assert(0);
+      return bld->undef;
+   }
+
+   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
+}
+
+
+/**
+ * Round using the SSE4.1 round intrinsic in TRUNCATE mode.
+ *
+ * Only floating point types are accepted.  Without SSE4.1 there is no
+ * implementation yet: the function asserts and returns bld->undef.
+ */
+LLVMValueRef
+lp_build_trunc(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   assert(bld->type.floating);
+
+   if(!util_cpu_caps.has_sse4_1) {
+      /* FIXME */
+      assert(0);
+      return bld->undef;
+   }
+
+   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
+}
+
+
+/**
+ * Convert a floating point value to a signed integer using whichever
+ * rounding method is fastest, typically truncation toward zero.
+ *
+ * The result has the integer vector type that corresponds to the
+ * context's type.
+ */
+LLVMValueRef
+lp_build_int(struct lp_build_context *bld,
+             LLVMValueRef a)
+{
+   assert(bld->type.floating);
+
+   return LLVMBuildFPToSI(bld->builder, a,
+                          lp_build_int_vec_type(bld->type), "");
+}
+
+
+/**
+ * Floor the value with lp_build_floor() and convert the result to
+ * integer with lp_build_int().
+ */
+LLVMValueRef
+lp_build_ifloor(struct lp_build_context *bld,
+                LLVMValueRef a)
+{
+   LLVMValueRef floored = lp_build_floor(bld, a);
+   LLVMValueRef as_int = lp_build_int(bld, floored);
+
+   return as_int;
+}
+
+
  LLVMValueRef
  lp_build_sqrt(struct lp_build_context *bld,
                LLVMValueRef a)
  {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
     LLVMTypeRef vec_type = lp_build_vec_type(type);
     char intrinsic[32];
  
@@ -609,7 +810,7 @@ LLVMValueRef
  lp_build_rcp(struct lp_build_context *bld,
               LLVMValueRef a)
  {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
  
     if(a == bld->zero)
        return bld->undef;
@@ -623,11 +824,9 @@ lp_build_rcp(struct lp_build_context *bld,
     if(LLVMIsConstant(a))
        return LLVMConstFDiv(bld->one, a);
  
-   /* XXX: is this really necessary? */
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(type.width == 32 && type.length == 4)
+   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
+      /* FIXME: improve precision */
        return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
-#endif
  
     return LLVMBuildFDiv(bld->builder, bld->one, a, "");
  }
@@ -640,15 +839,12 @@ LLVMValueRef
  lp_build_rsqrt(struct lp_build_context *bld,
                 LLVMValueRef a)
  {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
  
     assert(type.floating);
  
-   /* XXX: is this really necessary? */
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(type.width == 32 && type.length == 4)
+   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
        return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
-#endif
  
     return lp_build_rcp(bld, lp_build_sqrt(bld, a));
  }
@@ -661,7 +857,7 @@ LLVMValueRef
  lp_build_cos(struct lp_build_context *bld,
                LLVMValueRef a)
  {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
     LLVMTypeRef vec_type = lp_build_vec_type(type);
     char intrinsic[32];
  
@@ -681,7 +877,7 @@ LLVMValueRef
  lp_build_sin(struct lp_build_context *bld,
                LLVMValueRef a)
  {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
     LLVMTypeRef vec_type = lp_build_vec_type(type);
     char intrinsic[32];
  
@@ -704,7 +900,8 @@ lp_build_pow(struct lp_build_context *bld,
  {
     /* TODO: optimize the constant case */
     if(LLVMIsConstant(x) && LLVMIsConstant(y))
-      debug_printf("%s: inefficient/imprecise constant arithmetic\n");
+      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                   __FUNCTION__);
  
     return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
  }
@@ -752,13 +949,14 @@ lp_build_polynomial(struct lp_build_context *bld,
                      const double *coeffs,
                      unsigned num_coeffs)
  {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
     LLVMValueRef res = NULL;
     unsigned i;
  
     /* TODO: optimize the constant case */
     if(LLVMIsConstant(x))
-      debug_printf("%s: inefficient/imprecise constant arithmetic\n");
+      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                   __FUNCTION__);
  
     for (i = num_coeffs; i--; ) {
        LLVMValueRef coeff = lp_build_const_scalar(type, coeffs[i]);
@@ -800,7 +998,7 @@ lp_build_exp2_approx(struct lp_build_context *bld,
                       LLVMValueRef *p_frac_part,
                       LLVMValueRef *p_exp2)
  {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
     LLVMTypeRef vec_type = lp_build_vec_type(type);
     LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
     LLVMValueRef ipart = NULL;
@@ -812,7 +1010,8 @@ lp_build_exp2_approx(struct lp_build_context *bld,
     if(p_exp2_int_part || p_frac_part || p_exp2) {
        /* TODO: optimize the constant case */
        if(LLVMIsConstant(x))
-         debug_printf("%s: inefficient/imprecise constant arithmetic\n");
+         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                      __FUNCTION__);
  
        assert(type.floating && type.width == 32);
  
@@ -893,7 +1092,7 @@ lp_build_log2_approx(struct lp_build_context *bld,
                       LLVMValueRef *p_floor_log2,
                       LLVMValueRef *p_log2)
  {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
     LLVMTypeRef vec_type = lp_build_vec_type(type);
     LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
  
@@ -911,7 +1110,8 @@ lp_build_log2_approx(struct lp_build_context *bld,
     if(p_exp || p_floor_log2 || p_log2) {
        /* TODO: optimize the constant case */
        if(LLVMIsConstant(x))
-         debug_printf("%s: inefficient/imprecise constant arithmetic\n");
+         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                      __FUNCTION__);
  
        assert(type.floating && type.width == 32);