llvmpipe: Emit SSE intrinsics based on runtime cpu capability check.
author José Fonseca <jfonseca@vmware.com>
Tue, 29 Sep 2009 12:58:58 +0000 (13:58 +0100)
committer José Fonseca <jfonseca@vmware.com>
Tue, 29 Sep 2009 12:59:16 +0000 (13:59 +0100)
Note that llvmpipe still doesn't run on just any processor yet: if you
don't have a recent processor with SSE4.1, you will still likely end up
hitting a code path for which a generic non-SSE4.1 version is not
implemented yet.

src/gallium/drivers/llvmpipe/lp_bld_arit.c
src/gallium/drivers/llvmpipe/lp_bld_conv.c
src/gallium/drivers/llvmpipe/lp_bld_logic.c
src/gallium/drivers/llvmpipe/lp_jit.c

index 31433318a7e0f73936eebcca33e6acab8ab8c1d0..e8c5fa3c2a13737783dead29346a4e0a4c28addc 100644 (file)
@@ -48,6 +48,7 @@
 #include "util/u_memory.h"
 #include "util/u_debug.h"
 #include "util/u_string.h"
+#include "util/u_cpu_detect.h"
 
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
@@ -119,30 +120,28 @@ lp_build_max_simple(struct lp_build_context *bld,
 
    /* TODO: optimize the constant case */
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    if(type.width * type.length == 128) {
       if(type.floating) {
-         if(type.width == 32)
+         if(type.width == 32 && util_cpu_caps.has_sse)
             intrinsic = "llvm.x86.sse.max.ps";
-         if(type.width == 64)
+         if(type.width == 64 && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.max.pd";
       }
       else {
-         if(type.width == 8 && !type.sign)
+         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.pmaxu.b";
-         if(type.width == 8 && type.sign)
+         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pmaxsb";
-         if(type.width == 16 && !type.sign)
+         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pmaxuw";
-         if(type.width == 16 && type.sign)
+         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.pmaxs.w";
-         if(type.width == 32 && !type.sign)
+         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pmaxud";
-         if(type.width == 32 && type.sign)
+         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pmaxsd";
       }
    }
-#endif
 
    if(intrinsic)
       return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
@@ -204,15 +203,14 @@ lp_build_add(struct lp_build_context *bld,
       if(a == bld->one || b == bld->one)
         return bld->one;
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-      if(type.width * type.length == 128 &&
+      if(util_cpu_caps.has_sse2 &&
+         type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
          if(type.width == 8)
             intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
          if(type.width == 16)
             intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
       }
-#endif
    
       if(intrinsic)
          return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
@@ -257,15 +255,14 @@ lp_build_sub(struct lp_build_context *bld,
       if(b == bld->one)
         return bld->zero;
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-      if(type.width * type.length == 128 &&
+      if(util_cpu_caps.has_sse2 &&
+         type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
          if(type.width == 8)
             intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
          if(type.width == 16)
             intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
       }
-#endif
    
       if(intrinsic)
          return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
@@ -419,8 +416,7 @@ lp_build_mul(struct lp_build_context *bld,
       return bld->undef;
 
    if(!type.floating && !type.fixed && type.norm) {
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-      if(type.width == 8 && type.length == 16) {
+      if(util_cpu_caps.has_sse2 && type.width == 8 && type.length == 16) {
          LLVMTypeRef i16x8 = LLVMVectorType(LLVMInt16Type(), 8);
          LLVMTypeRef i8x16 = LLVMVectorType(LLVMInt8Type(), 16);
          static LLVMValueRef ml = NULL;
@@ -456,7 +452,6 @@ lp_build_mul(struct lp_build_context *bld,
          
          return ab;
       }
-#endif
 
       /* FIXME */
       assert(0);
@@ -493,10 +488,8 @@ lp_build_div(struct lp_build_context *bld,
    if(LLVMIsConstant(a) && LLVMIsConstant(b))
       return LLVMConstFDiv(a, b);
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(type.width == 32 && type.length == 4)
+   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
-#endif
 
    return LLVMBuildFDiv(bld->builder, a, b, "");
 }
@@ -606,8 +599,7 @@ lp_build_abs(struct lp_build_context *bld,
       return a;
    }
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(type.width*type.length == 128) {
+   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
       switch(type.width) {
       case 8:
          return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
@@ -617,7 +609,6 @@ lp_build_abs(struct lp_build_context *bld,
          return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
       }
    }
-#endif
 
    return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
 }
@@ -710,9 +701,8 @@ lp_build_round(struct lp_build_context *bld,
 
    assert(type.floating);
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
-#endif
+   if(util_cpu_caps.has_sse4_1)
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
 
    /* FIXME */
    assert(0);
@@ -728,9 +718,8 @@ lp_build_floor(struct lp_build_context *bld,
 
    assert(type.floating);
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
-#endif
+   if(util_cpu_caps.has_sse4_1)
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
 
    /* FIXME */
    assert(0);
@@ -746,9 +735,8 @@ lp_build_ceil(struct lp_build_context *bld,
 
    assert(type.floating);
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
-#endif
+   if(util_cpu_caps.has_sse4_1)
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
 
    /* FIXME */
    assert(0);
@@ -764,9 +752,8 @@ lp_build_trunc(struct lp_build_context *bld,
 
    assert(type.floating);
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
-#endif
+   if(util_cpu_caps.has_sse4_1)
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
 
    /* FIXME */
    assert(0);
@@ -837,11 +824,9 @@ lp_build_rcp(struct lp_build_context *bld,
    if(LLVMIsConstant(a))
       return LLVMConstFDiv(bld->one, a);
 
-   /* XXX: is this really necessary? */
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(type.width == 32 && type.length == 4)
+   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
+      /* FIXME: improve precision */
       return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
-#endif
 
    return LLVMBuildFDiv(bld->builder, bld->one, a, "");
 }
@@ -858,11 +843,8 @@ lp_build_rsqrt(struct lp_build_context *bld,
 
    assert(type.floating);
 
-   /* XXX: is this really necessary? */
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(type.width == 32 && type.length == 4)
+   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
       return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
-#endif
 
    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
 }
index 186cac70f62a07c817ef89711d3dd883dcf577e4..20c8710214b9408d0b5ac269a303335466b1bc44 100644 (file)
@@ -63,6 +63,7 @@
 
 #include "util/u_debug.h"
 #include "util/u_math.h"
+#include "util/u_cpu_detect.h"
 
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
@@ -334,8 +335,7 @@ lp_build_pack2(LLVMBuilderRef builder,
    assert(!src_type.floating);
    assert(!dst_type.floating);
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(src_type.width * src_type.length == 128) {
+   if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) {
       /* All X86 non-interleaved pack instructions all take signed inputs and
        * saturate them, so saturate beforehand. */
       if(!src_type.sign && !clamped) {
@@ -349,7 +349,7 @@ lp_build_pack2(LLVMBuilderRef builder,
 
       switch(src_type.width) {
       case 32:
-         if(dst_type.sign)
+         if(dst_type.sign || !util_cpu_caps.has_sse4_1)
             res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi);
          else
             /* PACKUSDW is the only instrinsic with a consistent signature */
@@ -372,7 +372,6 @@ lp_build_pack2(LLVMBuilderRef builder,
       res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
       return res;
    }
-#endif
 
    lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
    hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");
index 6b6f82076976bd4b8d54d03e3fbc951ad7066eec..db22a8028a6a2306bd8d5e733169abe51e94e998 100644 (file)
@@ -33,6 +33,8 @@
  */
 
 
+#include "util/u_cpu_detect.h"
+
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_intr.h"
@@ -65,7 +67,7 @@ lp_build_cmp(struct lp_build_context *bld,
 
 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    if(type.width * type.length == 128) {
-      if(type.floating) {
+      if(type.floating && util_cpu_caps.has_sse) {
          LLVMValueRef args[3];
          unsigned cc;
          boolean swap;
@@ -114,7 +116,7 @@ lp_build_cmp(struct lp_build_context *bld,
          res = LLVMBuildBitCast(bld->builder, res, int_vec_type, "");
          return res;
       }
-      else {
+      else if(util_cpu_caps.has_sse2) {
          static const struct {
             unsigned swap:1;
             unsigned eq:1;
index f7111c1e5cf888fcbdbcd03507f600b60a3e1d90..5d2cf01e5bcbb459b15dbe3818882cc3b1aa5d17 100644 (file)
@@ -36,6 +36,7 @@
 #include <llvm-c/Transforms/Scalar.h>
 
 #include "util/u_memory.h"
+#include "util/u_cpu_detect.h"
 #include "lp_screen.h"
 #include "lp_bld_intr.h"
 #include "lp_jit.h"
@@ -147,6 +148,8 @@ lp_jit_screen_init(struct llvmpipe_screen *screen)
 {
    char *error = NULL;
 
+   util_cpu_detect();
+
 #ifdef LLVM_NATIVE_ARCH
    LLVMLinkInJIT();
    LLVMInitializeNativeTarget();