gallivm: faster iround implementation for sse2

author Roland Scheidegger <sroland@vmware.com>

Fri, 8 Oct 2010 16:38:25 +0000 (18:38 +0200)

committer Roland Scheidegger <sroland@vmware.com>

Fri, 8 Oct 2010 22:36:37 +0000 (00:36 +0200)
author Roland Scheidegger <sroland@vmware.com>
Fri, 8 Oct 2010 16:38:25 +0000 (18:38 +0200)
committer Roland Scheidegger <sroland@vmware.com>
Fri, 8 Oct 2010 22:36:37 +0000 (00:36 +0200)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c

index 4f108f6e8185c08b248b98108169a7af791dc500..6ab13506e1553372934d356200d2c43025b49ff2 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -1053,6 +1053,54 @@ lp_build_round_sse41(struct lp_build_context *bld,
  }
  
  
+static INLINE LLVMValueRef
+lp_build_iround_nearest_sse2(struct lp_build_context *bld,
+                             LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef i32t = LLVMInt32Type();
+   LLVMTypeRef ret_type = lp_build_int_vec_type(type);
+   const char *intrinsic;
+   LLVMValueRef res;
+   /* Convert float value(s) a to integer(s) with the x86 cvtss2si/cvtps2dq intrinsics; returns the result. */
+   assert(type.floating);
+   /* Only 32-bit floats are handled; using the double precision conversions is a bit more complicated. */
+   assert(type.width == 32);
+
+   assert(lp_check_value(type, a));
+   assert(util_cpu_caps.has_sse2);
+
+   /* Both conversions below rely on the MXCSR rounding mode, which should always be nearest. */
+   if (type.length == 1) {
+      LLVMTypeRef vec_type;
+      LLVMValueRef undef;
+      LLVMValueRef arg;
+      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
+      /* Scalar case: a is passed to the intrinsic as element 0 of an otherwise undefined 4-element vector. */
+      vec_type = LLVMVectorType(bld->elem_type, 4);
+
+      intrinsic = "llvm.x86.sse.cvtss2si";
+
+      undef = LLVMGetUndef(vec_type);
+
+      arg = LLVMBuildInsertElement(bld->builder, undef, a, index0, "");
+
+      res = lp_build_intrinsic_unary(bld->builder, intrinsic,
+                                     ret_type, arg);
+   }
+   else {
+      assert(type.width*type.length == 128);
+      /* Vector case: the full 128-bit value a is handed to the packed conversion unchanged. */
+      intrinsic = "llvm.x86.sse2.cvtps2dq";
+
+      res = lp_build_intrinsic_unary(bld->builder, intrinsic,
+                                     ret_type, a);
+   }
+
+   return res;
+}
+
+
  /**
   * Return the integer part of a float (vector) value (== round toward zero).
   * The returned value is a float (vector).
@@ -1217,7 +1265,11 @@ lp_build_iround(struct lp_build_context *bld,
  
     assert(lp_check_value(type, a));
  
-   if (util_cpu_caps.has_sse4_1 &&
+   if (util_cpu_caps.has_sse2 &&
+       ((type.width == 32) && (type.length == 1 || type.length == 4))) {
+      return lp_build_iround_nearest_sse2(bld, a);
+   }
+   else if (util_cpu_caps.has_sse4_1 &&
         (type.length == 1 || type.width*type.length == 128)) {
        res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
     }
author	Roland Scheidegger <sroland@vmware.com>
	Fri, 8 Oct 2010 16:38:25 +0000 (18:38 +0200)
committer	Roland Scheidegger <sroland@vmware.com>
	Fri, 8 Oct 2010 22:36:37 +0000 (00:36 +0200)