gallivm: use f16c hw support for float->half and half->float conversion

author Roland Scheidegger <sroland@vmware.com>

Tue, 2 Apr 2013 23:06:52 +0000 (01:06 +0200)

committer Roland Scheidegger <sroland@vmware.com>

Wed, 3 Apr 2013 23:03:42 +0000 (01:03 +0200)
author Roland Scheidegger <sroland@vmware.com>
Tue, 2 Apr 2013 23:06:52 +0000 (01:06 +0200)
committer Roland Scheidegger <sroland@vmware.com>
Wed, 3 Apr 2013 23:03:42 +0000 (01:03 +0200)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c

index 38a577cdba50bb3c167c81823a33a1e8d734477c..eb2d09638fa0e96bd423bdd3d9f1d93d17a19761 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -175,9 +175,24 @@ lp_build_half_to_float(struct gallivm_state *gallivm,
     struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
     struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
     LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
+   LLVMValueRef h;
+
+   if (util_cpu_caps.has_f16c && HAVE_LLVM >= 0x0301 &&
+       (src_length == 4 || src_length == 8)) {
+      const char *intrinsic = NULL;
+      if (src_length == 4) {
+         src = lp_build_pad_vector(gallivm, src, 8);
+         intrinsic = "llvm.x86.vcvtph2ps.128";
+      }
+      else {
+         intrinsic = "llvm.x86.vcvtph2ps.256";
+      }
+      return lp_build_intrinsic_unary(builder, intrinsic,
+                                      lp_build_vec_type(gallivm, f32_type), src);
+   }
  
     /* Convert int16 vector to int32 vector by zero ext (might generate bad code) */
-   LLVMValueRef h             = LLVMBuildZExt(builder, src, int_vec_type, "");
+   h = LLVMBuildZExt(builder, src, int_vec_type, "");
     return lp_build_smallfloat_to_float(gallivm, f32_type, h, 10, 5, 0, true);
  }
  
@@ -204,9 +219,31 @@ lp_build_float_to_half(struct gallivm_state *gallivm,
     struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
     LLVMValueRef result;
  
-   result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, true);
-   /* Convert int32 vector to int16 vector by trunc (might generate bad code) */
-   result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i16_type), "");
+   if (util_cpu_caps.has_f16c && HAVE_LLVM >= 0x0301 &&
+       (length == 4 || length == 8)) {
+      struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
+      unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */
+      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+      const char *intrinsic = NULL;
+      if (length == 4) {
+         intrinsic = "llvm.x86.vcvtps2ph.128";
+      }
+      else {
+         intrinsic = "llvm.x86.vcvtps2ph.256";
+      }
+      result = lp_build_intrinsic_binary(builder, intrinsic,
+                                         lp_build_vec_type(gallivm, i168_type),
+                                         src, LLVMConstInt(i32t, mode, 0));
+      if (length == 4) {
+         result = lp_build_extract_range(gallivm, result, 0, 4);
+      }
+   }
+
+   else {
+      result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, true);
+      /* Convert int32 vector to int16 vector by trunc (might generate bad code) */
+      result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i16_type), "");
+   }
  
     /*
      * Debugging code.
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c

index 050eba7b2b387e1c02d1c7a3376d0eabb86afba2..4fa5887e878a22d553d17a960690995814afb1ba 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -468,6 +468,15 @@ lp_build_init(void)
        util_cpu_caps.has_avx = 0;
     }
  
+   if (!HAVE_AVX) {
+      /*
+       * note these instructions are VEX-only, so can only emit if we use
+       * avx (don't want to base it on has_avx & has_f16c later as that would
+       * omit it unnecessarily on amd cpus, see above).
+       */
+      util_cpu_caps.has_f16c = 0;
+   }
+
  #ifdef PIPE_ARCH_PPC_64
     /* Set the NJ bit in VSCR to 0 so denormalized values are handled as
     * specified by IEEE standard (PowerISA 2.06 - Section 6.3). This guarantees
@@ -495,6 +504,7 @@ lp_build_init(void)
     util_cpu_caps.has_ssse3 = 0;
     util_cpu_caps.has_sse4_1 = 0;
     util_cpu_caps.has_avx = 0;
+   util_cpu_caps.has_f16c = 0;
  #endif
  }
  
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c

index 03280515be15e3467f03d7c12185feaa4ec35373..7e6df9df1574cd2297d6444795656a8a85ce4637 100644 (file)
--- a/src/gallium/auxiliary/util/u_cpu_detect.c
+++ b/src/gallium/auxiliary/util/u_cpu_detect.c
@@ -279,6 +279,7 @@ util_cpu_detect(void)
           util_cpu_caps.has_sse4_1 = (regs2[2] >> 19) & 1;
           util_cpu_caps.has_sse4_2 = (regs2[2] >> 20) & 1;
           util_cpu_caps.has_avx    = (regs2[2] >> 28) & 1;
+         util_cpu_caps.has_f16c   = (regs2[2] >> 29) & 1;
           util_cpu_caps.has_mmx2   = util_cpu_caps.has_sse; /* SSE cpus supports mmxext too */
  
           cacheline = ((regs2[1] >> 8) & 0xFF) * 8;
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.h b/src/gallium/auxiliary/util/u_cpu_detect.h

index acac68658493d38c25b4c6b3127c3fb73094297d..21c2f048ff443bde91036f0ed5245f85a4abfb20 100644 (file)
--- a/src/gallium/auxiliary/util/u_cpu_detect.h
+++ b/src/gallium/auxiliary/util/u_cpu_detect.h
@@ -63,6 +63,7 @@ struct util_cpu_caps {
     unsigned has_sse4_1:1;
     unsigned has_sse4_2:1;
     unsigned has_avx:1;
+   unsigned has_f16c:1;
     unsigned has_3dnow:1;
     unsigned has_3dnow_ext:1;
     unsigned has_altivec:1;
author	Roland Scheidegger <sroland@vmware.com>
	Tue, 2 Apr 2013 23:06:52 +0000 (01:06 +0200)
committer	Roland Scheidegger <sroland@vmware.com>
	Wed, 3 Apr 2013 23:03:42 +0000 (01:03 +0200)
src/gallium/auxiliary/gallivm/lp_bld_conv.c		patch \| blob \| history
src/gallium/auxiliary/gallivm/lp_bld_init.c		patch \| blob \| history
src/gallium/auxiliary/util/u_cpu_detect.c		patch \| blob \| history
src/gallium/auxiliary/util/u_cpu_detect.h		patch \| blob \| history