From 067a0ae42017f327edce7634890e699b4ec8492c Mon Sep 17 00:00:00 2001 From: Roland Scheidegger Date: Wed, 3 Apr 2013 01:06:52 +0200 Subject: [PATCH] gallivm: use f16c hw support for float->half and half->float conversion Should be way faster of course on cpus supporting this (includes AMD Bulldozer and Jaguar cores, Intel Ivy Bridge and up (except budget models)). Passes piglit fbo-blending-formats GL_ARB_texture_float -auto on Ivy Bridge. Reviewed-by: Brian Paul --- src/gallium/auxiliary/gallivm/lp_bld_conv.c | 45 +++++++++++++++++++-- src/gallium/auxiliary/gallivm/lp_bld_init.c | 10 +++++ src/gallium/auxiliary/util/u_cpu_detect.c | 1 + src/gallium/auxiliary/util/u_cpu_detect.h | 1 + 4 files changed, 53 insertions(+), 4 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c index 38a577cdba5..eb2d09638fa 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c @@ -175,9 +175,24 @@ lp_build_half_to_float(struct gallivm_state *gallivm, struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length); struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length); LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type); + LLVMValueRef h; + + if (util_cpu_caps.has_f16c && HAVE_LLVM >= 0x0301 && + (src_length == 4 || src_length == 8)) { + const char *intrinsic = NULL; + if (src_length == 4) { + src = lp_build_pad_vector(gallivm, src, 8); + intrinsic = "llvm.x86.vcvtph2ps.128"; + } + else { + intrinsic = "llvm.x86.vcvtph2ps.256"; + } + return lp_build_intrinsic_unary(builder, intrinsic, + lp_build_vec_type(gallivm, f32_type), src); + } /* Convert int16 vector to int32 vector by zero ext (might generate bad code) */ - LLVMValueRef h = LLVMBuildZExt(builder, src, int_vec_type, ""); + h = LLVMBuildZExt(builder, src, int_vec_type, ""); return lp_build_smallfloat_to_float(gallivm, f32_type, h, 10, 5, 0, true); } @@ -204,9 +219,31 @@ lp_build_float_to_half(struct gallivm_state *gallivm, struct lp_type i16_type = lp_type_int_vec(16, 16 * length); LLVMValueRef result; - result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, true); - /* Convert int32 vector to int16 vector by trunc (might generate bad code) */ - result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i16_type), ""); + if (util_cpu_caps.has_f16c && HAVE_LLVM >= 0x0301 && + (length == 4 || length == 8)) { + struct lp_type i168_type = lp_type_int_vec(16, 16 * 8); + unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */ + LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); + const char *intrinsic = NULL; + if (length == 4) { + intrinsic = "llvm.x86.vcvtps2ph.128"; + } + else { + intrinsic = "llvm.x86.vcvtps2ph.256"; + } + result = lp_build_intrinsic_binary(builder, intrinsic, + lp_build_vec_type(gallivm, i168_type), + src, LLVMConstInt(i32t, mode, 0)); + if (length == 4) { + result = lp_build_extract_range(gallivm, result, 0, 4); + } + } + + else { + result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, true); + /* Convert int32 vector to int16 vector by trunc (might generate bad code) */ + result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i16_type), ""); + } /* * Debugging code. diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c index 050eba7b2b3..4fa5887e878 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_init.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c @@ -468,6 +468,15 @@ lp_build_init(void) util_cpu_caps.has_avx = 0; } + if (!HAVE_AVX) { + /* + * note these instructions are VEX-only, so can only emit if we use + * avx (don't want to base it on has_avx & has_f16c later as that would + * omit it unnecessarily on amd cpus, see above). + */ + util_cpu_caps.has_f16c = 0; + } + #ifdef PIPE_ARCH_PPC_64 /* Set the NJ bit in VSCR to 0 so denormalized values are handled as * specified by IEEE standard (PowerISA 2.06 - Section 6.3). This garantees @@ -495,6 +504,7 @@ lp_build_init(void) util_cpu_caps.has_ssse3 = 0; util_cpu_caps.has_sse4_1 = 0; util_cpu_caps.has_avx = 0; + util_cpu_caps.has_f16c = 0; #endif } diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c index 03280515be1..7e6df9df157 100644 --- a/src/gallium/auxiliary/util/u_cpu_detect.c +++ b/src/gallium/auxiliary/util/u_cpu_detect.c @@ -279,6 +279,7 @@ util_cpu_detect(void) util_cpu_caps.has_sse4_1 = (regs2[2] >> 19) & 1; util_cpu_caps.has_sse4_2 = (regs2[2] >> 20) & 1; util_cpu_caps.has_avx = (regs2[2] >> 28) & 1; + util_cpu_caps.has_f16c = (regs2[2] >> 29) & 1; util_cpu_caps.has_mmx2 = util_cpu_caps.has_sse; /* SSE cpus supports mmxext too */ cacheline = ((regs2[1] >> 8) & 0xFF) * 8; diff --git a/src/gallium/auxiliary/util/u_cpu_detect.h b/src/gallium/auxiliary/util/u_cpu_detect.h index acac6865849..21c2f048ff4 100644 --- a/src/gallium/auxiliary/util/u_cpu_detect.h +++ b/src/gallium/auxiliary/util/u_cpu_detect.h @@ -63,6 +63,7 @@ struct util_cpu_caps { unsigned has_sse4_1:1; unsigned has_sse4_2:1; unsigned has_avx:1; + unsigned has_f16c:1; unsigned has_3dnow:1; unsigned has_3dnow_ext:1; unsigned has_altivec:1; -- 2.30.2