From 7eb504019731368fd55f01e0264b195d4f99ae93 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Jos=C3=A9=20Fonseca?= Date: Fri, 31 Aug 2012 17:01:50 +0100 Subject: [PATCH] gallivm,llvmpipe: Use 4-wide vectors on AMD Bulldozer. 8-wide vectors are slower. Reviewed-by: Roland Scheidegger --- src/gallium/auxiliary/gallivm/lp_bld_init.c | 10 +++++++++- src/gallium/auxiliary/util/u_cpu_detect.c | 5 +++++ src/gallium/auxiliary/util/u_cpu_detect.h | 1 + 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c index 068a2cd7915..ffbe3eaed2c 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_init.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c @@ -434,8 +434,16 @@ lp_build_init(void) util_cpu_detect(); + /* AMD Bulldozer AVX's throughput is the same as SSE2; and because using + * 8-wide vector needs more floating ops than 4-wide (due to padding), it is + * actually more efficient to use 4-wide vectors on this processor. + * + * See also: + * - http://www.anandtech.com/show/4955/the-bulldozer-review-amd-fx8150-tested/2 + */ if (HAVE_AVX && - util_cpu_caps.has_avx) { + util_cpu_caps.has_avx && + util_cpu_caps.has_intel) { lp_native_vector_width = 256; } else { /* Leave it at 128, even when no SIMD extensions are available. 
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c index 945f0b0a910..d7f0be40e3d 100644 --- a/src/gallium/auxiliary/util/u_cpu_detect.c +++ b/src/gallium/auxiliary/util/u_cpu_detect.c @@ -286,6 +286,11 @@ util_cpu_detect(void) util_cpu_caps.cacheline = cacheline; } + if (regs[1] == 0x756e6547 && regs[2] == 0x6c65746e && regs[3] == 0x49656e69) { + /* GenuineIntel */ + util_cpu_caps.has_intel = 1; + } + cpuid(0x80000000, regs); if (regs[0] >= 0x80000001) { diff --git a/src/gallium/auxiliary/util/u_cpu_detect.h b/src/gallium/auxiliary/util/u_cpu_detect.h index b44d9d9a0fe..acac6865849 100644 --- a/src/gallium/auxiliary/util/u_cpu_detect.h +++ b/src/gallium/auxiliary/util/u_cpu_detect.h @@ -52,6 +52,7 @@ struct util_cpu_caps { int x86_cpu_type; unsigned cacheline; + unsigned has_intel:1; unsigned has_tsc:1; unsigned has_mmx:1; unsigned has_mmx2:1; -- 2.30.2