gallivm,llvmpipe: Use 4-wide vectors on AMD Bulldozer.

author José Fonseca <jfonseca@vmware.com>

Fri, 31 Aug 2012 16:01:50 +0000 (17:01 +0100)

committer José Fonseca <jfonseca@vmware.com>

Tue, 4 Sep 2012 07:49:00 +0000 (08:49 +0100)
author José Fonseca <jfonseca@vmware.com>
Fri, 31 Aug 2012 16:01:50 +0000 (17:01 +0100)
committer José Fonseca <jfonseca@vmware.com>
Tue, 4 Sep 2012 07:49:00 +0000 (08:49 +0100)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c

index 068a2cd791557201ed67b60e6ce374c6c36e2582..ffbe3eaed2cdc62d9d44942aec999cf4bbb58706 100644 (file)
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -434,8 +434,16 @@ lp_build_init(void)
  
     util_cpu_detect();
  
+   /* AMD Bulldozer's AVX throughput is the same as its SSE2 throughput, and
+    * because 8-wide vectors need more floating-point ops than 4-wide ones (due
+    * to padding), it is more efficient to use 4-wide vectors on this processor.
+    *
+    * See also:
+    * - http://www.anandtech.com/show/4955/the-bulldozer-review-amd-fx8150-tested/2
+    */
     if (HAVE_AVX &&
-       util_cpu_caps.has_avx) {
+       util_cpu_caps.has_avx &&
+       util_cpu_caps.has_intel) {
        lp_native_vector_width = 256;
     } else {
        /* Leave it at 128, even when no SIMD extensions are available.
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c

index 945f0b0a91050846a189c4c5f02988ffc5f3f7d7..d7f0be40e3dcddd26c285c81305189c110c2b027 100644 (file)
--- a/src/gallium/auxiliary/util/u_cpu_detect.c
+++ b/src/gallium/auxiliary/util/u_cpu_detect.c
@@ -286,6 +286,11 @@ util_cpu_detect(void)
              util_cpu_caps.cacheline = cacheline;
        }
  
+      if (regs[1] == 0x756e6547 && regs[2] == 0x6c65746e && regs[3] == 0x49656e69) {
+         /* GenuineIntel */
+         util_cpu_caps.has_intel = 1;
+      }
+
        cpuid(0x80000000, regs);
  
        if (regs[0] >= 0x80000001) {
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.h b/src/gallium/auxiliary/util/u_cpu_detect.h

index b44d9d9a0feef98cc51fd9f52e85b2245f762c83..acac68658493d38c25b4c6b3127c3fb73094297d 100644 (file)
--- a/src/gallium/auxiliary/util/u_cpu_detect.h
+++ b/src/gallium/auxiliary/util/u_cpu_detect.h
@@ -52,6 +52,7 @@ struct util_cpu_caps {
     int x86_cpu_type;
     unsigned cacheline;
  
+   unsigned has_intel:1;
     unsigned has_tsc:1;
     unsigned has_mmx:1;
     unsigned has_mmx2:1;
author	José Fonseca <jfonseca@vmware.com>
	Fri, 31 Aug 2012 16:01:50 +0000 (17:01 +0100)
committer	José Fonseca <jfonseca@vmware.com>
	Tue, 4 Sep 2012 07:49:00 +0000 (08:49 +0100)
src/gallium/auxiliary/gallivm/lp_bld_init.c		patch \| blob \| history
src/gallium/auxiliary/util/u_cpu_detect.c		patch \| blob \| history
src/gallium/auxiliary/util/u_cpu_detect.h		patch \| blob \| history