gallium/util: Fix detection of AVX cpu caps
authorAndre Heider <a.heider@gmail.com>
Tue, 23 Jul 2013 17:08:45 +0000 (19:08 +0200)
committerJosé Fonseca <jfonseca@vmware.com>
Tue, 23 Jul 2013 22:12:58 +0000 (23:12 +0100)
For AVX it's not sufficient to only rely on the cpuid flags. If the CPU
supports these extensions, but the OS doesn't, issuing these insns will
trigger an undefined opcode exception.

In addition to the AVX cpuid bit we also need to:
* test cpuid for OSXSAVE support
* XGETBV to check if the OS saves/restores AVX regs on context switches

See "Detecting Availability and Support" at
http://software.intel.com/en-us/articles/introduction-to-intel-advanced-vector-extensions

Signed-off-by: Andre Heider <a.heider@gmail.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
Reviewed-by: José Fonseca <jfonseca@vmware.com>
src/gallium/auxiliary/util/u_cpu_detect.c

index b118fc8ae360dbb446e80b3fcd0596386baedd2e..588fc7c7292125e1f347e51f2ea1db3d26f45f5e 100644 (file)
@@ -67,7 +67,7 @@
 
 #if defined(PIPE_OS_WINDOWS)
 #include <windows.h>
-#if defined(MSVC)
+#if defined(PIPE_CC_MSVC)
 #include <intrin.h>
 #endif
 #endif
@@ -210,6 +210,27 @@ cpuid(uint32_t ax, uint32_t *p)
    p[2] = 0;
    p[3] = 0;
 #endif
+}
+
+static INLINE uint64_t xgetbv(void)
+{
+#if defined(PIPE_CC_GCC)
+   uint32_t eax, edx;
+
+   __asm __volatile (
+     ".byte 0x0f, 0x01, 0xd0" // xgetbv isn't supported on gcc < 4.4
+     : "=a"(eax),
+       "=d"(edx)
+     : "c"(0)
+   );
+
+   return ((uint64_t)edx << 32) | eax;
+#elif defined(PIPE_CC_MSVC) && defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK)
+   return _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
+#else
+   return 0;
+#endif
+
 }
 #endif /* X86 or X86_64 */
 
@@ -284,7 +305,9 @@ util_cpu_detect(void)
          util_cpu_caps.has_sse4_1 = (regs2[2] >> 19) & 1;
          util_cpu_caps.has_sse4_2 = (regs2[2] >> 20) & 1;
          util_cpu_caps.has_popcnt = (regs2[2] >> 23) & 1;
-         util_cpu_caps.has_avx    = (regs2[2] >> 28) & 1;
+         util_cpu_caps.has_avx    = ((regs2[2] >> 28) & 1) && // AVX
+                                    ((regs2[2] >> 27) & 1) && // OSXSAVE
+                                    ((xgetbv() & 6) == 6);    // XMM & YMM
          util_cpu_caps.has_f16c   = (regs2[2] >> 29) & 1;
          util_cpu_caps.has_mmx2   = util_cpu_caps.has_sse; /* SSE cpus supports mmxext too */