util: add avx2 and xop detection to cpu detection code
author: Roland Scheidegger <sroland@vmware.com>
Tue, 20 Aug 2013 02:20:33 +0000 (04:20 +0200)
committer: Roland Scheidegger <sroland@vmware.com>
Tue, 20 Aug 2013 21:00:24 +0000 (23:00 +0200)
Going to need this soon (not going to bother with avx2 intrinsics at this time,
but don't want to do workarounds for true vector shifts if llvm itself can use
them just fine and won't need the gazillion-instruction emulation).
Not really tested, other than that my cpu returns 0 for these features...
(I have no idea if llvm would actually emit avx2/xop instructions either...)

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
src/gallium/auxiliary/gallivm/lp_bld_init.c
src/gallium/auxiliary/util/u_cpu_detect.c
src/gallium/auxiliary/util/u_cpu_detect.h

index 61eadb838dc601a54174857580b5b9779467259b..61b561f9343706ff60c8d4945abffc0efd6212b9 100644 (file)
@@ -461,12 +461,15 @@ lp_build_init(void)
                                                  lp_native_vector_width);
 
    if (lp_native_vector_width <= 128) {
-      /* Hide AVX support, as often LLVM AVX instrinsics are only guarded by
+      /* Hide AVX support, as often LLVM AVX intrinsics are only guarded by
        * "util_cpu_caps.has_avx" predicate, and lack the
        * "lp_native_vector_width > 128" predicate. And also to ensure a more
        * consistent behavior, allowing one to test SSE2 on AVX machines.
+       * XXX: should not play games with util_cpu_caps directly as it might
+       * get used for other things outside llvm too.
        */
       util_cpu_caps.has_avx = 0;
+      util_cpu_caps.has_avx2 = 0;
    }
 
    if (!HAVE_AVX) {
@@ -476,13 +479,17 @@ lp_build_init(void)
        * omit it unnecessarily on amd cpus, see above).
        */
       util_cpu_caps.has_f16c = 0;
+      util_cpu_caps.has_xop = 0;
    }
 
 #ifdef PIPE_ARCH_PPC_64
    /* Set the NJ bit in VSCR to 0 so denormalized values are handled as
-    * specified by IEEE standard (PowerISA 2.06 - Section 6.3). This garantees
+    * specified by IEEE standard (PowerISA 2.06 - Section 6.3). This guarantees
     * that some rounding and half-float to float handling does not round
     * incorrectly to 0.
+    * XXX: should eventually follow same logic on all platforms.
+    * Right now denorms get explicitly disabled (but elsewhere) for x86,
+    * whereas ppc64 explicitly enables them...
     */
    if (util_cpu_caps.has_altivec) {
       unsigned short mask[] = { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
index 87ad780951e227bf9afcf42634fef3514b8c8226..2ff40bb00505fbff9b42383965fb287d3d26048c 100644 (file)
@@ -212,6 +212,44 @@ cpuid(uint32_t ax, uint32_t *p)
 #endif
 }
 
+/**
+ * Execute cpuid with eax = ax and ecx = cx; p[0..3] get eax, ebx, ecx, edx.
+ * @sa cpuid.h included in gcc-4.4 onwards; http://msdn.microsoft.com/en-us/library/hskdteyh%28v=vs.90%29.aspx
+ */
+static INLINE void
+cpuid_count(uint32_t ax, uint32_t cx, uint32_t *p)
+{
+#if (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86)
+   __asm __volatile (
+     "xchgl %%ebx, %1\n\t"  /* stash the incoming ebx in operand 1 before cpuid overwrites it */
+     "cpuid\n\t"
+     "xchgl %%ebx, %1"  /* swap back: ebx is restored, cpuid's ebx ends up in operand 1 (p[1]) */
+     : "=a" (p[0]),
+       "=S" (p[1]),  /* not "=b": presumably ebx may be reserved on 32-bit x86 (e.g. PIC) - TODO confirm */
+       "=c" (p[2]),
+       "=d" (p[3])
+     : "0" (ax), "2" (cx)  /* inputs share registers with outputs 0 and 2: ax in eax, cx in ecx */
+   );
+#elif (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86_64)
+   __asm __volatile (
+     "cpuid\n\t"
+     : "=a" (p[0]),
+       "=b" (p[1]),  /* ebx is used as a plain output here, no xchg dance */
+       "=c" (p[2]),
+       "=d" (p[3])
+     : "0" (ax), "2" (cx)  /* same input tying as the 32-bit variant */
+   );
+#elif defined(PIPE_CC_MSVC)
+   __cpuidex(p, ax, cx);  /* compiler intrinsic taking the leaf (ax) and sub-leaf (cx) */
+#else  /* no cpuid mechanism selected above: report every register as zero */
+   p[0] = 0;
+   p[1] = 0;
+   p[2] = 0;
+   p[3] = 0;
+#endif
+}
+
+
 static INLINE uint64_t xgetbv(void)
 {
 #if defined(PIPE_CC_GCC)
@@ -341,6 +379,11 @@ util_cpu_detect(void)
          if (cacheline > 0)
             util_cpu_caps.cacheline = cacheline;
       }
+      if (util_cpu_caps.has_avx && regs[0] >= 0x00000007) {
+         uint32_t regs7[4];
+         cpuid_count(0x00000007, 0x00000000, regs7);
+         util_cpu_caps.has_avx2 = (regs7[1] >> 5) & 1;
+      }
 
       if (regs[1] == 0x756e6547 && regs[2] == 0x6c65746e && regs[3] == 0x49656e69) {
          /* GenuineIntel */
@@ -357,6 +400,9 @@ util_cpu_detect(void)
          util_cpu_caps.has_mmx2 |= (regs2[3] >> 22) & 1;
          util_cpu_caps.has_3dnow = (regs2[3] >> 31) & 1;
          util_cpu_caps.has_3dnow_ext = (regs2[3] >> 30) & 1;
+
+         util_cpu_caps.has_xop = util_cpu_caps.has_avx &&
+                                 ((regs2[2] >> 11) & 1);
       }
 
       if (regs[0] >= 0x80000006) {
@@ -394,10 +440,12 @@ util_cpu_detect(void)
       debug_printf("util_cpu_caps.has_sse4_1 = %u\n", util_cpu_caps.has_sse4_1);
       debug_printf("util_cpu_caps.has_sse4_2 = %u\n", util_cpu_caps.has_sse4_2);
       debug_printf("util_cpu_caps.has_avx = %u\n", util_cpu_caps.has_avx);
+      debug_printf("util_cpu_caps.has_avx2 = %u\n", util_cpu_caps.has_avx2);
       debug_printf("util_cpu_caps.has_f16c = %u\n", util_cpu_caps.has_f16c);
       debug_printf("util_cpu_caps.has_popcnt = %u\n", util_cpu_caps.has_popcnt);
       debug_printf("util_cpu_caps.has_3dnow = %u\n", util_cpu_caps.has_3dnow);
       debug_printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext);
+      debug_printf("util_cpu_caps.has_xop = %u\n", util_cpu_caps.has_xop);
       debug_printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec);
       debug_printf("util_cpu_caps.has_daz = %u\n", util_cpu_caps.has_daz);
    }
index cc3e0ce03443f80d53f4df6f8ae73ee52a631e7e..5ccfc931697f25e089474f58a27adc0af8f6d892 100644 (file)
@@ -64,9 +64,11 @@ struct util_cpu_caps {
    unsigned has_sse4_2:1;
    unsigned has_popcnt:1;
    unsigned has_avx:1;
+   unsigned has_avx2:1;
    unsigned has_f16c:1;
    unsigned has_3dnow:1;
    unsigned has_3dnow_ext:1;
+   unsigned has_xop:1;
    unsigned has_altivec:1;
    unsigned has_daz:1;
 };