X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=blobdiff_plain;f=src%2Fgallium%2Fauxiliary%2Futil%2Fu_cpu_detect.c;h=4dbb4d8fb58671bb14dd07a99c9f4279a4bd1a07;hp=d7f0be40e3dcddd26c285c81305189c110c2b027;hb=37db383abbec071e2b3d26d0a531ec8296705b63;hpb=7eb504019731368fd55f01e0264b195d4f99ae93 diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c index d7f0be40e3d..4dbb4d8fb58 100644 --- a/src/gallium/auxiliary/util/u_cpu_detect.c +++ b/src/gallium/auxiliary/util/u_cpu_detect.c @@ -34,8 +34,9 @@ #include "pipe/p_config.h" -#include "u_debug.h" +#include "util/u_debug.h" #include "u_cpu_detect.h" +#include "c11/threads.h" #if defined(PIPE_ARCH_PPC) #if defined(PIPE_OS_APPLE) @@ -52,22 +53,28 @@ #include #endif -#if defined(PIPE_OS_FREEBSD) +#if defined(PIPE_OS_FREEBSD) || defined(PIPE_OS_DRAGONFLY) #include #include #endif #if defined(PIPE_OS_LINUX) #include +#include +#include #endif #ifdef PIPE_OS_UNIX #include #endif +#if defined(HAS_ANDROID_CPUFEATURES) +#include +#endif + #if defined(PIPE_OS_WINDOWS) #include -#if defined(MSVC) +#if defined(PIPE_CC_MSVC) #include #endif #endif @@ -126,16 +133,43 @@ check_os_altivec_support(void) if (setjmp(__lv_powerpc_jmpbuf)) { signal(SIGILL, SIG_DFL); } else { - __lv_powerpc_canjump = 1; + boolean enable_altivec = TRUE; /* Default: enable if available, and if not overridden */ + boolean enable_vsx = TRUE; +#ifdef DEBUG + /* Disabling Altivec code generation is not the same as disabling VSX code generation, + * which can be done simply by passing -mattr=-vsx to the LLVM compiler; cf. + * lp_build_create_jit_compiler_for_module(). + * If you want to disable Altivec code generation, the best place to do it is here. + */ + char *env_control = getenv("GALLIVM_ALTIVEC"); /* 1=enable (default); 0=disable */ + if (env_control && env_control[0] == '0') { + enable_altivec = FALSE; + } +#endif + /* VSX instructions can be explicitly enabled/disabled via GALLIVM_VSX=1 or 0 */ + char *env_vsx = getenv("GALLIVM_VSX"); + if (env_vsx && env_vsx[0] == '0') { + enable_vsx = FALSE; + } + if (enable_altivec) { + __lv_powerpc_canjump = 1; - __asm __volatile - ("mtspr 256, %0\n\t" - "vand %%v0, %%v0, %%v0" - : - : "r" (-1)); + __asm __volatile + ("mtspr 256, %0\n\t" + "vand %%v0, %%v0, %%v0" + : + : "r" (-1)); - signal(SIGILL, SIG_DFL); - util_cpu_caps.has_altivec = 1; + util_cpu_caps.has_altivec = 1; + + if (enable_vsx) { + __asm __volatile("xxland %vs0, %vs0, %vs0"); + util_cpu_caps.has_vsx = 1; + } + signal(SIGILL, SIG_DFL); + } else { + util_cpu_caps.has_altivec = 0; + } } #endif /* !PIPE_OS_APPLE */ } @@ -179,10 +213,10 @@ static int has_cpuid(void) * @sa cpuid.h included in gcc-4.3 onwards. * @sa http://msdn.microsoft.com/en-us/library/hskdteyh.aspx */ -static INLINE void +static inline void cpuid(uint32_t ax, uint32_t *p) { -#if (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86) +#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86) __asm __volatile ( "xchgl %%ebx, %1\n\t" "cpuid\n\t" @@ -193,7 +227,7 @@ cpuid(uint32_t ax, uint32_t *p) "=d" (p[3]) : "0" (ax) ); -#elif (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86_64) +#elif defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64) __asm __volatile ( "cpuid\n\t" : "=a" (p[0]), @@ -211,16 +245,153 @@ cpuid(uint32_t ax, uint32_t *p) p[3] = 0; #endif } + +/** + * @sa cpuid.h included in gcc-4.4 onwards. + * @sa http://msdn.microsoft.com/en-us/library/hskdteyh%28v=vs.90%29.aspx + */ +static inline void +cpuid_count(uint32_t ax, uint32_t cx, uint32_t *p) +{ +#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86) + __asm __volatile ( + "xchgl %%ebx, %1\n\t" + "cpuid\n\t" + "xchgl %%ebx, %1" + : "=a" (p[0]), + "=S" (p[1]), + "=c" (p[2]), + "=d" (p[3]) + : "0" (ax), "2" (cx) + ); +#elif defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64) + __asm __volatile ( + "cpuid\n\t" + : "=a" (p[0]), + "=b" (p[1]), + "=c" (p[2]), + "=d" (p[3]) + : "0" (ax), "2" (cx) + ); +#elif defined(PIPE_CC_MSVC) + __cpuidex(p, ax, cx); +#else + p[0] = 0; + p[1] = 0; + p[2] = 0; + p[3] = 0; +#endif +} + + +static inline uint64_t xgetbv(void) +{ +#if defined(PIPE_CC_GCC) + uint32_t eax, edx; + + __asm __volatile ( + ".byte 0x0f, 0x01, 0xd0" // xgetbv isn't supported on gcc < 4.4 + : "=a"(eax), + "=d"(edx) + : "c"(0) + ); + + return ((uint64_t)edx << 32) | eax; +#elif defined(PIPE_CC_MSVC) && defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK) + return _xgetbv(_XCR_XFEATURE_ENABLED_MASK); +#else + return 0; +#endif +} + + +#if defined(PIPE_ARCH_X86) +PIPE_ALIGN_STACK static inline boolean sse2_has_daz(void) +{ + struct { + uint32_t pad1[7]; + uint32_t mxcsr_mask; + uint32_t pad2[128-8]; + } PIPE_ALIGN_VAR(16) fxarea; + + fxarea.mxcsr_mask = 0; +#if defined(PIPE_CC_GCC) + __asm __volatile ("fxsave %0" : "+m" (fxarea)); +#elif defined(PIPE_CC_MSVC) || defined(PIPE_CC_ICL) + _fxsave(&fxarea); +#else + fxarea.mxcsr_mask = 0; +#endif + return !!(fxarea.mxcsr_mask & (1 << 6)); +} +#endif + #endif /* X86 or X86_64 */ -void -util_cpu_detect(void) +#if defined(PIPE_ARCH_ARM) +static void +check_os_arm_support(void) { - static boolean util_cpu_detect_initialized = FALSE; + /* + * On Android, the cpufeatures library is preferred way of checking + * CPU capabilities. However, it is not available for standalone Mesa + * builds, i.e. when Android build system (Android.mk-based) is not + * used. Because of this we cannot use PIPE_OS_ANDROID here, but rather + * have a separate macro that only gets enabled from respective Android.mk. + */ +#if defined(HAS_ANDROID_CPUFEATURES) + AndroidCpuFamily cpu_family = android_getCpuFamily(); + uint64_t cpu_features = android_getCpuFeatures(); + + if (cpu_family == ANDROID_CPU_FAMILY_ARM) { + if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) + util_cpu_caps.has_neon = 1; + } +#elif defined(PIPE_OS_LINUX) + Elf32_auxv_t aux; + int fd; + + fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC); + if (fd >= 0) { + while (read(fd, &aux, sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t)) { + if (aux.a_type == AT_HWCAP) { + uint32_t hwcap = aux.a_un.a_val; + + util_cpu_caps.has_neon = (hwcap >> 12) & 1; + break; + } + } + close (fd); + } +#endif /* PIPE_OS_LINUX */ +} +#endif /* PIPE_ARCH_ARM */ - if(util_cpu_detect_initialized) - return; +static void +get_cpu_topology(void) +{ + uint32_t regs[4]; + + /* Default. This is correct if L3 is not present or there is only one. */ + util_cpu_caps.cores_per_L3 = util_cpu_caps.nr_cpus; +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + /* AMD Zen */ + if (util_cpu_caps.x86_cpu_type == 0x17) { + /* Query the L3 cache topology information. */ + cpuid_count(0x8000001D, 3, regs); + unsigned cache_level = (regs[0] >> 5) & 0x7; + unsigned cores_per_cache = ((regs[0] >> 14) & 0xfff) + 1; + + if (cache_level == 3) + util_cpu_caps.cores_per_L3 = cores_per_cache; + } +#endif +} + +static void +util_cpu_detect_once(void) +{ memset(&util_cpu_caps, 0, sizeof util_cpu_caps); /* Count the number of CPUs in system */ @@ -232,7 +403,7 @@ util_cpu_detect(void) } #elif defined(PIPE_OS_UNIX) && defined(_SC_NPROCESSORS_ONLN) util_cpu_caps.nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); - if (util_cpu_caps.nr_cpus == -1) + if (util_cpu_caps.nr_cpus == ~0) util_cpu_caps.nr_cpus = 1; #elif defined(PIPE_OS_BSD) { @@ -250,6 +421,11 @@ util_cpu_detect(void) util_cpu_caps.nr_cpus = 1; #endif + /* Make the fallback cacheline size nonzero so that it can be + * safely passed to align(). + */ + util_cpu_caps.cacheline = sizeof(void *); + #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) if (has_cpuid()) { uint32_t regs[4]; @@ -266,11 +442,12 @@ util_cpu_detect(void) cpuid (0x00000001, regs2); util_cpu_caps.x86_cpu_type = (regs2[0] >> 8) & 0xf; + /* Add "extended family". */ if (util_cpu_caps.x86_cpu_type == 0xf) - util_cpu_caps.x86_cpu_type = 8 + ((regs2[0] >> 20) & 255); /* use extended family (P4, IA64) */ + util_cpu_caps.x86_cpu_type += ((regs2[0] >> 20) & 0xff); /* general feature flags */ - util_cpu_caps.has_tsc = (regs2[3] >> 8) & 1; /* 0x0000010 */ + util_cpu_caps.has_tsc = (regs2[3] >> 4) & 1; /* 0x0000010 */ util_cpu_caps.has_mmx = (regs2[3] >> 23) & 1; /* 0x0800000 */ util_cpu_caps.has_sse = (regs2[3] >> 25) & 1; /* 0x2000000 */ util_cpu_caps.has_sse2 = (regs2[3] >> 26) & 1; /* 0x4000000 */ @@ -278,13 +455,46 @@ util_cpu_detect(void) util_cpu_caps.has_ssse3 = (regs2[2] >> 9) & 1; /* 0x0000020 */ util_cpu_caps.has_sse4_1 = (regs2[2] >> 19) & 1; util_cpu_caps.has_sse4_2 = (regs2[2] >> 20) & 1; - util_cpu_caps.has_avx = (regs2[2] >> 28) & 1; + util_cpu_caps.has_popcnt = (regs2[2] >> 23) & 1; + util_cpu_caps.has_avx = ((regs2[2] >> 28) & 1) && // AVX + ((regs2[2] >> 27) & 1) && // OSXSAVE + ((xgetbv() & 6) == 6); // XMM & YMM + util_cpu_caps.has_f16c = ((regs2[2] >> 29) & 1) && util_cpu_caps.has_avx; + util_cpu_caps.has_fma = ((regs2[2] >> 12) & 1) && util_cpu_caps.has_avx; util_cpu_caps.has_mmx2 = util_cpu_caps.has_sse; /* SSE cpus supports mmxext too */ +#if defined(PIPE_ARCH_X86_64) + util_cpu_caps.has_daz = 1; +#else + util_cpu_caps.has_daz = util_cpu_caps.has_sse3 || + (util_cpu_caps.has_sse2 && sse2_has_daz()); +#endif cacheline = ((regs2[1] >> 8) & 0xFF) * 8; if (cacheline > 0) util_cpu_caps.cacheline = cacheline; } + if (util_cpu_caps.has_avx && regs[0] >= 0x00000007) { + uint32_t regs7[4]; + cpuid_count(0x00000007, 0x00000000, regs7); + util_cpu_caps.has_avx2 = (regs7[1] >> 5) & 1; + } + + // check for avx512 + if (((regs2[2] >> 27) & 1) && // OSXSAVE + (xgetbv() & (0x7 << 5)) && // OPMASK: upper-256 enabled by OS + ((xgetbv() & 6) == 6)) { // XMM/YMM enabled by OS + uint32_t regs3[4]; + cpuid_count(0x00000007, 0x00000000, regs3); + util_cpu_caps.has_avx512f = (regs3[1] >> 16) & 1; + util_cpu_caps.has_avx512dq = (regs3[1] >> 17) & 1; + util_cpu_caps.has_avx512ifma = (regs3[1] >> 21) & 1; + util_cpu_caps.has_avx512pf = (regs3[1] >> 26) & 1; + util_cpu_caps.has_avx512er = (regs3[1] >> 27) & 1; + util_cpu_caps.has_avx512cd = (regs3[1] >> 28) & 1; + util_cpu_caps.has_avx512bw = (regs3[1] >> 30) & 1; + util_cpu_caps.has_avx512vl = (regs3[1] >> 31) & 1; + util_cpu_caps.has_avx512vbmi = (regs3[2] >> 1) & 1; + } if (regs[1] == 0x756e6547 && regs[2] == 0x6c65746e && regs[3] == 0x49656e69) { /* GenuineIntel */ @@ -301,11 +511,18 @@ util_cpu_detect(void) util_cpu_caps.has_mmx2 |= (regs2[3] >> 22) & 1; util_cpu_caps.has_3dnow = (regs2[3] >> 31) & 1; util_cpu_caps.has_3dnow_ext = (regs2[3] >> 30) & 1; + + util_cpu_caps.has_xop = util_cpu_caps.has_avx && + ((regs2[2] >> 11) & 1); } if (regs[0] >= 0x80000006) { + /* should we really do this if the clflush size above worked? */ + unsigned int cacheline; cpuid(0x80000006, regs2); - util_cpu_caps.cacheline = regs2[2] & 0xFF; + cacheline = regs2[2] & 0xFF; + if (cacheline > 0) + util_cpu_caps.cacheline = cacheline; } if (!util_cpu_caps.has_sse) { @@ -317,10 +534,16 @@ util_cpu_detect(void) } #endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */ +#if defined(PIPE_ARCH_ARM) + check_os_arm_support(); +#endif + #if defined(PIPE_ARCH_PPC) check_os_altivec_support(); #endif /* PIPE_ARCH_PPC */ + get_cpu_topology(); + #ifdef DEBUG if (debug_get_option_dump_cpu()) { debug_printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus); @@ -338,11 +561,33 @@ util_cpu_detect(void) debug_printf("util_cpu_caps.has_sse4_1 = %u\n", util_cpu_caps.has_sse4_1); debug_printf("util_cpu_caps.has_sse4_2 = %u\n", util_cpu_caps.has_sse4_2); debug_printf("util_cpu_caps.has_avx = %u\n", util_cpu_caps.has_avx); + debug_printf("util_cpu_caps.has_avx2 = %u\n", util_cpu_caps.has_avx2); + debug_printf("util_cpu_caps.has_f16c = %u\n", util_cpu_caps.has_f16c); + debug_printf("util_cpu_caps.has_popcnt = %u\n", util_cpu_caps.has_popcnt); debug_printf("util_cpu_caps.has_3dnow = %u\n", util_cpu_caps.has_3dnow); debug_printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext); + debug_printf("util_cpu_caps.has_xop = %u\n", util_cpu_caps.has_xop); debug_printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec); + debug_printf("util_cpu_caps.has_vsx = %u\n", util_cpu_caps.has_vsx); + debug_printf("util_cpu_caps.has_neon = %u\n", util_cpu_caps.has_neon); + debug_printf("util_cpu_caps.has_daz = %u\n", util_cpu_caps.has_daz); + debug_printf("util_cpu_caps.has_avx512f = %u\n", util_cpu_caps.has_avx512f); + debug_printf("util_cpu_caps.has_avx512dq = %u\n", util_cpu_caps.has_avx512dq); + debug_printf("util_cpu_caps.has_avx512ifma = %u\n", util_cpu_caps.has_avx512ifma); + debug_printf("util_cpu_caps.has_avx512pf = %u\n", util_cpu_caps.has_avx512pf); + debug_printf("util_cpu_caps.has_avx512er = %u\n", util_cpu_caps.has_avx512er); + debug_printf("util_cpu_caps.has_avx512cd = %u\n", util_cpu_caps.has_avx512cd); + debug_printf("util_cpu_caps.has_avx512bw = %u\n", util_cpu_caps.has_avx512bw); + debug_printf("util_cpu_caps.has_avx512vl = %u\n", util_cpu_caps.has_avx512vl); + debug_printf("util_cpu_caps.has_avx512vbmi = %u\n", util_cpu_caps.has_avx512vbmi); } #endif +} + +static once_flag cpu_once_flag = ONCE_FLAG_INIT; - util_cpu_detect_initialized = TRUE; +void +util_cpu_detect(void) +{ + call_once(&cpu_once_flag, util_cpu_detect_once); }