util_cpu_detect();
+ /* On AMD Bulldozer, AVX throughput is the same as SSE2, and because
+ * 8-wide vectors need more floating-point ops than 4-wide ones (due to
+ * padding), it is actually more efficient to use 4-wide vectors there.
+ *
+ * See also:
+ * - http://www.anandtech.com/show/4955/the-bulldozer-review-amd-fx8150-tested/2
+ */
if (HAVE_AVX &&
- util_cpu_caps.has_avx) {
+ util_cpu_caps.has_avx &&
+ util_cpu_caps.has_intel) {
lp_native_vector_width = 256;
} else {
/* Leave it at 128, even when no SIMD extensions are available.
util_cpu_caps.cacheline = cacheline;
}
+ if (regs[1] == 0x756e6547 && regs[2] == 0x6c65746e && regs[3] == 0x49656e69) {
+ /* GenuineIntel */
+ util_cpu_caps.has_intel = 1;
+ }
+
cpuid(0x80000000, regs);
if (regs[0] >= 0x80000001) {