INLINE
UINT pdep_u32(UINT a, UINT mask)
{
-#if KNOB_ARCH==KNOB_ARCH_AVX2
+#if KNOB_ARCH >= KNOB_ARCH_AVX2
return _pdep_u32(a, mask);
#else
UINT result = 0;
INLINE
UINT pext_u32(UINT a, UINT mask)
{
-#if KNOB_ARCH==KNOB_ARCH_AVX2
+#if KNOB_ARCH >= KNOB_ARCH_AVX2
return _pext_u32(a, mask);
#else
UINT result = 0;
__m256i result = _mm256_castsi128_si256(resLo);
result = _mm256_insertf128_si256(result, resHi, 1);
return _mm256_castsi256_ps(result);
-#elif KNOB_ARCH==KNOB_ARCH_AVX2
+#elif KNOB_ARCH>=KNOB_ARCH_AVX2
return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
#endif
#else
__m256i result = _mm256_castsi128_si256(resLo);
result = _mm256_insertf128_si256(result, resHi, 1);
return _mm256_castsi256_ps(result);
-#elif KNOB_ARCH==KNOB_ARCH_AVX2
+#elif KNOB_ARCH>=KNOB_ARCH_AVX2
return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
#endif
#else
__m256i result = _mm256_castsi128_si256(resLo);
result = _mm256_insertf128_si256(result, resHi, 1);
return _mm256_castsi256_ps(result);
-#elif KNOB_ARCH==KNOB_ARCH_AVX2
+#elif KNOB_ARCH>=KNOB_ARCH_AVX2
return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
#endif
#else
__m256i result = _mm256_castsi128_si256(resLo);
result = _mm256_insertf128_si256(result, resHi, 1);
return _mm256_castsi256_ps(result);
-#elif KNOB_ARCH==KNOB_ARCH_AVX2
+#elif KNOB_ARCH>=KNOB_ARCH_AVX2
return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
#endif
#else
#define KNOB_SIMD_WIDTH 8
#define KNOB_SIMD_BYTES 32
#elif (KNOB_ARCH == KNOB_ARCH_AVX512)
-#define KNOB_ARCH_ISA AVX512F
-#define KNOB_ARCH_STR "AVX512"
-#define KNOB_SIMD_WIDTH 16
-#define KNOB_SIMD_BYTES 64
-#error "AVX512 not yet supported"
+#define KNOB_ARCH_ISA AVX2
+#define KNOB_ARCH_STR "AVX2"
+#define KNOB_SIMD_WIDTH 8
+#define KNOB_SIMD_BYTES 32
+// AVX512 is not supported yet; fall back to the AVX2 configuration for now.
+//#define KNOB_ARCH_ISA AVX512F
+//#define KNOB_ARCH_STR "AVX512"
+//#define KNOB_SIMD_WIDTH 16
+//#define KNOB_SIMD_BYTES 64
+//#error "AVX512 not yet supported"
#else
#error "Unknown architecture"
#endif
// Convert from 32-bit float to 16-bit float using _mm_cvtps_ph
// @todo 16bit float instruction support is orthogonal to avx support. need to
// add check for F16C support instead.
-#if KNOB_ARCH == KNOB_ARCH_AVX2
+#if KNOB_ARCH >= KNOB_ARCH_AVX2
__m128 src128 = _mm_set1_ps(src);
__m128i srci128 = _mm_cvtps_ph(src128, _MM_FROUND_TRUNC);
UINT value = _mm_extract_epi16(srci128, 0);
float dst;
if (FormatTraits<SrcFormat>::GetBPC(comp) == 16)
{
-#if KNOB_ARCH == KNOB_ARCH_AVX2
+#if KNOB_ARCH >= KNOB_ARCH_AVX2
// Convert from 16-bit float to 32-bit float using _mm_cvtph_ps
// @todo 16bit float instruction support is orthogonal to avx support. need to
// add check for F16C support instead.
__m256i final = _mm256_castsi128_si256(vRow00);
final = _mm256_insertf128_si256(final, vRow10, 1);
-#elif KNOB_ARCH == KNOB_ARCH_AVX2
+#elif KNOB_ARCH >= KNOB_ARCH_AVX2
// logic is as above, only wider
src1 = _mm256_slli_si256(src1, 1);
__m256i final = _mm256_castsi128_si256(vRow00);
final = _mm256_insertf128_si256(final, vRow10, 1);
-#elif KNOB_ARCH == KNOB_ARCH_AVX2
+#elif KNOB_ARCH >= KNOB_ARCH_AVX2
// logic is as above, only wider
src1 = _mm256_slli_si256(src1, 1);