static simdscalar unpack(simdscalar &in)
{
#if KNOB_SIMD_WIDTH == 8
-#if KNOB_ARCH==KNOB_ARCH_AVX
+#if KNOB_ARCH <= KNOB_ARCH_AVX
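// pre-AVX2: there is no 8-wide byte->dword conversion, so widen the low 4 bytes
// with _mm_cvtepu8_epi32 and shuffle the high 4 bytes into place separately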
__m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
__m128i resLo = _mm_cvtepu8_epi32(src);
__m128i resHi = _mm_shuffle_epi8(src,
    _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004)); // zero-extend bytes 4..7 into dwords
__m256i result = _mm256_castsi128_si256(resLo);
result = _mm256_insertf128_si256(result, resHi, 1);
return _mm256_castsi256_ps(result);
-#elif KNOB_ARCH>=KNOB_ARCH_AVX2
+#else
return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
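// AVX2's VPMOVZXBD (_mm256_cvtepu8_epi32) zero-extends all eight bytes to dwords in one instruction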
#endif
#else
static simdscalar unpack(simdscalar &in)
{
#if KNOB_SIMD_WIDTH == 8
-#if KNOB_ARCH==KNOB_ARCH_AVX
+#if KNOB_ARCH <= KNOB_ARCH_AVX
SWR_INVALID("I think this may be incorrect.");
__m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
__m128i resLo = _mm_cvtepi8_epi32(src);
__m128i resHi = _mm_shuffle_epi8(src,
    _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004)); // note: shuffle zero-extends, which is wrong for negative values (hence the SWR_INVALID above)
__m256i result = _mm256_castsi128_si256(resLo);
result = _mm256_insertf128_si256(result, resHi, 1);
return _mm256_castsi256_ps(result);
-#elif KNOB_ARCH>=KNOB_ARCH_AVX2
+#else
return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
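// the AVX2 path is fine: VPMOVSXBD (_mm256_cvtepi8_epi32) sign-extends correctly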
#endif
#else
static simdscalar unpack(simdscalar &in)
{
#if KNOB_SIMD_WIDTH == 8
-#if KNOB_ARCH==KNOB_ARCH_AVX
+#if KNOB_ARCH <= KNOB_ARCH_AVX
__m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
__m128i resLo = _mm_cvtepu16_epi32(src);
__m128i resHi = _mm_shuffle_epi8(src,
    _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908)); // zero-extend words 4..7 into dwords
__m256i result = _mm256_castsi128_si256(resLo);
result = _mm256_insertf128_si256(result, resHi, 1);
return _mm256_castsi256_ps(result);
-#elif KNOB_ARCH>=KNOB_ARCH_AVX2
+#else
return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
#endif
#else
static simdscalar unpack(simdscalar &in)
{
#if KNOB_SIMD_WIDTH == 8
-#if KNOB_ARCH==KNOB_ARCH_AVX
+#if KNOB_ARCH <= KNOB_ARCH_AVX
SWR_INVALID("I think this may be incorrect.");
__m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
__m128i resLo = _mm_cvtepi16_epi32(src);
__m128i resHi = _mm_shuffle_epi8(src,
    _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908)); // note: shuffle zero-extends, which is wrong for negative values (hence the SWR_INVALID above)
__m256i result = _mm256_castsi128_si256(resLo);
result = _mm256_insertf128_si256(result, resHi, 1);
return _mm256_castsi256_ps(result);
-#elif KNOB_ARCH>=KNOB_ARCH_AVX2
+#else
return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
#endif
#else
static inline simdscalar convertSrgb(simdscalar &in)
{
#if KNOB_SIMD_WIDTH == 8
-#if (KNOB_ARCH == KNOB_ARCH_AVX || KNOB_ARCH == KNOB_ARCH_AVX2)
__m128 srcLo = _mm256_extractf128_ps(in, 0);
__m128 srcHi = _mm256_extractf128_ps(in, 1);
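// srcLo/srcHi are converted to sRGB per 128-bit half before being reinserted below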
in = _mm256_insertf128_ps(in, srcLo, 0);
in = _mm256_insertf128_ps(in, srcHi, 1);
-#endif
#else
#error Unsupported vector width
#endif
simdscalari src = _simd_load_si((const simdscalari*)pSrc);
#if KNOB_SIMD_WIDTH == 8
-#if KNOB_ARCH == KNOB_ARCH_AVX
+#if KNOB_ARCH <= KNOB_ARCH_AVX
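// pre-AVX2: transpose the SOA channel vectors into AOS pixels with 128-bit unpacks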
__m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg
__m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa
__m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb
__m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa
__m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg
__m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3); // babababababababa
__m128i c0123lo = _mm_unpacklo_epi16(c01, c23); // rgbargbargbargba
__m128i c0123hi = _mm_unpackhi_epi16(c01, c23); // rgbargbargbargba
_mm_store_si128((__m128i*)pDst, c0123lo);
_mm_store_si128((__m128i*)(pDst + 16), c0123hi);
-#elif KNOB_ARCH == KNOB_ARCH_AVX2
+#else
simdscalari dst01 = _mm256_shuffle_epi8(src,
_mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
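// 0x80 in a shuffle control byte zeroes that output byte: this places r/g of pixels 0-3
// in the low lane and b/a of pixels 4-7 in the high lane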
simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01); // swap 128-bit lanes so b/a sit low and r/g sit high
dst23 = _mm256_shuffle_epi8(dst23,
    _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
simdscalari dst = _mm256_or_si256(dst01, dst23);
_simd_store_si((simdscalari*)pDst, dst);
__m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
__m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
__m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
__m256i src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
-#if KNOB_ARCH == KNOB_ARCH_AVX
+#if KNOB_ARCH <= KNOB_ARCH_AVX
// splitting into two sets of 4 wide integer vector types
// because AVX doesn't have instructions to support this operation at 8 wide
__m256i final = _mm256_castsi128_si256(vRow00);
final = _mm256_insertf128_si256(final, vRow10, 1);
-#elif KNOB_ARCH >= KNOB_ARCH_AVX2
+#else
// logic is as above, only wider
src1 = _mm256_slli_si256(src1, 1);
src2 = _mm256_slli_si256(src2, 2);
src3 = _mm256_slli_si256(src3, 3);
src0 = _mm256_or_si256(src0, src1);
src2 = _mm256_or_si256(src2, src3);
__m256i final = _mm256_or_si256(src0, src2);
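// each srcN holds one channel in byte 0 of every dword; shifting g/b/a left by 1/2/3 bytes
// and ORing packs one rgba dword per pixel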
-#if 0
-
- __m256i perm = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
-
- final = _mm256_permutevar8x32_epi32(final, perm);
-#else
// adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
final = _mm256_permute4x64_epi64(final, 0xD8);
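// 0xD8 = 0b11011000 selects 64-bit lanes in the order 0,2,1,3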
-#endif
#endif
_simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
__m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
__m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
-#if KNOB_ARCH == KNOB_ARCH_AVX
+#if KNOB_ARCH <= KNOB_ARCH_AVX
// splitting into two sets of 4 wide integer vector types
// because AVX doesn't have instructions to support this operation at 8 wide
__m256i final = _mm256_castsi128_si256(vRow00);
final = _mm256_insertf128_si256(final, vRow10, 1);
-#elif KNOB_ARCH >= KNOB_ARCH_AVX2
+#else
// logic is as above, only wider
src1 = _mm256_slli_si256(src1, 1);