+#if USE_8x2_TILE_BACKEND
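+// Convert 16 pixels of float32 SOA data (one simd16 register per component) to
+// packed 8-bit AOS pixels, written out as four 16-byte (4 pixel) rows.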
+template<SWR_FORMAT DstFormat>
+INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
+{
+ // swizzle rgba into destination component order (e.g. bgra) while we load
+ simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
+ simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
+ simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
+ simd16scalar comp3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * sizeof(simd16scalar))); // float32 aaaaaaaaaaaaaaaa
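+ // (pSrc is SOA: four contiguous simd16-sized component planes, selected above by the destination swizzle)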
+
+ // clamp
+ const simd16scalar zero = _simd16_setzero_ps();
+ const simd16scalar ones = _simd16_set1_ps(1.0f);
+
+ comp0 = _simd16_max_ps(comp0, zero);
+ comp0 = _simd16_min_ps(comp0, ones);
+
+ comp1 = _simd16_max_ps(comp1, zero);
+ comp1 = _simd16_min_ps(comp1, ones);
+
+ comp2 = _simd16_max_ps(comp2, zero);
+ comp2 = _simd16_min_ps(comp2, ones);
+
+ comp3 = _simd16_max_ps(comp3, zero);
+ comp3 = _simd16_min_ps(comp3, ones);
+
+ if (FormatTraits<DstFormat>::isSRGB)
+ {
+ // Gamma-correct only the color channels (components 0..2); alpha stays linear
+ comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
+ comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
+ comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
+ }
+
+ // scale components from 0.0f .. 1.0f to the destination range; fromFloat(i) is the scale factor for component i (e.g. 255.0f for an 8-bit channel)
+ comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
+ comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
+ comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
+ comp3 = _simd16_mul_ps(comp3, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
+
+ // convert the floats to 16 wide integer vector types
+ simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
+ simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
+ simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
+ simd16scalari src3 = _simd16_cvtps_epi32(comp3); // padded byte aaaaaaaaaaaaaaaa
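+ // (cvtps_epi32 rounds to nearest by default; after scaling, each value fits in the low byte of its lane)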
+
+ // SOA to AOS conversion
+ src1 = _simd16_slli_epi32(src1, 8);
+ src2 = _simd16_slli_epi32(src2, 16);
+ src3 = _simd16_slli_epi32(src3, 24);
+
+ simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3)); // pixels 0 1 2 3 4 5 6 7 8 9 A B C D E F
+
+ // de-swizzle: reorder the 16 packed pixels from simd16 lane order into tile row order
+ simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
+ simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F
+
+ final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F
+
+ // equivalent single-permute form, kept for reference:
+ // final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
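+
+ // each group of four dwords in 'final' is now one 4-pixel destination row:
+ // { 0 1 4 5 } { 2 3 6 7 } { 8 9 C D } { A B E F }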
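+
+ // split each 256-bit half of 'final' into two 128-bit stores, one per destination row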
+ _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
+ _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
+}
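+
+// Usage sketch (hypothetical names; the real tile-store path computes the four
+// row pointers from the destination surface pitch):
+//
+//     FlatConvert<B8G8R8A8_UNORM>(pSrcSoA, pRow0, pRow1, pRow2, pRow3);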
+
+#endif