typedef __m256 simdscalar;
typedef __m256i simdscalari;
typedef uint8_t simdmask;
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
-struct simdscalar
-{
- __m256 lo;
- __m256 hi;
-};
-struct simdscalard
-{
- __m256d lo;
- __m256d hi;
-};
-struct simdscalari
-{
- __m256i lo;
- __m256i hi;
-};
-typedef uint16_t simdmask;
-#else
-typedef __m512 simdscalar;
-typedef __m512d simdscalard;
-typedef __m512i simdscalari;
-typedef __mask16 simdmask;
-#endif
#else
#error Unsupported vector width
#endif
SWR_ASSERT(false, "Need to implement 8 wide version");
}
-#elif KNOB_SIMD_WIDTH == 16
-
-#if ENABLE_AVX512_EMULATION
-
-#define SIMD_EMU_AVX512_0(type, func, intrin) \
-INLINE type func()\
-{\
- type result;\
-\
- result.lo = intrin();\
- result.hi = intrin();\
-\
- return result;\
-}
-
-#define SIMD_EMU_AVX512_1(type, func, intrin) \
-INLINE type func(type a)\
-{\
- type result;\
-\
- result.lo = intrin(a.lo);\
- result.hi = intrin(a.hi);\
-\
- return result;\
-}
-
-#define SIMD_EMU_AVX512_2(type, func, intrin) \
-INLINE type func(type a, type b)\
-{\
- type result;\
-\
- result.lo = intrin(a.lo, b.lo);\
- result.hi = intrin(a.hi, b.hi);\
-\
- return result;\
-}
-
-#define SIMD_EMU_AVX512_3(type, func, intrin) \
-INLINE type func(type a, type b, type c)\
-{\
- type result;\
-\
- result.lo = intrin(a.lo, b.lo, c.lo);\
- result.hi = intrin(a.hi, b.hi, c.hi);\
-\
- return result;\
-}
-
-SIMD_EMU_AVX512_0(simdscalar, _simd_setzero_ps, _mm256_setzero_ps)
-SIMD_EMU_AVX512_0(simdscalari, _simd_setzero_si, _mm256_setzero_si256)
-
-INLINE simdscalar _simd_set1_ps(float a)
-{
- simdscalar result;
-
- result.lo = _mm256_set1_ps(a);
- result.hi = _mm256_set1_ps(a);
-
- return result;
-}
-
-INLINE simdscalari _simd_set1_epi8(char a)
-{
- simdscalari result;
-
- result.lo = _mm256_set1_epi8(a);
- result.hi = _mm256_set1_epi8(a);
-
- return result;
-}
-
-INLINE simdscalari _simd_set1_epi32(int a)
-{
- simdscalari result;
-
- result.lo = _mm256_set1_epi32(a);
- result.hi = _mm256_set1_epi32(a);
-
- return result;
-}
-
-INLINE simdscalari _simd_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
-{
- simdscalari result;
-
- result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
- result.hi = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
-
- return result;
-}
-
-INLINE simdscalari _simd_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
-{
- simdscalari result;
-
- result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
- result.hi = _mm256_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8);
-
- return result;
-}
-
-INLINE simdscalar _simd_load_ps(float const *m)
-{
- float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(simdscalar::lo));
-
- simdscalar result;
-
- result.lo = _mm256_load_ps(m);
- result.hi = _mm256_load_ps(n);
-
- return result;
-}
-
-INLINE simdscalar _simd_loadu_ps(float const *m)
-{
- float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(simdscalar::lo));
-
- simdscalar result;
-
- result.lo = _mm256_loadu_ps(m);
- result.hi = _mm256_loadu_ps(n);
-
- return result;
-}
-
-INLINE simdscalar _simd_load1_ps(float const *m)
-{
- simdscalar result;
-
- result.lo = _mm256_broadcast_ss(m);
- result.hi = _mm256_broadcast_ss(m);
-
- return result;
-}
-
-INLINE simdscalari _simd_load_si(simdscalari const *m)
-{
- simdscalari result;
-
- result.lo = _mm256_load_si256(&m[0].lo);
- result.hi = _mm256_load_si256(&m[0].hi);
-
- return result;
-}
-
-INLINE simdscalari _simd_loadu_si(simdscalari const *m)
-{
- simdscalari result;
-
- result.lo = _mm256_loadu_si256(&m[0].lo);
- result.hi = _mm256_loadu_si256(&m[0].hi);
-
- return result;
-}
-
-INLINE simdscalar _simd_broadcast_ss(float const *m)
-{
- simdscalar result;
-
- result.lo = _mm256_broadcast_ss(m);
- result.hi = _mm256_broadcast_ss(m);
-
- return result;
-}
-
-INLINE simdscalar _simd_broadcast_ps(__m128 const *m)
-{
- simdscalar result;
-
- result.lo = _mm256_broadcast_ps(m);
- result.hi = _mm256_broadcast_ps(m);
-
- return result;
-}
-
-INLINE void _simd_store_ps(float *m, simdscalar a)
-{
- float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(simdscalar::lo));
-
- _mm256_store_ps(m, a.lo);
- _mm256_store_ps(n, a.hi);
-}
-
-INLINE void _simd_maskstore_ps(float *m, simdscalari mask, simdscalar a)
-{
- float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(simdscalar::lo));
-
- _mm256_maskstore_ps(m, mask.lo, a.lo);
- _mm256_maskstore_ps(n, mask.hi, a.hi);
-}
-
-INLINE void _simd_store_si(simdscalari *m, simdscalari a)
-{
- _mm256_store_si256(&m[0].lo, a.lo);
- _mm256_store_si256(&m[0].hi, a.hi);
-}
-
-INLINE simdscalar _simd_blend_ps(simdscalar a, simdscalar b, const simdmask mask)
-{
- simdscalar result;
-
- result.lo = _mm256_blend_ps(a.lo, b.lo, reinterpret_cast<const uint8_t *>(&mask)[0]);
- result.hi = _mm256_blend_ps(a.hi, b.hi, reinterpret_cast<const uint8_t *>(&mask)[1]);
-
- return result;
-}
-
-SIMD_EMU_AVX512_3(simdscalar, _simd_blendv_ps, _mm256_blendv_ps)
-
-INLINE simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, const simdscalar mask)
-{
- simdscalari result;
-
- result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), mask.lo));
- result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), mask.hi));
-
- return result;
-}
-
-INLINE simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, const simdscalari mask)
-{
- simdscalari result;
-
- result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), _mm256_castsi256_ps(mask.lo)));
- result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), _mm256_castsi256_ps(mask.hi)));
-
- return result;
-}
-
-SIMD_EMU_AVX512_2(simdscalar, _simd_mul_ps, _mm256_mul_ps)
-SIMD_EMU_AVX512_2(simdscalar, _simd_add_ps, _mm256_add_ps)
-SIMD_EMU_AVX512_2(simdscalar, _simd_sub_ps, _mm256_sub_ps)
-SIMD_EMU_AVX512_1(simdscalar, _simd_rsqrt_ps, _mm256_rsqrt_ps)
-SIMD_EMU_AVX512_2(simdscalar, _simd_min_ps, _mm256_min_ps)
-SIMD_EMU_AVX512_2(simdscalar, _simd_max_ps, _mm256_max_ps)
-
-INLINE simdmask _simd_movemask_ps(simdscalar a)
-{
- simdmask mask;
-
- reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_ps(a.lo);
- reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_ps(a.hi);
-
- return mask;
-}
-
-INLINE simdmask _simd_movemask_pd(simdscalard a)
-{
- simdmask mask;
-
- reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_pd(a.lo);
- reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_pd(a.hi);
-
- return mask;
-}
-
-INLINE simdmask _simd_movemask_epi8(simdscalari a)
-{
- simdmask mask;
-
- reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_epi8(a.lo);
- reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_epi8(a.hi);
-
- return mask;
-}
-
-INLINE simdscalari _simd_cvtps_epi32(simdscalar a)
-{
- simdscalari result;
-
- result.lo = _mm256_cvtps_epi32(a.lo);
- result.hi = _mm256_cvtps_epi32(a.hi);
-
- return result;
-}
-
-INLINE simdscalari _simd_cvttps_epi32(simdscalar a)
-{
- simdscalari result;
-
- result.lo = _mm256_cvttps_epi32(a.lo);
- result.hi = _mm256_cvttps_epi32(a.hi);
-
- return result;
-}
-
-INLINE simdscalar _simd_cvtepi32_ps(simdscalari a)
-{
- simdscalar result;
-
- result.lo = _mm256_cvtepi32_ps(a.lo);
- result.hi = _mm256_cvtepi32_ps(a.hi);
-
- return result;
-}
-
-INLINE simdscalar _simd_cmp_ps(simdscalar a, simdscalar b, const int comp)
-{
- simdscalar result;
-
- result.lo = _mm256_cmp_ps(a.lo, b.lo, comp);
- result.hi = _mm256_cmp_ps(a.hi, b.hi, comp);
-
- return result;
-}
-
-#define _simd_cmplt_ps(a, b) _simd_cmp_ps(a, b, _CMP_LT_OQ)
-#define _simd_cmpgt_ps(a, b) _simd_cmp_ps(a, b, _CMP_GT_OQ)
-#define _simd_cmpneq_ps(a, b) _simd_cmp_ps(a, b, _CMP_NEQ_OQ)
-#define _simd_cmpeq_ps(a, b) _simd_cmp_ps(a, b, _CMP_EQ_OQ)
-#define _simd_cmpge_ps(a, b) _simd_cmp_ps(a, b, _CMP_GE_OQ)
-#define _simd_cmple_ps(a, b) _simd_cmp_ps(a, b, _CMP_LE_OQ)
-
-SIMD_EMU_AVX512_2(simdscalar, _simd_and_ps, _mm256_and_ps)
-SIMD_EMU_AVX512_2(simdscalar, _simd_or_ps, _mm256_or_ps)
-SIMD_EMU_AVX512_1(simdscalar, _simd_rcp_ps, _mm256_rcp_ps)
-SIMD_EMU_AVX512_2(simdscalar, _simd_div_ps, _mm256_div_ps)
-
-INLINE simdscalar _simd_castsi_ps(simdscalari a)
-{
- return *reinterpret_cast<simdscalar *>(&a);
-}
-
-INLINE simdscalari _simd_castps_si(simdscalar a)
-{
- return *reinterpret_cast<simdscalari *>(&a);
-}
-
-INLINE simdscalard _simd_castsi_pd(simdscalari a)
-{
- return *reinterpret_cast<simdscalard *>(&a);
-}
-
-INLINE simdscalari _simd_castpd_si(simdscalard a)
-{
- return *reinterpret_cast<simdscalari *>(&a);
-}
-
-INLINE simdscalar _simd_castpd_ps(simdscalard a)
-{
- return *reinterpret_cast<simdscalar *>(&a);
-}
-
-INLINE simdscalard _simd_castps_pd(simdscalar a)
-{
- return *reinterpret_cast<simdscalard *>(&a);
-}
-
-SIMD_EMU_AVX512_2(simdscalar, _simd_andnot_ps, _mm256_andnot_ps)
-
-INLINE simdscalar _simd_round_ps(simdscalar a, const int mode)
-{
- simdscalar result;
-
- result.lo = _mm256_round_ps(a.lo, mode);
- result.hi = _mm256_round_ps(a.hi, mode);
-
- return result;
-}
-
-SIMD_EMU_AVX512_2(simdscalari, _simd_mul_epi32, _mm256_mul_epi32)
-SIMD_EMU_AVX512_2(simdscalari, _simd_mullo_epi32, _mm256_mullo_epi32)
-SIMD_EMU_AVX512_2(simdscalari, _simd_sub_epi32, _mm256_sub_epi32)
-SIMD_EMU_AVX512_2(simdscalari, _simd_sub_epi64, _mm256_sub_epi64)
-SIMD_EMU_AVX512_2(simdscalari, _simd_min_epi32, _mm256_min_epi32)
-SIMD_EMU_AVX512_2(simdscalari, _simd_max_epi32, _mm256_max_epi32)
-SIMD_EMU_AVX512_2(simdscalari, _simd_min_epu32, _mm256_min_epu32)
-SIMD_EMU_AVX512_2(simdscalari, _simd_max_epu32, _mm256_max_epu32)
-SIMD_EMU_AVX512_2(simdscalari, _simd_add_epi32, _mm256_add_epi32)
-SIMD_EMU_AVX512_2(simdscalari, _simd_and_si, _mm256_and_si256)
-SIMD_EMU_AVX512_2(simdscalari, _simd_andnot_si, _mm256_andnot_si256)
-SIMD_EMU_AVX512_2(simdscalari, _simd_or_si, _mm256_or_si256)
-SIMD_EMU_AVX512_2(simdscalari, _simd_xor_si, _mm256_xor_si256)
-SIMD_EMU_AVX512_2(simdscalari, _simd_cmpeq_epi32, _mm256_cmpeq_epi32)
-SIMD_EMU_AVX512_2(simdscalari, _simd_cmpgt_epi32, _mm256_cmpgt_epi32)
-
-INLINE int _simd_testz_ps(simdscalar a, simdscalar b)
-{
- int lo = _mm256_testz_ps(a.lo, b.lo);
- int hi = _mm256_testz_ps(a.hi, b.hi);
-
- return lo & hi;
-}
-
-#define _simd_cmplt_epi32(a, b) _simd_cmpgt_epi32(b, a)
-
-SIMD_EMU_AVX512_2(simdscalari, _simd_unpacklo_epi32, _mm256_unpacklo_epi32)
-SIMD_EMU_AVX512_2(simdscalari, _simd_unpackhi_epi32, _mm256_unpackhi_epi32)
-
-INLINE simdscalari _simd_slli_epi32(simdscalari a, const int imm8)
-{
- simdscalari result;
-
- result.lo = _mm256_slli_epi32(a.lo, imm8);
- result.hi = _mm256_slli_epi32(a.hi, imm8);
-
- return result;
-}
-
-INLINE simdscalari _simd_srai_epi32(simdscalari a, const int imm8)
-{
- simdscalari result;
-
- result.lo = _mm256_srai_epi32(a.lo, imm8);
- result.hi = _mm256_srai_epi32(a.hi, imm8);
-
- return result;
-}
-
-INLINE simdscalari _simd_srli_epi32(simdscalari a, const int imm8)
-{
- simdscalari result;
-
- result.lo = _mm256_srli_epi32(a.lo, imm8);
- result.hi = _mm256_srli_epi32(a.hi, imm8);
-
- return result;
-}
-
-#define _simd128_fmadd_ps _mm_fmadd_ps
-
-SIMD_EMU_AVX512_3(simdscalar, _simd_fmadd_ps, _mm256_fmadd_ps)
-SIMD_EMU_AVX512_3(simdscalar, _simd_fmsub_ps, _mm256_fmsub_ps)
-
-SIMD_EMU_AVX512_2(simdscalari, _simd_shuffle_epi8, _mm256_shuffle_epi8)
-SIMD_EMU_AVX512_2(simdscalari, _simd_adds_epu8, _mm256_adds_epu8)
-SIMD_EMU_AVX512_2(simdscalari, _simd_subs_epu8, _mm256_subs_epu8)
-SIMD_EMU_AVX512_2(simdscalari, _simd_add_epi8, _mm256_add_epi8)
-
-INLINE simdscalar _simd_i32gather_ps(float const *m, simdscalari a, const int imm8)
-{
- simdscalar result;
-
- result.lo = _mm256_i32gather_ps(m, a.lo, imm8);
- result.hi = _mm256_i32gather_ps(m, a.hi, imm8);
-
- return result;
-}
-
-SIMD_EMU_AVX512_1(simdscalari, _simd_abs_epi32, _mm256_abs_epi32)
-SIMD_EMU_AVX512_2(simdscalari, _simd_cmpeq_epi64, _mm256_cmpeq_epi64)
-SIMD_EMU_AVX512_2(simdscalari, _simd_cmpgt_epi64, _mm256_cmpgt_epi64)
-SIMD_EMU_AVX512_2(simdscalari, _simd_cmpeq_epi16, _mm256_cmpeq_epi16)
-SIMD_EMU_AVX512_2(simdscalari, _simd_cmpgt_epi16, _mm256_cmpgt_epi16)
-SIMD_EMU_AVX512_2(simdscalari, _simd_cmpeq_epi8, _mm256_cmpeq_epi8)
-SIMD_EMU_AVX512_2(simdscalari, _simd_cmpgt_epi8, _mm256_cmpgt_epi8)
-
-INLINE simdscalar _simd_permute_ps(simdscalar a, simdscalari b)
-{
- simdscalar result;
-
- result.lo = _mm256_permutevar8x32_ps(a.lo, b.lo);
- result.hi = _mm256_permutevar8x32_ps(a.hi, b.hi);
-
- return result;
-}
-
-SIMD_EMU_AVX512_2(simdscalari, _simd_permute_epi32, _mm256_permutevar8x32_epi32)
-
-SIMD_EMU_AVX512_2(simdscalari, _simd_srlv_epi32, _mm256_srlv_epi32)
-SIMD_EMU_AVX512_2(simdscalari, _simd_sllv_epi32, _mm256_sllv_epi32)
-
-INLINE simdscalar _simd_shuffle_ps(simdscalar a, simdscalar b, const int imm8)
-{
- simdscalar result;
-
- result.lo = _mm256_shuffle_ps(a.lo, b.lo, imm8);
- result.hi = _mm256_shuffle_ps(a.hi, b.hi, imm8);
-
- return result;
-}
-
-// convert bitmask to vector mask
-INLINE simdscalar vMask(int32_t mask)
-{
- simdscalari temp = _simd_set1_epi32(mask);
-
- simdscalari bits = _simd_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);
-
- simdscalari result = _simd_cmplt_epi32(_simd_setzero_si(), _simd_and_si(temp, bits));
-
- return _simd_castsi_ps(result);
-}
-
-#else
-
-INLINE __m512 _m512_broadcast_ss(void const *m)
-{
- return _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, 0);
-}
-
-INLINE __m512 _m512_broadcast_ps(void const *m)
-{
- return _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, 0);
-}
-
-INLINE __m512 _m512_blend_ps(__m512 a, __m512 b, const int mask)
-{
- const __mask16 mask16 = _mm512_int2mask(mask);
-
- return _mm512_mask_blend_ps(mask16, a, b);
-}
-
-INLINE __m512 _m512_blendv_ps(__m512 a, __m512 b, __m512 mask)
-{
- const __mask16 mask16 = _mm512_cmpeq_ps_mask(mask, _mm512_setzero_ps());
-
- return _mm512_mask_blend_ps(mask16, a, b);
-}
-
-INLINE int _m512_movemask_ps(__m512 a)
-{
- __m512 mask = _mm512_set1_epi32(0x80000000);
-
- __m512 temp = _mm512_and_epi32(a, mask);
-
- const __mask16 mask16 = _mm512_cmpeq_epu32_mask(temp, mask);
-
- return _mm512mask2int(mask16);
-}
-
-INLINE int _m512_movemask_pd(__m512 a)
-{
- __m512 mask = _mm512_set1_epi64(0x8000000000000000);
-
- __m512 temp = _mm512_and_epi64(a, mask);
-
- const __mask16 mask16 = _mm512_cmpeq_epu64_mask(temp, mask);
-
- return _mm512mask2int(mask16);
-}
-
-INLINE __m512 _m512_cmp_ps(__m512 a, __m512 b, __m512 comp)
-{
- const __mask16 mask16 = _mm512_cmpeq_ps_mask(a, b, comp);
-
- return _mm512_mask_blend_epi32(mask16, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE __m512 _mm512_cmplt_epi32(__m512 a, __m512 b)
-{
- const __mask16 mask16 = _mm512_cmplt_epi32_mask(a, b);
-
- return _mm512_mask_blend_epi32(mask16, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE __m512 _mm512_cmpgt_epi32(__m512 a, __m512 b)
-{
- const __mask16 mask16 = _mm512_cmpgt_epi32_mask(a, b);
-
- return _mm512_mask_blend_epi32(mask16, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-#define _simd_load_ps _mm512_load_ps
-#define _simd_load1_ps _mm256_broadcast_ss
-#define _simd_loadu_ps _mm512_loadu_ps
-#define _simd_setzero_ps _mm512_setzero_ps
-#define _simd_set1_ps _mm512_set1_ps
-#define _simd_blend_ps _mm512_blend_ps
-#define _simd_blendv_ps _mm512_blendv_ps
-#define _simd_store_ps _mm512_store_ps
-#define _simd_mul_ps _mm512_mul_ps
-#define _simd_add_ps _mm512_add_ps
-#define _simd_sub_ps _mm512_sub_ps
-#define _simd_rsqrt_ps _mm512_rsqrt28_ps
-#define _simd_min_ps _mm512_min_ps
-#define _simd_max_ps _mm512_max_ps
-#define _simd_movemask_ps _mm512_movemask_ps
-#define _simd_cvtps_epi32 _mm512_cvtps_epi32
-#define _simd_cvttps_epi32 _mm512_cvttps_epi32
-#define _simd_cvtepi32_ps _mm512_cvtepi32_ps
-#define _simd_cmplt_ps(a, b) _mm512_cmp_ps(a, b, _CMP_LT_OQ)
-#define _simd_cmpgt_ps(a, b) _mm512_cmp_ps(a, b, _CMP_GT_OQ)
-#define _simd_cmpneq_ps(a, b) _mm512_cmp_ps(a, b, _CMP_NEQ_OQ)
-#define _simd_cmpeq_ps(a, b) _mm512_cmp_ps(a, b, _CMP_EQ_OQ)
-#define _simd_cmpge_ps(a, b) _mm512_cmp_ps(a, b, _CMP_GE_OQ)
-#define _simd_cmple_ps(a, b) _mm512_cmp_ps(a, b, _CMP_LE_OQ)
-#define _simd_cmp_ps(a, b, comp) _mm512_cmp_ps(a, b, comp)
-#define _simd_and_ps _mm512_and_ps
-#define _simd_or_ps _mm512_or_ps
-#define _simd_rcp_ps _mm512_rcp28_ps
-#define _simd_div_ps _mm512_div_ps
-#define _simd_castsi_ps _mm512_castsi512_ps
-#define _simd_andnot_ps _mm512_andnot_ps
-#define _simd_round_ps _mm512_round_ps
-#define _simd_castpd_ps _mm512_castpd_ps
-#define _simd_broadcast_ps _m512_broadcast_ps
-#define _simd_movemask_pd _mm512_movemask_pd
-#define _simd_castsi_pd _mm512_castsi512_pd
-
-#define _simd_mul_epi32 _mm512_mul_epi32
-#define _simd_mullo_epi32 _mm512_mullo_epi32
-#define _simd_sub_epi32 _mm512_sub_epi32
-#define _simd_sub_epi64 _mm512_sub_epi64
-#define _simd_min_epi32 _mm512_min_epi32
-#define _simd_max_epi32 _mm512_max_epi32
-#define _simd_min_epu32 _mm512_min_epu32
-#define _simd_max_epu32 _mm512_max_epu32
-#define _simd_add_epi32 _mm512_add_epi32
-#define _simd_and_si _mm512_and_si512
-#define _simd_andnot_si _mm512_andnot_si512
-#define _simd_cmpeq_epi32 _mm512_cmpeq_epi32
-#define _simd_cmplt_epi32(a,b) _mm256_cmpgt_epi32(b,a)
-#define _simd_cmpgt_epi32(a,b) _mm256_cmpgt_epi32(a,b)
-#define _simd_or_si _mm512_or_si512
-#define _simd_castps_si _mm512_castps_si512
-
-#endif
-
#else
#error Unsupported vector width
#endif
__m256 result = _mm256_setzero_ps();
__m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
return _mm256_insertf128_ps(result, vLo, 0);
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
- simdscalar result = _simd_setzero_ps();
-
- __m128 src = _mm_load_ps(reinterpret_cast<const float*>(pSrc));
-
- result.lo = _mm256_insertf128_ps(result.lo, src, 0);
-
- return result;
-#endif
#else
#error Unsupported vector width
#endif
// store simd bytes
#if KNOB_SIMD_WIDTH == 8
_mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
- _mm_store_ps(reinterpret_cast<float*>(pDst), _mm256_castps256_ps128(src.lo));
-#endif
#else
#error Unsupported vector width
#endif
#elif KNOB_ARCH>=KNOB_ARCH_AVX2
return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
#endif
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
- simdscalari result;
-
- __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in.lo));
-
- result.lo = _mm256_cvtepu8_epi32(src);
-
- result.hi = _mm256_cvtepu8_epi32(_mm_srli_si128(src, 8));
-
- return _simd_castsi_ps(result);
-#endif
#else
#error Unsupported vector width
#endif
__m128i res16 = _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
__m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128());
return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
- simdscalari result = _simd_setzero_si();
-
- __m128i templo = _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(in.lo)), _mm256_extractf128_si256(_mm256_castps_si256(in.lo), 1));
-
- __m128i temphi = _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(in.hi)), _mm256_extractf128_si256(_mm256_castps_si256(in.hi), 1));
-
- __m128i temp = _mm_packus_epi16(templo, temphi);
-
- result.lo = _mm256_insertf128_si256(result.lo, temp, 0);
-
- return _simd_castsi_ps(result);
-#endif
#else
#error Unsupported vector width
#endif
__m256 result = _mm256_setzero_ps();
__m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
return _mm256_insertf128_ps(result, vLo, 0);
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
- simdscalar result = _simd_setzero_ps();
-
- __m128 src = _mm_load_ps(reinterpret_cast<const float*>(pSrc));
-
- result.lo = _mm256_insertf128_ps(result.lo, src, 0);
-
- return result;
-#endif
#else
#error Unsupported vector width
#endif
// store simd bytes
#if KNOB_SIMD_WIDTH == 8
_mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
- _mm_store_ps(reinterpret_cast<float*>(pDst), _mm256_castps256_ps128(src.lo));
-#endif
#else
#error Unsupported vector width
#endif
#elif KNOB_ARCH>=KNOB_ARCH_AVX2
return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
#endif
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
- simdscalari result;
-
- __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in.lo));
-
- result.lo = _mm256_cvtepu8_epi32(src);
-
- result.hi = _mm256_cvtepu8_epi32(_mm_srli_si128(src, 8));
-
- return _simd_castsi_ps(result);
-#endif
#else
#error Unsupported vector width
#endif
__m128i res16 = _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
__m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128());
return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
- simdscalari result = _simd_setzero_si();
-
- __m128i templo = _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(in.lo)), _mm256_extractf128_si256(_mm256_castps_si256(in.lo), 1));
-
- __m128i temphi = _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(in.hi)), _mm256_extractf128_si256(_mm256_castps_si256(in.hi), 1));
-
- __m128i temp = _mm_packs_epi16(templo, temphi);
-
- result.lo = _mm256_insertf128_si256(result.lo, temp, 0);
-
- return _simd_castsi_ps(result);
-#endif
#else
#error Unsupported vector width
#endif
__m256 result = _mm256_setzero_ps();
__m128 vLo = _mm_load_ps((const float*)pSrc);
return _mm256_insertf128_ps(result, vLo, 0);
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
- simdscalar result;
-
- result.lo = _mm256_load_ps(reinterpret_cast<const float*>(pSrc));
-
- result.hi = _mm256_undefined_ps();
-
- return result;
-#endif
#else
#error Unsupported vector width
#endif
#if KNOB_SIMD_WIDTH == 8
// store 16B (2B * 8)
_mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
- _mm256_store_ps(reinterpret_cast<float*>(pDst), src.lo);
-#endif
#else
#error Unsupported vector width
#endif
#elif KNOB_ARCH>=KNOB_ARCH_AVX2
return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
#endif
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
- simdscalari result;
-
- result.lo = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(in.lo), 0));
-
- result.hi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(in.lo), 1));
-
- return _simd_castsi_ps(result);
-#endif
#else
#error Unsupported vector width
#endif
simdscalari src = _simd_castps_si(in);
__m256i res = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
return _mm256_castsi256_ps(res);
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
- simdscalari result;
-
- __m256i inlo = _mm256_castps_si256(in.lo);
- __m256i inhi = _mm256_castps_si256(in.hi);
-
- __m256i templo = _mm256_permute2x128_si256(inlo, inhi, 0x20);
- __m256i temphi = _mm256_permute2x128_si256(inlo, inhi, 0x31);
-
- result.lo = _mm256_packus_epi32(templo, temphi);
- result.hi = _mm256_undefined_si256();
-
- return _simd_castsi_ps(result);
-#endif
#else
#error Unsupported vector width
#endif
__m256 result = _mm256_setzero_ps();
__m128 vLo = _mm_load_ps((const float*)pSrc);
return _mm256_insertf128_ps(result, vLo, 0);
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
- simdscalar result;
-
- result.lo = _mm256_load_ps(reinterpret_cast<const float*>(pSrc));
-
- result.hi = _mm256_undefined_ps();
-
- return result;
-#endif
#else
#error Unsupported vector width
#endif
#if KNOB_SIMD_WIDTH == 8
// store 16B (2B * 8)
_mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
- _mm256_store_ps(reinterpret_cast<float*>(pDst), src.lo);
-#endif
#else
#error Unsupported vector width
#endif
#elif KNOB_ARCH>=KNOB_ARCH_AVX2
return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
#endif
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
- simdscalari result;
-
- result.lo = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(in.lo), 0));
-
- result.hi = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(_mm256_castps_si256(in.lo), 1));
-
- return _simd_castsi_ps(result);
-#endif
#else
#error Unsupported vector width
#endif
simdscalari src = _simd_castps_si(in);
__m256i res = _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
return _mm256_castsi256_ps(res);
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
- simdscalari result;
-
- __m256i inlo = _mm256_castps_si256(in.lo);
- __m256i inhi = _mm256_castps_si256(in.hi);
-
- __m256i templo = _mm256_permute2x128_si256(inlo, inhi, 0x20);
- __m256i temphi = _mm256_permute2x128_si256(inlo, inhi, 0x31);
-
- result.lo = _mm256_packs_epi32(templo, temphi);
- result.hi = _mm256_undefined_si256();
-
- return _simd_castsi_ps(result);
-#endif
#else
#error Unsupported vector width
#endif
#else
return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC)));
#endif
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
-simdscalari result;
-
- __m128i templo = _mm256_cvtps_ph(in.lo, _MM_FROUND_TRUNC);
- __m128i temphi = _mm256_cvtps_ph(in.hi, _MM_FROUND_TRUNC);
-
- result.lo = _mm256_castsi128_si256(templo);
- result.lo = _mm256_insertf128_si256(result.lo, temphi, 1);
-
- result.hi = _mm256_undefined_si256();
-
- return _simd_castsi_ps(result);
-#endif
#else
#error Unsupported vector width
#endif
in = _mm256_insertf128_ps(in, srcLo, 0);
in = _mm256_insertf128_ps(in, srcHi, 1);
#endif
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
- __m128 inlo0 = _mm256_extractf128_ps(in.lo, 0);
- __m128 inlo1 = _mm256_extractf128_ps(in.lo, 1);
- __m128 inhi0 = _mm256_extractf128_ps(in.hi, 0);
- __m128 inhi1 = _mm256_extractf128_ps(in.hi, 1);
-
- inlo0 = ConvertFloatToSRGB2(inlo0);
- inlo1 = ConvertFloatToSRGB2(inlo1);
- inhi0 = ConvertFloatToSRGB2(inhi0);
- inhi1 = ConvertFloatToSRGB2(inhi1);
-
- in.lo = _mm256_insertf128_ps(in.lo, inlo0, 0);
- in.lo = _mm256_insertf128_ps(in.lo, inlo1, 1);
- in.hi = _mm256_insertf128_ps(in.hi, inhi0, 0);
- in.hi = _mm256_insertf128_ps(in.hi, inhi1, 1);
-#endif
#else
#error Unsupported vector width
#endif
#endif
#endif
-#if KNOB_SIMD_WIDTH == 8 || KNOB_SIMD_WIDTH == 16
+#if KNOB_SIMD_WIDTH == 8
INLINE
void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2)
{
simdscalari dst = _mm256_or_si256(dst01, dst23);
_simd_store_si((simdscalari*)pDst, dst);
#endif
-#elif KNOB_SIMD_WIDTH == 16
- simdscalari mask0 = _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800);
-
- simdscalari dst01 = _simd_shuffle_epi8(src, mask0);
-
- simdscalari perm1 = _simd_permute_128(src, src, 1);
-
- simdscalari mask1 = _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080);
-
- simdscalari dst23 = _simd_shuffle_epi8(perm1, mask1);
-
- simdscalari dst = _simd_or_si(dst01, dst23);
-
- _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);
#else
#error Unsupported vector width
#endif
__m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg
rg = _mm_unpacklo_epi8(rg, g);
_mm_store_si128((__m128i*)pDst, rg);
-#elif KNOB_SIMD_WIDTH == 16
- __m256i src = _mm256_load_si256(reinterpret_cast<const __m256i *>(pSrc)); // rrrrrrrrrrrrrrrrgggggggggggggggg
-
- __m256i r = _mm256_permute4x64_epi64(src, 0x50); // 0x50 = 01010000b // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx
-
- __m256i g = _mm256_permute4x64_epi64(src, 0xFA); // 0xFA = 11111010b // ggggggggxxxxxxxxggggggggxxxxxxxx
-
- __m256i dst = _mm256_unpacklo_epi8(r, g); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
-
- _mm256_store_si256(reinterpret_cast<__m256i *>(pDst), dst);
#else
#error Unsupported vector width
#endif
_mm_store_ps((float*)pDst+20, vDst[5]);
_mm_store_ps((float*)pDst+24, vDst[6]);
_mm_store_ps((float*)pDst+28, vDst[7]);
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
- simdscalar src0 = _simd_load_ps(reinterpret_cast<const float*>(pSrc));
- simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
- simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
- simdscalar src3 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 48);
-
- __m128 vDst[8];
-
- vTranspose4x8(vDst, src0.lo, src1.lo, src2.lo, src3.lo);
-
- _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]);
-
- vTranspose4x8(vDst, src0.hi, src1.hi, src2.hi, src3.hi);
-
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]);
-#endif
#else
#error Unsupported vector width
#endif
_mm_store_ps((float*)pDst + 20, vDst[5]);
_mm_store_ps((float*)pDst + 24, vDst[6]);
_mm_store_ps((float*)pDst + 28, vDst[7]);
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
- simdscalar src0 = _simd_load_ps(reinterpret_cast<const float*>(pSrc));
- simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
- simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
-
- __m128 vDst[8];
-
- vTranspose3x8(vDst, src0.lo, src1.lo, src2.lo);
-
- _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]);
-
- vTranspose3x8(vDst, src0.hi, src1.hi, src2.hi);
-
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]);
- _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]);
-#endif
#else
#error Unsupported vector width
#endif
_mm_store_ps(pfDst + 4, dst1);
_mm_store_ps(pfDst + 8, dst2);
_mm_store_ps(pfDst + 12, dst3);
-#elif KNOB_SIMD_WIDTH == 16
- const float* pfSrc = (const float*)pSrc;
- __m256 src_r0 = _mm256_load_ps(pfSrc + 0);
- __m256 src_r1 = _mm256_load_ps(pfSrc + 8);
- __m256 src_g0 = _mm256_load_ps(pfSrc + 16);
- __m256 src_g1 = _mm256_load_ps(pfSrc + 24);
-
- __m256 dst0 = _mm256_unpacklo_ps(src_r0, src_g0);
- __m256 dst1 = _mm256_unpackhi_ps(src_r0, src_g0);
- __m256 dst2 = _mm256_unpacklo_ps(src_r1, src_g1);
- __m256 dst3 = _mm256_unpackhi_ps(src_r1, src_g1);
-
- float* pfDst = (float*)pDst;
- _mm256_store_ps(pfDst + 0, dst0);
- _mm256_store_ps(pfDst + 8, dst1);
- _mm256_store_ps(pfDst + 16, dst2);
- _mm256_store_ps(pfDst + 24, dst3);
#else
#error Unsupported vector width
#endif
_mm_store_si128(((__m128i*)pDst) + 1, dst1);
_mm_store_si128(((__m128i*)pDst) + 2, dst2);
_mm_store_si128(((__m128i*)pDst) + 3, dst3);
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
- simdscalari src_rg = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc));
- simdscalari src_ba = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc + sizeof(simdscalari)));
-
- __m256i src_r = src_rg.lo;
- __m256i src_g = src_rg.hi;
- __m256i src_b = src_ba.lo;
- __m256i src_a = src_ba.hi;
-
- __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
- __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
- __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
- __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);
-
- __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
- __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
- __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
- __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);
-
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
-#endif
#else
#error Unsupported vector width
#endif
_mm_store_si128(((__m128i*)pDst) + 1, dst1);
_mm_store_si128(((__m128i*)pDst) + 2, dst2);
_mm_store_si128(((__m128i*)pDst) + 3, dst3);
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
- simdscalari src_rg = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc));
-
- __m256i src_r = src_rg.lo;
- __m256i src_g = src_rg.hi;
- __m256i src_b = _mm256_load_si256(reinterpret_cast<const __m256i*>(pSrc + sizeof(simdscalari)));
- __m256i src_a = _mm256_undefined_si256();
-
- __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
- __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
- __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
- __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);
-
- __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
- __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
- __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
- __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);
-
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
-#endif
#else
#error Unsupported vector width
#endif
_mm_store_si128((__m128i*)pDst, resLo);
_mm_store_si128((__m128i*)pDst + 1, resHi);
-#elif KNOB_SIMD_WIDTH == 16
-#if ENABLE_AVX512_EMULATION
- simdscalari src = _simd_castps_si(_simd_load_ps(reinterpret_cast<const float*>(pSrc)));
-
- simdscalari result;
-
- result.lo = _mm256_unpacklo_epi16(src.lo, src.hi);
- result.hi = _mm256_unpackhi_epi16(src.lo, src.hi);
-
- _simd_store_si(reinterpret_cast<simdscalari *>(pDst), result);
-#endif
#else
#error Unsupported vector width
#endif