From: Tim Rowley Date: Mon, 19 Dec 2016 21:25:52 +0000 (-0600) Subject: swr: [rasterizer core] fix SIMD16 transpose functions X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=bd22c3d41151ce265e61d64f9034928f83d3c959;p=mesa.git swr: [rasterizer core] fix SIMD16 transpose functions Fixed Transpose_16 methods of following formats: Transpose8_8_8_8 Transpose8_8 Transpose32_32 Transpose16_16_16_16 Transpose16_16_16 Transpose16_16 Reviewed-by: Bruce Cherniak --- diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h index cf6a6b6883f..94da225c651 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h @@ -536,6 +536,15 @@ INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b) #define _simd16_cmplt_epi32(a, b) _simd16_cmpgt_epi32(b, a) +SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpacklo_ps, _simd_unpacklo_ps) +SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpackhi_ps, _simd_unpackhi_ps) +SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpacklo_pd, _simd_unpacklo_pd) +SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpackhi_pd, _simd_unpackhi_pd) + +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi8, _simd_unpacklo_epi8) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi8, _simd_unpackhi_epi8) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi16, _simd_unpacklo_epi16) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi16, _simd_unpackhi_epi16) SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi32, _simd_unpacklo_epi32) SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi32, _simd_unpackhi_epi32) SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi64, _simd_unpacklo_epi64) @@ -583,24 +592,38 @@ INLINE simd16scalari _simd16_srli_epi32_temp(simd16scalari a) SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _simd_fmadd_ps) SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _simd_fmsub_ps) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _simd_shuffle_epi8) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _simd_adds_epu8) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _simd_subs_epu8) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _simd_add_epi8) +//__m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale) +template +INLINE simd16scalar _simd16_i32gather_ps_temp(const float *m, simd16scalari index) +{ + simd16scalar result; -template -INLINE simd16scalar _simd16_i32gather_ps_temp(float const *m, simd16scalari a) + result.lo = _simd_i32gather_ps(m, index.lo, scale); + result.hi = _simd_i32gather_ps(m, index.hi, scale); + + return result; +} + +#define _simd16_i32gather_ps(m, index, scale) _simd16_i32gather_ps_temp(m, index) + +//__m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale) +template +INLINE simd16scalar _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask) { simd16scalar result; - result.lo = _mm256_i32gather_ps(m, a.lo, imm8); - result.hi = _mm256_i32gather_ps(m, a.hi, imm8); + result.lo = _simd_mask_i32gather_ps(a.lo, m, index.lo, _simd_castsi_ps(mask.lo), scale); + result.hi = _simd_mask_i32gather_ps(a.hi, m, index.hi, _simd_castsi_ps(mask.hi), scale); return result; } -#define _simd16_i32gather_ps(m, a, imm8) _simd16_i32gather_ps_temp(m, a) +#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp(a, m, mask, index) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _simd_shuffle_epi8) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _simd_adds_epu8) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _simd_subs_epu8) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _simd_add_epi8) SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _simd_abs_epi32) SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _simd_cmpeq_epi64) SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _simd_cmpgt_epi64) @@ -716,6 +739,39 @@ INLINE simd16scalari _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b #define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp(a, b) +INLINE simd16scalari _simd16_cvtepu8_epi16(simdscalari a) +{ + simd16scalari result; + + result.lo = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 0)); + result.hi = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 1)); + + return result; +} + +INLINE simd16scalari _simd16_cvtepu8_epi32(__m128i a) +{ + simd16scalari result; + + result.lo = _simd_cvtepu8_epi32(a); + result.hi = _simd_cvtepu8_epi32(_mm_srli_si128(a, 8)); + + return result; +} + +INLINE simd16scalari _simd16_cvtepu16_epi32(simdscalari a) +{ + simd16scalari result; + + result.lo = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 0)); + result.hi = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 1)); + + return result; +} + +SIMD16_EMU_AVX512_2(simd16scalari, _simd_packus_epi32, _mm256_packus_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd_packs_epi32, _mm256_packs_epi32) + INLINE simd16mask _simd16_int2mask(int mask) { return mask; @@ -939,6 +995,14 @@ INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b) #endif +#define _simd16_unpacklo_ps _mm512_unpacklo_ps +#define _simd16_unpackhi_ps _mm512_unpackhi_ps +#define _simd16_unpacklo_pd _mm512_unpacklo_pd +#define _simd16_unpackhi_pd _mm512_unpackhi_pd +#define _simd16_unpacklo_epi8 _mm512_unpacklo_epi8 +#define _simd16_unpackhi_epi8 _mm512_unpackhi_epi8 +#define _simd16_unpacklo_epi16 _mm512_unpacklo_epi16 +#define _simd16_unpackhi_epi16 _mm512_unpackhi_epi16 #define _simd16_unpacklo_epi32 _mm512_unpacklo_epi32 #define _simd16_unpackhi_epi32 _mm512_unpackhi_epi32 #define _simd16_unpacklo_epi64 _mm512_unpacklo_epi64 @@ -953,7 +1017,11 @@ INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b) #define _simd16_add_epi8 _mm512_add_epi8 #define _simd16_shuffle_epi8 _mm512_shuffle_epi8 -#define _simd16_i32gather_ps(m, index, scale) _mm512_i32gather_ps(index, m, scale) +#define _simd16_fmadd_ps _mm512_fmadd_ps +#define _simd16_fmsub_ps _mm512_fmsub_ps + +#define _simd16_i32gather_ps(m, index, scale) _mm512_i32gather_ps(index, m, scale) +#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _mm512_mask_i32gather_ps(a, m, index, mask, scale) #define _simd16_abs_epi32 _mm512_abs_epi32 #define _simd16_cmpeq_epi64 _mm512_abs_epi32 @@ -1009,6 +1077,11 @@ INLINE simd16scalari _simd16_cmpgt_epi8(simd16scalari a, simd16scalari b) #define _simd16_permute2f128_si _mm512_shuffle_i32x4 #define _simd16_shuffle_ps _mm512_shuffle_ps #define _simd16_shuffle_pd _mm512_shuffle_pd +#define _simd16_cvtepu8_epi16 _mm512_cvtepu8_epi16 +#define _simd16_cvtepu8_epi32 _mm512_cvtepu8_epi32 +#define _simd16_cvtepu16_epi32 _mm512_cvtepu16_epi32 +#define _simd16_packus_epi32 _mm512_packus_epi32 +#define _simd16_packs_epi32 _mm512_packs_epi32 template INLINE simd16scalari _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b) diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h index e777b22ec1c..671e3b82a34 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h @@ -422,11 +422,22 @@ int _simdemu_movemask_epi8(__m256i a) return (resHi << 16) | resLo; } +INLINE +__m256i _simd_cvtepu8_epi16(__m128i a) +{ + __m128i resultlo = _mm_cvtepu8_epi16(a); + __m128i resulthi = _mm_cvtepu8_epi16(_mm_srli_si128(a, 8)); + + __m256i result = _mm256_castsi128_si256(resultlo); + + return _mm256_insertf128_si256(result, resulthi, 1); +} + INLINE __m256i _simd_cvtepu8_epi32(__m128i a) { __m128i resultlo = _mm_cvtepu8_epi32(a); - __m128i resulthi = _mm_shuffle_epi8(a, _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004)); + __m128i resulthi = _mm_cvtepu8_epi32(_mm_srli_si128(a, 4)); __m256i result = _mm256_castsi128_si256(resultlo); @@ -437,7 +448,7 @@ INLINE __m256i _simd_cvtepu16_epi32(__m128i a) { __m128i resultlo = _mm_cvtepu16_epi32(a); - __m128i resulthi = _mm_shuffle_epi8(a, _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908)); + __m128i resulthi = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8)); __m256i result = _mm256_castsi128_si256(resultlo); @@ -534,6 +545,7 @@ __m256i _simd_packs_epi32(__m256i a, __m256i b) #define _simd_permute_epi32 _mm256_permutevar8x32_epi32 #define _simd_srlv_epi32 _mm256_srlv_epi32 #define _simd_sllv_epi32 _mm256_sllv_epi32 +#define _simd_cvtepu8_epi16 _mm256_cvtepu8_epi16 #define _simd_cvtepu8_epi32 _mm256_cvtepu8_epi32 #define _simd_cvtepu16_epi32 _mm256_cvtepu16_epi32 #define _simd_packus_epi32 _mm256_packus_epi32 @@ -542,7 +554,9 @@ __m256i _simd_packs_epi32(__m256i a, __m256i b) #endif #define _simd_unpacklo_ps _mm256_unpacklo_ps +#define _simd_unpackhi_ps _mm256_unpackhi_ps #define _simd_unpacklo_pd _mm256_unpacklo_pd +#define _simd_unpackhi_pd _mm256_unpackhi_pd #define _simd_insertf128_ps _mm256_insertf128_ps #define _simd_insertf128_pd _mm256_insertf128_pd #define _simd_insertf128_si _mm256_insertf128_si256 diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h index 0e2cb47f2bd..6caee16623b 100644 --- a/src/gallium/drivers/swr/rasterizer/core/utils.h +++ b/src/gallium/drivers/swr/rasterizer/core/utils.h @@ -194,6 +194,29 @@ void vTranspose4x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1); } +#if ENABLE_AVX512_SIMD16 +INLINE +void vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3) +{ + const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); // pre-permute input to setup the right order after all the unpacking + + simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r + simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g + simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b + simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a + + simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2); + simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3); + simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2); + simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3); + + dst[0] = _simd16_unpacklo_ps(rblo, galo); + dst[1] = _simd16_unpackhi_ps(rblo, galo); + dst[2] = _simd16_unpacklo_ps(rbhi, gahi); + dst[3] = _simd16_unpackhi_ps(rbhi, gahi); +} + +#endif INLINE void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7) { @@ -296,21 +319,23 @@ struct Transpose8_8_8_8 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) { - simd16scalari src = _simd16_load_si(reinterpret_cast(pSrc)); + __m128i src0 = _mm_load_si128(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + __m128i src1 = _mm_load_si128(reinterpret_cast(pSrc) + 1); // gggggggggggggggg + __m128i src2 = _mm_load_si128(reinterpret_cast(pSrc) + 2); // bbbbbbbbbbbbbbbb + __m128i src3 = _mm_load_si128(reinterpret_cast(pSrc) + 3); // aaaaaaaaaaaaaaaa - simd16scalari mask0 = _simd16_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800); + simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0); + simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1); + simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2); + simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3); - simd16scalari dst01 = _simd16_shuffle_epi8(src, mask0); + simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8); + simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16); + simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24); - simd16scalari perm1 = _simd16_permute2f128_si(src, src, 1); + simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3)); - simd16scalari mask1 = _simd16_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080); - - simd16scalari dst23 = _simd16_shuffle_epi8(perm1, mask1); - - simd16scalari dst = _simd16_or_si(dst01, dst23); - - _simd16_store_si(reinterpret_cast(pDst), dst); + _simd16_store_si(reinterpret_cast(pDst), dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba } #endif }; @@ -357,17 +382,17 @@ struct Transpose8_8 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) { - simdscalari r = _simd_load_si(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrrgggggggggggggggg + __m128i src0 = _mm_load_si128(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + __m128i src1 = _mm_load_si128(reinterpret_cast(pSrc) + 1); // gggggggggggggggg - simdscalari g = _simd_permute2f128_si(r, r, 1); // ggggggggggggggggxxxxxxxxxxxxxxxx + simdscalari cvt0 = _simd_cvtepu8_epi16(src0); + simdscalari cvt1 = _simd_cvtepu8_epi16(src1); - r = _simd_insertf128_si(r, _mm_srli_si128(_simd_extractf128_si(r, 0), 8), 1); // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx + simdscalari shl1 = _simd_slli_epi32(cvt1, 8); - g = _simd_insertf128_si(g, _mm_srli_si128(_simd_extractf128_si(g, 0), 8), 1); // ggggggggxxxxxxxxggggggggxxxxxxxx + simdscalari dst = _simd_or_si(cvt0, shl1); - simdscalari dst = _simd_unpacklo_epi8(r, g); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg - - _simd_store_si(reinterpret_cast(pDst), dst); + _simd_store_si(reinterpret_cast(pDst), dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg } #endif }; @@ -412,17 +437,14 @@ struct Transpose32_32_32_32 simd16scalar src2 = _simd16_load_ps(reinterpret_cast(pSrc) + 32); simd16scalar src3 = _simd16_load_ps(reinterpret_cast(pSrc) + 48); - __m128 vDst[8]; + simd16scalar dst[4]; - vTranspose4x8(vDst, _simd16_extract_ps(src0, 0), _simd16_extract_ps(src1, 0), _simd16_extract_ps(src2, 0), _simd16_extract_ps(src3, 0)); + vTranspose4x16(dst, src0, src1, src2, src3); - _simd16_store_ps(reinterpret_cast(pDst) + 0, reinterpret_cast(vDst)[0]); - _simd16_store_ps(reinterpret_cast(pDst) + 16, reinterpret_cast(vDst)[1]); - - vTranspose4x8(vDst, _simd16_extract_ps(src0, 1), _simd16_extract_ps(src1, 1), _simd16_extract_ps(src2, 1), _simd16_extract_ps(src3, 1)); - - _simd16_store_ps(reinterpret_cast(pDst) + 32, reinterpret_cast(vDst)[0]); - _simd16_store_ps(reinterpret_cast(pDst) + 48, reinterpret_cast(vDst)[1]); + _simd16_store_ps(reinterpret_cast(pDst) + 0, dst[0]); + _simd16_store_ps(reinterpret_cast(pDst) + 16, dst[1]); + _simd16_store_ps(reinterpret_cast(pDst) + 32, dst[2]); + _simd16_store_ps(reinterpret_cast(pDst) + 48, dst[3]); } #endif }; @@ -464,18 +486,16 @@ struct Transpose32_32_32 simd16scalar src0 = _simd16_load_ps(reinterpret_cast(pSrc)); simd16scalar src1 = _simd16_load_ps(reinterpret_cast(pSrc) + 16); simd16scalar src2 = _simd16_load_ps(reinterpret_cast(pSrc) + 32); + simd16scalar src3 = _simd16_setzero_ps(); - __m128 vDst[8]; - - vTranspose3x8(vDst, _simd16_extract_ps(src0, 0), _simd16_extract_ps(src1, 0), _simd16_extract_ps(src2, 0)); - - _simd16_store_ps(reinterpret_cast(pDst) + 0, reinterpret_cast(vDst)[0]); - _simd16_store_ps(reinterpret_cast(pDst) + 16, reinterpret_cast(vDst)[1]); + simd16scalar dst[4]; - vTranspose3x8(vDst, _simd16_extract_ps(src0, 1), _simd16_extract_ps(src1, 1), _simd16_extract_ps(src2, 1)); + vTranspose4x16(dst, src0, src1, src2, src3); - _simd16_store_ps(reinterpret_cast(pDst) + 32, reinterpret_cast(vDst)[0]); - _simd16_store_ps(reinterpret_cast(pDst) + 48, reinterpret_cast(vDst)[1]); + _simd16_store_ps(reinterpret_cast(pDst) + 0, dst[0]); + _simd16_store_ps(reinterpret_cast(pDst) + 16, dst[1]); + _simd16_store_ps(reinterpret_cast(pDst) + 32, dst[2]); + _simd16_store_ps(reinterpret_cast(pDst) + 48, dst[3]); } #endif }; @@ -516,20 +536,20 @@ struct Transpose32_32 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) { - simdscalar src_r0 = _simd_load_ps(reinterpret_cast(pSrc)); - simdscalar src_r1 = _simd_load_ps(reinterpret_cast(pSrc) + 8); - simdscalar src_g0 = _simd_load_ps(reinterpret_cast(pSrc) + 16); - simdscalar src_g1 = _simd_load_ps(reinterpret_cast(pSrc) + 24); - - simdscalar dst0 = _simd_unpacklo_ps(src_r0, src_g0); - simdscalar dst1 = _simd_unpacklo_ps(src_r0, src_g0); - simdscalar dst2 = _simd_unpacklo_ps(src_r1, src_g1); - simdscalar dst3 = _simd_unpacklo_ps(src_r1, src_g1); - - _simd_store_ps(reinterpret_cast(pDst) + 0, dst0); - _simd_store_ps(reinterpret_cast(pDst) + 8, dst1); - _simd_store_ps(reinterpret_cast(pDst) + 16, dst2); - _simd_store_ps(reinterpret_cast(pDst) + 24, dst3); + simd16scalar src0 = _simd16_load_ps(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + simd16scalar src1 = _simd16_load_ps(reinterpret_cast(pSrc) + 16); // gggggggggggggggg + + simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD + simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF + + simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7 + simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF + + simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7 + simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF + + _simd16_store_ps(reinterpret_cast(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg + _simd16_store_ps(reinterpret_cast(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg } #endif }; @@ -576,28 +596,30 @@ struct Transpose16_16_16_16 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) { - simd16scalari src_rg = _simd16_load_si(reinterpret_cast(pSrc)); - simd16scalari src_ba = _simd16_load_si(reinterpret_cast(pSrc + sizeof(simd16scalari))); - - simdscalari src_r = _simd16_extract_si(src_rg, 0); - simdscalari src_g = _simd16_extract_si(src_rg, 1); - simdscalari src_b = _simd16_extract_si(src_ba, 0); - simdscalari src_a = _simd16_extract_si(src_ba, 1); - - simdscalari rg0 = _simd_unpacklo_epi16(src_r, src_g); - simdscalari rg1 = _simd_unpackhi_epi16(src_r, src_g); - simdscalari ba0 = _simd_unpacklo_epi16(src_b, src_a); - simdscalari ba1 = _simd_unpackhi_epi16(src_b, src_a); - - simdscalari dst0 = _simd_unpacklo_epi32(rg0, ba0); - simdscalari dst1 = _simd_unpackhi_epi32(rg0, ba0); - simdscalari dst2 = _simd_unpacklo_epi32(rg1, ba1); - simdscalari dst3 = _simd_unpackhi_epi32(rg1, ba1); - - _simd_store_si(reinterpret_cast(pDst) + 0, dst0); - _simd_store_si(reinterpret_cast(pDst) + 1, dst1); - _simd_store_si(reinterpret_cast(pDst) + 2, dst2); - _simd_store_si(reinterpret_cast(pDst) + 3, dst3); + simdscalari src0 = _simd_load_si(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + simdscalari src1 = _simd_load_si(reinterpret_cast(pSrc) + 1); // gggggggggggggggg + simdscalari src2 = _simd_load_si(reinterpret_cast(pSrc) + 2); // bbbbbbbbbbbbbbbb + simdscalari src3 = _simd_load_si(reinterpret_cast(pSrc) + 3); // aaaaaaaaaaaaaaaa + + simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB + simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF + simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB + simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF + + simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 + simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB + simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD + simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF + + simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 + simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 + simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB + simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF + + _simd_store_si(reinterpret_cast(pDst) + 0, dst0); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 1, dst1); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 2, dst2); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 3, dst3); // rgbargbargbargba } #endif }; @@ -643,27 +665,30 @@ struct Transpose16_16_16 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) { - simd16scalari src_rg = _simd16_load_si(reinterpret_cast(pSrc)); - - simdscalari src_r = _simd16_extract_si(src_rg, 0); - simdscalari src_g = _simd16_extract_si(src_rg, 1); - simdscalari src_b = _simd_load_si(reinterpret_cast(pSrc + sizeof(simd16scalari))); - simdscalari src_a = _mm256_undefined_si256(); - - simdscalari rg0 = _simd_unpacklo_epi16(src_r, src_g); - simdscalari rg1 = _simd_unpackhi_epi16(src_r, src_g); - simdscalari ba0 = _simd_unpacklo_epi16(src_b, src_a); - simdscalari ba1 = _simd_unpackhi_epi16(src_b, src_a); - - simdscalari dst0 = _simd_unpacklo_epi32(rg0, ba0); - simdscalari dst1 = _simd_unpackhi_epi32(rg0, ba0); - simdscalari dst2 = _simd_unpacklo_epi32(rg1, ba1); - simdscalari dst3 = _simd_unpackhi_epi32(rg1, ba1); - - _simd_store_si(reinterpret_cast(pDst) + 0, dst0); - _simd_store_si(reinterpret_cast(pDst) + 1, dst1); - _simd_store_si(reinterpret_cast(pDst) + 2, dst2); - _simd_store_si(reinterpret_cast(pDst) + 3, dst3); + simdscalari src0 = _simd_load_si(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + simdscalari src1 = _simd_load_si(reinterpret_cast(pSrc) + 1); // gggggggggggggggg + simdscalari src2 = _simd_load_si(reinterpret_cast(pSrc) + 2); // bbbbbbbbbbbbbbbb + simdscalari src3 = _simd_setzero_si(); // aaaaaaaaaaaaaaaa + + simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB + simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF + simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB + simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF + + simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 + simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB + simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD + simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF + + simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 + simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 + simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB + simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF + + _simd_store_si(reinterpret_cast(pDst) + 0, dst0); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 1, dst1); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 2, dst2); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 3, dst3); // rgbargbargbargba } #endif }; @@ -701,17 +726,17 @@ struct Transpose16_16 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) { - simd16scalari result = _simd16_setzero_si(); - - simd16scalari src = _simd16_load_si(reinterpret_cast(pSrc)); + simdscalari src0 = _simd_load_si(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + simdscalari src1 = _simd_load_si(reinterpret_cast(pSrc) + 1); // gggggggggggggggg - simdscalari srclo = _simd16_extract_si(src, 0); - simdscalari srchi = _simd16_extract_si(src, 1); + simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB + simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF - result = _simd16_insert_si(result, _simd_unpacklo_epi16(srclo, srchi), 0); - result = _simd16_insert_si(result, _simd_unpackhi_epi16(srclo, srchi), 1); + simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x00); // (0, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7 + simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x11); // (1, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF - _simd16_store_si(reinterpret_cast(pDst), result); + _simd_store_si(reinterpret_cast(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg + _simd_store_si(reinterpret_cast(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg } #endif };