X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fswr%2Frasterizer%2Fcore%2Futils.h;h=c4162b4e71cb98738bc868d19913f89f45789faf;hb=8cd8240cfce1e26f2f237f1eb98d46ba47bca626;hp=63ecd5cfe1b824ad805b4ee2c343d43833aae830;hpb=27cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4e;p=mesa.git diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h index 63ecd5cfe1b..c4162b4e71c 100644 --- a/src/gallium/drivers/swr/rasterizer/core/utils.h +++ b/src/gallium/drivers/swr/rasterizer/core/utils.h @@ -29,29 +29,17 @@ #include #include +#include #include "common/os.h" #include "common/simdintrin.h" #include "common/swr_assert.h" - -#if defined(_WIN32) -void SaveImageToPNGFile( - const WCHAR *pFilename, - void *pBuffer, - uint32_t width, - uint32_t height); - -void OpenBitmapFromFile( - const WCHAR *pFilename, - void **pBuffer, - uint32_t *width, - uint32_t *height); -#endif +#include "core/api.h" #if defined(_WIN64) || defined(__x86_64__) #define _MM_INSERT_EPI64 _mm_insert_epi64 #define _MM_EXTRACT_EPI64 _mm_extract_epi64 #else -INLINE INT64 _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx) +INLINE int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx) { OSALIGNLINE(uint32_t) elems[4]; _mm_store_si128((__m128i*)elems, a); @@ -69,7 +57,7 @@ INLINE INT64 _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx) } } -INLINE __m128i _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx) +INLINE __m128i _MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx) { OSALIGNLINE(int64_t) elems[2]; _mm_store_si128((__m128i*)elems, a); @@ -87,36 +75,12 @@ INLINE __m128i _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx) } #endif -OSALIGNLINE(struct) BBOX -{ - int top{ 0 }; - int bottom{ 0 }; - int left{ 0 }; - int right{ 0 }; - - BBOX() {} - BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {} - - bool operator==(const BBOX& rhs) - { - return (this->top == rhs.top && - this->bottom == rhs.bottom && - this->left == rhs.left && - this->right == rhs.right); - } - - bool operator!=(const BBOX& rhs) - { - return !(*this == rhs); - } -}; - struct simdBBox { - simdscalari top; - simdscalari bottom; - simdscalari left; - simdscalari right; + simdscalari ymin; + simdscalari ymax; + simdscalari xmin; + simdscalari xmax; }; INLINE @@ -173,7 +137,7 @@ void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3) + __GNUC_MINOR__ * 100 \ + __GNUC_PATCHLEVEL__) -#if defined(__GNUC__) && (GCC_VERSION < 40900) +#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900)) #define _mm_undefined_ps _mm_setzero_ps #define _mm_undefined_si128 _mm_setzero_si128 #if KNOB_SIMD_WIDTH == 8 @@ -183,7 +147,7 @@ void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3) #if KNOB_SIMD_WIDTH == 8 INLINE -void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2) +void vTranspose3x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2) { __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5 @@ -207,7 +171,7 @@ void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc } INLINE -void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3) +void vTranspose4x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2, const __m256 &vSrc3) { __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5 @@ -230,6 +194,29 @@ void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1); } +#if ENABLE_AVX512_SIMD16 +INLINE +void vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3) +{ + const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); // pre-permute input to setup the right order after all the unpacking + + simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r + simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g + simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b + simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a + + simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2); + simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3); + simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2); + simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3); + + dst[0] = _simd16_unpacklo_ps(rblo, galo); + dst[1] = _simd16_unpackhi_ps(rblo, galo); + dst[2] = _simd16_unpacklo_ps(rbhi, gahi); + dst[3] = _simd16_unpackhi_ps(rbhi, gahi); +} + +#endif INLINE void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7) { @@ -281,6 +268,13 @@ struct TransposeSingleComponent { memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8); } +#if ENABLE_AVX512_SIMD16 + + INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + { + memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8); + } +#endif }; ////////////////////////////////////////////////////////////////////////// @@ -295,6 +289,7 @@ struct Transpose8_8_8_8 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { simdscalari src = _simd_load_si((const simdscalari*)pSrc); + #if KNOB_SIMD_WIDTH == 8 #if KNOB_ARCH == KNOB_ARCH_AVX __m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg @@ -320,6 +315,29 @@ struct Transpose8_8_8_8 #error Unsupported vector width #endif } +#if ENABLE_AVX512_SIMD16 + + INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + { + __m128i src0 = _mm_load_si128(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + __m128i src1 = _mm_load_si128(reinterpret_cast(pSrc) + 1); // gggggggggggggggg + __m128i src2 = _mm_load_si128(reinterpret_cast(pSrc) + 2); // bbbbbbbbbbbbbbbb + __m128i src3 = _mm_load_si128(reinterpret_cast(pSrc) + 3); // aaaaaaaaaaaaaaaa + + simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0); + simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1); + simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2); + simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3); + + simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8); + simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16); + simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24); + + simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3)); + + _simd16_store_si(reinterpret_cast(pDst), dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba + } +#endif }; ////////////////////////////////////////////////////////////////////////// @@ -332,6 +350,10 @@ struct Transpose8_8_8 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif }; ////////////////////////////////////////////////////////////////////////// @@ -345,9 +367,9 @@ struct Transpose8_8 /// @param pDst - output data in AOS form INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { +#if KNOB_SIMD_WIDTH == 8 simdscalari src = _simd_load_si((const simdscalari*)pSrc); -#if KNOB_SIMD_WIDTH == 8 __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg __m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg rg = _mm_unpacklo_epi8(rg, g); @@ -356,6 +378,23 @@ struct Transpose8_8 #error Unsupported vector width #endif } +#if ENABLE_AVX512_SIMD16 + + INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + { + __m128i src0 = _mm_load_si128(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + __m128i src1 = _mm_load_si128(reinterpret_cast(pSrc) + 1); // gggggggggggggggg + + simdscalari cvt0 = _simd_cvtepu8_epi16(src0); + simdscalari cvt1 = _simd_cvtepu8_epi16(src1); + + simdscalari shl1 = _simd_slli_epi32(cvt1, 8); + + simdscalari dst = _simd_or_si(cvt0, shl1); + + _simd_store_si(reinterpret_cast(pDst), dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg + } +#endif }; ////////////////////////////////////////////////////////////////////////// @@ -389,6 +428,25 @@ struct Transpose32_32_32_32 #error Unsupported vector width #endif } +#if ENABLE_AVX512_SIMD16 + + INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + { + simd16scalar src0 = _simd16_load_ps(reinterpret_cast(pSrc)); + simd16scalar src1 = _simd16_load_ps(reinterpret_cast(pSrc) + 16); + simd16scalar src2 = _simd16_load_ps(reinterpret_cast(pSrc) + 32); + simd16scalar src3 = _simd16_load_ps(reinterpret_cast(pSrc) + 48); + + simd16scalar dst[4]; + + vTranspose4x16(dst, src0, src1, src2, src3); + + _simd16_store_ps(reinterpret_cast(pDst) + 0, dst[0]); + _simd16_store_ps(reinterpret_cast(pDst) + 16, dst[1]); + _simd16_store_ps(reinterpret_cast(pDst) + 32, dst[2]); + _simd16_store_ps(reinterpret_cast(pDst) + 48, dst[3]); + } +#endif }; ////////////////////////////////////////////////////////////////////////// @@ -421,6 +479,25 @@ struct Transpose32_32_32 #error Unsupported vector width #endif } +#if ENABLE_AVX512_SIMD16 + + INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + { + simd16scalar src0 = _simd16_load_ps(reinterpret_cast(pSrc)); + simd16scalar src1 = _simd16_load_ps(reinterpret_cast(pSrc) + 16); + simd16scalar src2 = _simd16_load_ps(reinterpret_cast(pSrc) + 32); + simd16scalar src3 = _simd16_setzero_ps(); + + simd16scalar dst[4]; + + vTranspose4x16(dst, src0, src1, src2, src3); + + _simd16_store_ps(reinterpret_cast(pDst) + 0, dst[0]); + _simd16_store_ps(reinterpret_cast(pDst) + 16, dst[1]); + _simd16_store_ps(reinterpret_cast(pDst) + 32, dst[2]); + _simd16_store_ps(reinterpret_cast(pDst) + 48, dst[3]); + } +#endif }; ////////////////////////////////////////////////////////////////////////// @@ -434,6 +511,7 @@ struct Transpose32_32 /// @param pDst - output data in AOS form INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { +#if KNOB_SIMD_WIDTH == 8 const float* pfSrc = (const float*)pSrc; __m128 src_r0 = _mm_load_ps(pfSrc + 0); __m128 src_r1 = _mm_load_ps(pfSrc + 4); @@ -450,7 +528,30 @@ struct Transpose32_32 _mm_store_ps(pfDst + 4, dst1); _mm_store_ps(pfDst + 8, dst2); _mm_store_ps(pfDst + 12, dst3); +#else +#error Unsupported vector width +#endif } +#if ENABLE_AVX512_SIMD16 + + INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + { + simd16scalar src0 = _simd16_load_ps(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + simd16scalar src1 = _simd16_load_ps(reinterpret_cast(pSrc) + 16); // gggggggggggggggg + + simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD + simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF + + simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7 + simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF + + simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7 + simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF + + _simd16_store_ps(reinterpret_cast(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg + _simd16_store_ps(reinterpret_cast(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg + } +#endif }; ////////////////////////////////////////////////////////////////////////// @@ -491,6 +592,36 @@ struct Transpose16_16_16_16 #error Unsupported vector width #endif } +#if ENABLE_AVX512_SIMD16 + + INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + { + simdscalari src0 = _simd_load_si(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + simdscalari src1 = _simd_load_si(reinterpret_cast(pSrc) + 1); // gggggggggggggggg + simdscalari src2 = _simd_load_si(reinterpret_cast(pSrc) + 2); // bbbbbbbbbbbbbbbb + simdscalari src3 = _simd_load_si(reinterpret_cast(pSrc) + 3); // aaaaaaaaaaaaaaaa + + simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB + simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF + simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB + simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF + + simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 + simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB + simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD + simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF + + simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 + simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 + simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB + simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF + + _simd_store_si(reinterpret_cast(pDst) + 0, dst0); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 1, dst1); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 2, dst2); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 3, dst3); // rgbargbargbargba + } +#endif }; ////////////////////////////////////////////////////////////////////////// @@ -530,6 +661,36 @@ struct Transpose16_16_16 #error Unsupported vector width #endif } +#if ENABLE_AVX512_SIMD16 + + INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + { + simdscalari src0 = _simd_load_si(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + simdscalari src1 = _simd_load_si(reinterpret_cast(pSrc) + 1); // gggggggggggggggg + simdscalari src2 = _simd_load_si(reinterpret_cast(pSrc) + 2); // bbbbbbbbbbbbbbbb + simdscalari src3 = _simd_setzero_si(); // aaaaaaaaaaaaaaaa + + simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB + simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF + simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB + simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF + + simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 + simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB + simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD + simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF + + simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 + simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 + simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB + simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF + + _simd_store_si(reinterpret_cast(pDst) + 0, dst0); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 1, dst1); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 2, dst2); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 3, dst3); // rgbargbargbargba + } +#endif }; ////////////////////////////////////////////////////////////////////////// @@ -543,9 +704,9 @@ struct Transpose16_16 /// @param pDst - output data in AOS form INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { +#if KNOB_SIMD_WIDTH == 8 simdscalar src = _simd_load_ps((const float*)pSrc); -#if KNOB_SIMD_WIDTH == 8 __m128 comp0 = _mm256_castps256_ps128(src); __m128 comp1 = _mm256_extractf128_ps(src, 1); @@ -561,6 +722,23 @@ struct Transpose16_16 #error Unsupported vector width #endif } +#if ENABLE_AVX512_SIMD16 + + INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + { + simdscalari src0 = _simd_load_si(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + simdscalari src1 = _simd_load_si(reinterpret_cast(pSrc) + 1); // gggggggggggggggg + + simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB + simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF + + simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7 + simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF + + _simd_store_si(reinterpret_cast(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg + _simd_store_si(reinterpret_cast(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg + } +#endif }; ////////////////////////////////////////////////////////////////////////// @@ -573,6 +751,10 @@ struct Transpose24_8 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif }; ////////////////////////////////////////////////////////////////////////// @@ -585,9 +767,11 @@ struct Transpose32_8_24 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -}; - +#if ENABLE_AVX512_SIMD16 + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif +}; ////////////////////////////////////////////////////////////////////////// /// Transpose4_4_4_4 @@ -599,6 +783,10 @@ struct Transpose4_4_4_4 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif }; ////////////////////////////////////////////////////////////////////////// @@ -611,6 +799,10 @@ struct Transpose5_6_5 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif }; ////////////////////////////////////////////////////////////////////////// @@ -623,12 +815,32 @@ struct Transpose9_9_9_5 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif }; ////////////////////////////////////////////////////////////////////////// /// Transpose5_5_5_1 ////////////////////////////////////////////////////////////////////////// struct Transpose5_5_5_1 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose1_5_5_5 +////////////////////////////////////////////////////////////////////////// +struct Transpose1_5_5_5 { ////////////////////////////////////////////////////////////////////////// /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. @@ -647,6 +859,10 @@ struct Transpose10_10_10_2 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif }; ////////////////////////////////////////////////////////////////////////// @@ -659,6 +875,74 @@ struct Transpose11_11_10 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose64 +////////////////////////////////////////////////////////////////////////// +struct Transpose64 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose64_64 +////////////////////////////////////////////////////////////////////////// +struct Transpose64_64 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose64_64_64 +////////////////////////////////////////////////////////////////////////// +struct Transpose64_64_64 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose64_64_64_64 +////////////////////////////////////////////////////////////////////////// +struct Transpose64_64_64_64 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif }; // helper function to unroll loops @@ -678,6 +962,26 @@ struct UnrollerL { } }; +// helper function to unroll loops, with mask to skip specific iterations +template +struct UnrollerLMask { + template + INLINE static void step(Lambda& func) { + if(Mask & (1 << Begin)) + { + func(Begin); + } + UnrollerL::step(func); + } +}; + +template +struct UnrollerLMask { + template + static void step(Lambda& func) { + } +}; + // general CRC compute INLINE uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size) @@ -836,11 +1140,22 @@ public: } }; +// Ranged integer argument for TemplateArgUnroller +template +struct IntArg +{ + uint32_t val; +}; + // Recursive template used to auto-nest conditionals. Converts dynamic boolean function // arguments to static template arguments. template struct TemplateArgUnroller { + //----------------------------------------- + // Boolean value + //----------------------------------------- + // Last Arg Terminator static typename TermT::FuncType GetFunc(bool bArg) { @@ -863,5 +1178,52 @@ struct TemplateArgUnroller return TemplateArgUnroller::GetFunc(remainingArgs...); } + + //----------------------------------------- + // Integer value (within specified range) + //----------------------------------------- + + // Last Arg Terminator + template + static typename TermT::FuncType GetFunc(IntArg iArg) + { + if (iArg.val == TMax) + { + return TermT::template GetFunc>(); + } + if (TMax > TMin) + { + return TemplateArgUnroller::GetFunc(IntArg{iArg.val}); + } + SWR_ASSUME(false); return nullptr; + } + template + static typename TermT::FuncType GetFunc(IntArg iArg) + { + SWR_ASSERT(iArg.val == TVal); + return TermT::template GetFunc>(); + } + + // Recursively parse args + template + static typename TermT::FuncType GetFunc(IntArg iArg, TArgsT... remainingArgs) + { + if (iArg.val == TMax) + { + return TemplateArgUnroller>::GetFunc(remainingArgs...); + } + if (TMax > TMin) + { + return TemplateArgUnroller::GetFunc(IntArg{iArg.val}, remainingArgs...); + } + SWR_ASSUME(false); return nullptr; + } + template + static typename TermT::FuncType GetFunc(IntArg iArg, TArgsT... remainingArgs) + { + SWR_ASSERT(iArg.val == TVal); + return TemplateArgUnroller>::GetFunc(remainingArgs...); + } }; +