From 9b448da60ffb5aa807d9145bbac0fdbd580acea9 Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Fri, 9 Jun 2017 16:58:59 -0500 Subject: [PATCH] swr/rast: Refactor includes to limit simdintrin.h usage Reduces the files rebuilt after modifying simdintrin.h from 84 to 64. Reviewed-by: Bruce Cherniak --- src/gallium/drivers/swr/Makefile.sources | 3 + .../drivers/swr/rasterizer/common/intrin.h | 169 ++++ .../swr/rasterizer/common/simd16intrin.h | 52 - .../swr/rasterizer/common/simdintrin.h | 87 +- src/gallium/drivers/swr/rasterizer/core/api.h | 2 +- .../drivers/swr/rasterizer/core/context.h | 2 +- .../drivers/swr/rasterizer/core/fifo.hpp | 4 +- .../swr/rasterizer/core/format_traits.h | 2 +- .../swr/rasterizer/core/format_types.h | 1 + .../swr/rasterizer/core/format_utils.h | 882 +++++++++++++++++ .../drivers/swr/rasterizer/core/frontend.h | 1 + .../drivers/swr/rasterizer/core/multisample.h | 10 +- .../drivers/swr/rasterizer/core/state.h | 48 +- .../drivers/swr/rasterizer/core/state_funcs.h | 68 ++ .../drivers/swr/rasterizer/core/utils.h | 893 +----------------- .../swr/rasterizer/memory/tilingtraits.h | 2 +- 16 files changed, 1147 insertions(+), 1079 deletions(-) create mode 100644 src/gallium/drivers/swr/rasterizer/common/intrin.h create mode 100644 src/gallium/drivers/swr/rasterizer/core/format_utils.h create mode 100644 src/gallium/drivers/swr/rasterizer/core/state_funcs.h diff --git a/src/gallium/drivers/swr/Makefile.sources b/src/gallium/drivers/swr/Makefile.sources index 6b76bd11979..a1172b72cad 100644 --- a/src/gallium/drivers/swr/Makefile.sources +++ b/src/gallium/drivers/swr/Makefile.sources @@ -55,6 +55,7 @@ ARCHRAST_CXX_SOURCES := \ COMMON_CXX_SOURCES := \ rasterizer/common/formats.cpp \ rasterizer/common/formats.h \ + rasterizer/common/intrin.h \ rasterizer/common/isa.hpp \ rasterizer/common/os.cpp \ rasterizer/common/os.h \ @@ -85,6 +86,7 @@ CORE_CXX_SOURCES := \ rasterizer/core/format_conversion.h \ rasterizer/core/format_traits.h \ rasterizer/core/format_types.h \ + rasterizer/core/format_utils.h \ rasterizer/core/frontend.cpp \ rasterizer/core/frontend.h \ rasterizer/core/knobs.h \ @@ -99,6 +101,7 @@ CORE_CXX_SOURCES := \ rasterizer/core/rdtsc_core.h \ rasterizer/core/ringbuffer.h \ rasterizer/core/state.h \ + rasterizer/core/state_funcs.h \ rasterizer/core/tessellator.h \ rasterizer/core/threads.cpp \ rasterizer/core/threads.h \ diff --git a/src/gallium/drivers/swr/rasterizer/common/intrin.h b/src/gallium/drivers/swr/rasterizer/common/intrin.h new file mode 100644 index 00000000000..f45b2e55880 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/common/intrin.h @@ -0,0 +1,169 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +****************************************************************************/ + +#ifndef __SWR_INTRIN_H__ +#define __SWR_INTRIN_H__ + +#include "os.h" + +#include + +#include +#include +#include + +#if KNOB_SIMD_WIDTH == 8 +typedef __m256 simdscalar; +typedef __m256i simdscalari; +typedef uint8_t simdmask; +#else +#error Unsupported vector width +#endif + +// simd vector +OSALIGNSIMD(union) simdvector +{ + simdscalar v[4]; + struct + { + simdscalar x, y, z, w; + }; + + simdscalar& operator[] (const int i) { return v[i]; } + const simdscalar& operator[] (const int i) const { return v[i]; } +}; + +#if ENABLE_AVX512_SIMD16 + +#if KNOB_SIMD16_WIDTH == 16 + +#if ENABLE_AVX512_EMULATION +struct simd16scalar +{ + __m256 lo; + __m256 hi; +}; +struct simd16scalard +{ + __m256d lo; + __m256d hi; +}; +struct simd16scalari +{ + __m256i lo; + __m256i hi; +}; +typedef uint16_t simd16mask; + +#else +typedef __m512 simd16scalar; +typedef __m512d simd16scalard; +typedef __m512i simd16scalari; +typedef __mmask16 simd16mask; +#endif//ENABLE_AVX512_EMULATION +#else +#error Unsupported vector width +#endif//KNOB_SIMD16_WIDTH == 16 + +#define _simd16_masklo(mask) ((mask) & 0xFF) +#define _simd16_maskhi(mask) (((mask) >> 8) & 0xFF) +#define _simd16_setmask(hi, lo) (((hi) << 8) | (lo)) + +#if defined(_WIN32) +#define SIMDAPI __vectorcall +#else +#define SIMDAPI +#endif + +OSALIGN(union, KNOB_SIMD16_BYTES) simd16vector +{ + simd16scalar v[4]; + struct + { + simd16scalar x, y, z, w; + }; + + simd16scalar& operator[] (const int i) { return v[i]; } + const simd16scalar& operator[] (const int i) const { return v[i]; } +}; + +#endif // ENABLE_AVX512_SIMD16 + +INLINE +UINT pdep_u32(UINT a, UINT mask) +{ +#if KNOB_ARCH >= KNOB_ARCH_AVX2 + return _pdep_u32(a, mask); +#else + UINT result = 0; + + // copied from http://wm.ite.pl/articles/pdep-soft-emu.html + // using bsf instead of funky loop + DWORD maskIndex; + while (_BitScanForward(&maskIndex, mask)) + { + // 1. isolate lowest set bit of mask + const UINT lowest = 1 << maskIndex; + + // 2. populate LSB from src + const UINT LSB = (UINT)((int)(a << 31) >> 31); + + // 3. copy bit from mask + result |= LSB & lowest; + + // 4. clear lowest bit + mask &= ~lowest; + + // 5. prepare for next iteration + a >>= 1; + } + + return result; +#endif +} + +INLINE +UINT pext_u32(UINT a, UINT mask) +{ +#if KNOB_ARCH >= KNOB_ARCH_AVX2 + return _pext_u32(a, mask); +#else + UINT result = 0; + DWORD maskIndex; + uint32_t currentBit = 0; + while (_BitScanForward(&maskIndex, mask)) + { + // 1. isolate lowest set bit of mask + const UINT lowest = 1 << maskIndex; + + // 2. copy bit from mask + result |= ((a & lowest) > 0) << currentBit++; + + // 3. clear lowest bit + mask &= ~lowest; + } + return result; +#endif +} + +#endif//__SWR_INTRIN_H__ diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h index e303ce59713..a822420ae37 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h @@ -26,58 +26,6 @@ #if ENABLE_AVX512_SIMD16 -#if KNOB_SIMD16_WIDTH == 16 - -#if ENABLE_AVX512_EMULATION -struct simd16scalar -{ - __m256 lo; - __m256 hi; -}; -struct simd16scalard -{ - __m256d lo; - __m256d hi; -}; -struct simd16scalari -{ - __m256i lo; - __m256i hi; -}; -typedef uint16_t simd16mask; - -#else -typedef __m512 simd16scalar; -typedef __m512d simd16scalard; -typedef __m512i simd16scalari; -typedef __mmask16 simd16mask; -#endif//ENABLE_AVX512_EMULATION -#else -#error Unsupported vector width -#endif//KNOB_SIMD16_WIDTH == 16 - -#define _simd16_masklo(mask) ((mask) & 0xFF) -#define _simd16_maskhi(mask) (((mask) >> 8) & 0xFF) -#define _simd16_setmask(hi, lo) (((hi) << 8) | (lo)) - -#if defined(_WIN32) -#define SIMDAPI __vectorcall -#else -#define SIMDAPI -#endif - -OSALIGN(union, KNOB_SIMD16_BYTES) simd16vector -{ - simd16scalar v[4]; - struct - { - simd16scalar x, y, z, w; - }; - - simd16scalar& operator[] (const int i) { return v[i]; } - const simd16scalar& operator[] (const int i) const { return v[i]; } -}; - #if ENABLE_AVX512_EMULATION #define SIMD16_EMU_AVX512_0(type, func, intrin) \ diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h index ed6e56b5e26..5ccb6c3ea95 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h @@ -24,34 +24,8 @@ #ifndef __SWR_SIMDINTRIN_H__ #define __SWR_SIMDINTRIN_H__ -#include "os.h" - -#include - -#include -#include -#include - -#if KNOB_SIMD_WIDTH == 8 -typedef __m256 simdscalar; -typedef __m256i simdscalari; -typedef uint8_t simdmask; -#else -#error Unsupported vector width -#endif - -// simd vector -OSALIGNSIMD(union) simdvector -{ - simdscalar v[4]; - struct - { - simdscalar x, y, z, w; - }; - - simdscalar& operator[] (const int i) { return v[i]; } - const simdscalar& operator[] (const int i) const { return v[i]; } -}; +#include "common/os.h" +#include "common/intrin.h" #if KNOB_SIMD_WIDTH == 8 #define _simd128_maskstore_ps _mm_maskstore_ps @@ -1210,63 +1184,6 @@ static INLINE simdscalar _simd_abs_ps(simdscalar a) return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff))); } -INLINE -UINT pdep_u32(UINT a, UINT mask) -{ -#if KNOB_ARCH >= KNOB_ARCH_AVX2 - return _pdep_u32(a, mask); -#else - UINT result = 0; - - // copied from http://wm.ite.pl/articles/pdep-soft-emu.html - // using bsf instead of funky loop - DWORD maskIndex; - while (_BitScanForward(&maskIndex, mask)) - { - // 1. isolate lowest set bit of mask - const UINT lowest = 1 << maskIndex; - - // 2. populate LSB from src - const UINT LSB = (UINT)((int)(a << 31) >> 31); - - // 3. copy bit from mask - result |= LSB & lowest; - - // 4. clear lowest bit - mask &= ~lowest; - - // 5. prepare for next iteration - a >>= 1; - } - - return result; -#endif -} - -INLINE -UINT pext_u32(UINT a, UINT mask) -{ -#if KNOB_ARCH >= KNOB_ARCH_AVX2 - return _pext_u32(a, mask); -#else - UINT result = 0; - DWORD maskIndex; - uint32_t currentBit = 0; - while (_BitScanForward(&maskIndex, mask)) - { - // 1. isolate lowest set bit of mask - const UINT lowest = 1 << maskIndex; - - // 2. copy bit from mask - result |= ((a & lowest) > 0) << currentBit++; - - // 3. clear lowest bit - mask &= ~lowest; - } - return result; -#endif -} - #if ENABLE_AVX512_SIMD16 #include "simd16intrin.h" #endif//ENABLE_AVX512_SIMD16 diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h index f9f3573561b..236e0fcd666 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.h +++ b/src/gallium/drivers/swr/rasterizer/core/api.h @@ -34,7 +34,7 @@ #include #include -#include "common/simdintrin.h" +#include "common/intrin.h" #include "common/formats.h" #include "core/state.h" diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index fa308feade2..f60ddfd77ef 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -39,7 +39,7 @@ #include "core/arena.h" #include "core/fifo.hpp" #include "core/knobs.h" -#include "common/simdintrin.h" +#include "common/intrin.h" #include "core/threads.h" #include "ringbuffer.h" #include "archrast/archrast.h" diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp index 7e07e6aeb2c..49ba71f6435 100644 --- a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp +++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp @@ -109,8 +109,8 @@ struct QUEUE auto lambda = [&](int32_t i) { - __m256 vSrc = _simd_load_ps(pSrc + i*KNOB_SIMD_WIDTH); - _simd_stream_ps(pDst + i*KNOB_SIMD_WIDTH, vSrc); + __m256 vSrc = _mm256_load_ps(pSrc + i*KNOB_SIMD_WIDTH); + _mm256_stream_ps(pDst + i*KNOB_SIMD_WIDTH, vSrc); }; const uint32_t numSimdLines = sizeof(T) / (KNOB_SIMD_WIDTH*4); diff --git a/src/gallium/drivers/swr/rasterizer/core/format_traits.h b/src/gallium/drivers/swr/rasterizer/core/format_traits.h index 478fe3e4f2a..1721aa46e75 100644 --- a/src/gallium/drivers/swr/rasterizer/core/format_traits.h +++ b/src/gallium/drivers/swr/rasterizer/core/format_traits.h @@ -30,7 +30,7 @@ #pragma once #include "format_types.h" -#include "utils.h" +#include "format_utils.h" ////////////////////////////////////////////////////////////////////////// /// FormatSwizzle - Component swizzle selects diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h index e7e17f696e8..1ad3d61927a 100644 --- a/src/gallium/drivers/swr/rasterizer/core/format_types.h +++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h @@ -28,6 +28,7 @@ #pragma once #include "utils.h" +#include "common/simdintrin.h" ////////////////////////////////////////////////////////////////////////// /// PackTraits - Helpers for packing / unpacking same pixel sizes diff --git a/src/gallium/drivers/swr/rasterizer/core/format_utils.h b/src/gallium/drivers/swr/rasterizer/core/format_utils.h new file mode 100644 index 00000000000..94b6c1b09e3 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/format_utils.h @@ -0,0 +1,882 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file utils.h +* +* @brief Utilities used by SWR core related to pixel formats. +* +******************************************************************************/ +#pragma once + +#include "core/utils.h" +#include "common/simdintrin.h" + +INLINE +void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3) +{ + __m128i row0i = _mm_castps_si128(row0); + __m128i row1i = _mm_castps_si128(row1); + __m128i row2i = _mm_castps_si128(row2); + __m128i row3i = _mm_castps_si128(row3); + + __m128i vTemp = row2i; + row2i = _mm_unpacklo_epi32(row2i, row3i); + vTemp = _mm_unpackhi_epi32(vTemp, row3i); + + row3i = row0i; + row0i = _mm_unpacklo_epi32(row0i, row1i); + row3i = _mm_unpackhi_epi32(row3i, row1i); + + row1i = row0i; + row0i = _mm_unpacklo_epi64(row0i, row2i); + row1i = _mm_unpackhi_epi64(row1i, row2i); + + row2i = row3i; + row2i = _mm_unpacklo_epi64(row2i, vTemp); + row3i = _mm_unpackhi_epi64(row3i, vTemp); + + row0 = _mm_castsi128_ps(row0i); + row1 = _mm_castsi128_ps(row1i); + row2 = _mm_castsi128_ps(row2i); + row3 = _mm_castsi128_ps(row3i); +} + +INLINE +void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3) +{ + __m128i vTemp = row2; + row2 = _mm_unpacklo_epi32(row2, row3); + vTemp = _mm_unpackhi_epi32(vTemp, row3); + + row3 = row0; + row0 = _mm_unpacklo_epi32(row0, row1); + row3 = _mm_unpackhi_epi32(row3, row1); + + row1 = row0; + row0 = _mm_unpacklo_epi64(row0, row2); + row1 = _mm_unpackhi_epi64(row1, row2); + + row2 = row3; + row2 = _mm_unpacklo_epi64(row2, vTemp); + row3 = _mm_unpackhi_epi64(row3, vTemp); +} + +#if KNOB_SIMD_WIDTH == 8 +INLINE +void vTranspose3x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2) +{ + simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 + simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps()); //y0w0y1w1 y4w4y5w5 + simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4 + simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5 + + r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7 + r1rx = _simd_unpackhi_ps(vSrc1, _simd_setzero_ps()); //y2w2y3w3 y6w6yw77 + simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6 + simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7 + + vDst[0] = _mm256_castps256_ps128(r02r1xlolo); + vDst[1] = _mm256_castps256_ps128(r02r1xlohi); + vDst[2] = _mm256_castps256_ps128(r02r1xhilo); + vDst[3] = _mm256_castps256_ps128(r02r1xhihi); + + vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1); + vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1); + vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1); + vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1); +} + +INLINE +void vTranspose4x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3) +{ + simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 + simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5 + simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4 + simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5 + + r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7 + r1rx = _simd_unpackhi_ps(vSrc1, vSrc3); //y2w2y3w3 y6w6yw77 + simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6 + simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7 + + vDst[0] = _mm256_castps256_ps128(r02r1xlolo); + vDst[1] = _mm256_castps256_ps128(r02r1xlohi); + vDst[2] = _mm256_castps256_ps128(r02r1xhilo); + vDst[3] = _mm256_castps256_ps128(r02r1xhihi); + + vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1); + vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1); + vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1); + vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1); +} + +#if ENABLE_AVX512_SIMD16 +INLINE +void vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3) +{ + const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); // pre-permute input to setup the right order after all the unpacking + + simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r + simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g + simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b + simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a + + simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2); + simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3); + simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2); + simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3); + + dst[0] = _simd16_unpacklo_ps(rblo, galo); + dst[1] = _simd16_unpackhi_ps(rblo, galo); + dst[2] = _simd16_unpacklo_ps(rbhi, gahi); + dst[3] = _simd16_unpackhi_ps(rbhi, gahi); +} + +#endif +INLINE +void vTranspose8x8(simdscalar (&vDst)[8], const simdscalar &vMask0, const simdscalar &vMask1, const simdscalar &vMask2, const simdscalar &vMask3, const simdscalar &vMask4, const simdscalar &vMask5, const simdscalar &vMask6, const simdscalar &vMask7) +{ + simdscalar __t0 = _simd_unpacklo_ps(vMask0, vMask1); + simdscalar __t1 = _simd_unpackhi_ps(vMask0, vMask1); + simdscalar __t2 = _simd_unpacklo_ps(vMask2, vMask3); + simdscalar __t3 = _simd_unpackhi_ps(vMask2, vMask3); + simdscalar __t4 = _simd_unpacklo_ps(vMask4, vMask5); + simdscalar __t5 = _simd_unpackhi_ps(vMask4, vMask5); + simdscalar __t6 = _simd_unpacklo_ps(vMask6, vMask7); + simdscalar __t7 = _simd_unpackhi_ps(vMask6, vMask7); + simdscalar __tt0 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0)); + simdscalar __tt1 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2)); + simdscalar __tt2 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0)); + simdscalar __tt3 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2)); + simdscalar __tt4 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0)); + simdscalar __tt5 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2)); + simdscalar __tt6 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0)); + simdscalar __tt7 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2)); + vDst[0] = _simd_permute2f128_ps(__tt0, __tt4, 0x20); + vDst[1] = _simd_permute2f128_ps(__tt1, __tt5, 0x20); + vDst[2] = _simd_permute2f128_ps(__tt2, __tt6, 0x20); + vDst[3] = _simd_permute2f128_ps(__tt3, __tt7, 0x20); + vDst[4] = _simd_permute2f128_ps(__tt0, __tt4, 0x31); + vDst[5] = _simd_permute2f128_ps(__tt1, __tt5, 0x31); + vDst[6] = _simd_permute2f128_ps(__tt2, __tt6, 0x31); + vDst[7] = _simd_permute2f128_ps(__tt3, __tt7, 0x31); +} + +INLINE +void vTranspose8x8(simdscalar (&vDst)[8], const simdscalari &vMask0, const simdscalari &vMask1, const simdscalari &vMask2, const simdscalari &vMask3, const simdscalari &vMask4, const simdscalari &vMask5, const simdscalari &vMask6, const simdscalari &vMask7) +{ + vTranspose8x8(vDst, _simd_castsi_ps(vMask0), _simd_castsi_ps(vMask1), _simd_castsi_ps(vMask2), _simd_castsi_ps(vMask3), + _simd_castsi_ps(vMask4), _simd_castsi_ps(vMask5), _simd_castsi_ps(vMask6), _simd_castsi_ps(vMask7)); +} +#endif + +////////////////////////////////////////////////////////////////////////// +/// TranposeSingleComponent +////////////////////////////////////////////////////////////////////////// +template +struct TransposeSingleComponent +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Pass-thru for single component. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) + { + memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8); + } +#if ENABLE_AVX512_SIMD16 + + INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + { + memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8); + } +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose8_8_8_8 +////////////////////////////////////////////////////////////////////////// +struct Transpose8_8_8_8 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) + { + simdscalari src = _simd_load_si((const simdscalari*)pSrc); + +#if KNOB_SIMD_WIDTH == 8 +#if KNOB_ARCH <= KNOB_ARCH_AVX + __m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg + __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa + __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb + __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa + __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg + __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3); // babababababababa + __m128i c0123lo = _mm_unpacklo_epi16(c01, c23); // rgbargbargbargba + __m128i c0123hi = _mm_unpackhi_epi16(c01, c23); // rgbargbargbargba + _mm_store_si128((__m128i*)pDst, c0123lo); + _mm_store_si128((__m128i*)(pDst + 16), c0123hi); +#else + simdscalari dst01 = _simd_shuffle_epi8(src, + _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800)); + simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01); + dst23 = _simd_shuffle_epi8(dst23, + _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080)); + simdscalari dst = _simd_or_si(dst01, dst23); + _simd_store_si((simdscalari*)pDst, dst); +#endif +#else +#error Unsupported vector width +#endif + } +#if ENABLE_AVX512_SIMD16 + + INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + { + __m128i src0 = _mm_load_si128(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + __m128i src1 = _mm_load_si128(reinterpret_cast(pSrc) + 1); // gggggggggggggggg + __m128i src2 = _mm_load_si128(reinterpret_cast(pSrc) + 2); // bbbbbbbbbbbbbbbb + __m128i src3 = _mm_load_si128(reinterpret_cast(pSrc) + 3); // aaaaaaaaaaaaaaaa + + simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0); + simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1); + simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2); + simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3); + + simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8); + simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16); + simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24); + + simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3)); + + _simd16_store_si(reinterpret_cast(pDst), dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba + } +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose8_8_8 +////////////////////////////////////////////////////////////////////////// +struct Transpose8_8_8 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose8_8 +////////////////////////////////////////////////////////////////////////// +struct Transpose8_8 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 8_8 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) + { +#if KNOB_SIMD_WIDTH == 8 + simdscalari src = _simd_load_si((const simdscalari*)pSrc); + + __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg + __m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg + rg = _mm_unpacklo_epi8(rg, g); + _mm_store_si128((__m128i*)pDst, rg); +#else +#error Unsupported vector width +#endif + } +#if ENABLE_AVX512_SIMD16 + + INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + { + __m128i src0 = _mm_load_si128(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + __m128i src1 = _mm_load_si128(reinterpret_cast(pSrc) + 1); // gggggggggggggggg + + simdscalari cvt0 = _simd_cvtepu8_epi16(src0); + simdscalari cvt1 = _simd_cvtepu8_epi16(src1); + + simdscalari shl1 = _simd_slli_epi32(cvt1, 8); + + simdscalari dst = _simd_or_si(cvt0, shl1); + + _simd_store_si(reinterpret_cast(pDst), dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg + } +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose32_32_32_32 +////////////////////////////////////////////////////////////////////////// +struct Transpose32_32_32_32 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) + { +#if KNOB_SIMD_WIDTH == 8 + simdscalar src0 = _simd_load_ps((const float*)pSrc); + simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); + simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); + simdscalar src3 = _simd_load_ps((const float*)pSrc + 24); + + __m128 vDst[8]; + vTranspose4x8(vDst, src0, src1, src2, src3); + _mm_store_ps((float*)pDst, vDst[0]); + _mm_store_ps((float*)pDst+4, vDst[1]); + _mm_store_ps((float*)pDst+8, vDst[2]); + _mm_store_ps((float*)pDst+12, vDst[3]); + _mm_store_ps((float*)pDst+16, vDst[4]); + _mm_store_ps((float*)pDst+20, vDst[5]); + _mm_store_ps((float*)pDst+24, vDst[6]); + _mm_store_ps((float*)pDst+28, vDst[7]); +#else +#error Unsupported vector width +#endif + } +#if ENABLE_AVX512_SIMD16 + + INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + { + simd16scalar src0 = _simd16_load_ps(reinterpret_cast(pSrc)); + simd16scalar src1 = _simd16_load_ps(reinterpret_cast(pSrc) + 16); + simd16scalar src2 = _simd16_load_ps(reinterpret_cast(pSrc) + 32); + simd16scalar src3 = _simd16_load_ps(reinterpret_cast(pSrc) + 48); + + simd16scalar dst[4]; + + vTranspose4x16(dst, src0, src1, src2, src3); + + _simd16_store_ps(reinterpret_cast(pDst) + 0, dst[0]); + _simd16_store_ps(reinterpret_cast(pDst) + 16, dst[1]); + _simd16_store_ps(reinterpret_cast(pDst) + 32, dst[2]); + _simd16_store_ps(reinterpret_cast(pDst) + 48, dst[3]); + } +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose32_32_32 +////////////////////////////////////////////////////////////////////////// +struct Transpose32_32_32 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) + { +#if KNOB_SIMD_WIDTH == 8 + simdscalar src0 = _simd_load_ps((const float*)pSrc); + simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); + simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); + + __m128 vDst[8]; + vTranspose3x8(vDst, src0, src1, src2); + _mm_store_ps((float*)pDst, vDst[0]); + _mm_store_ps((float*)pDst + 4, vDst[1]); + _mm_store_ps((float*)pDst + 8, vDst[2]); + _mm_store_ps((float*)pDst + 12, vDst[3]); + _mm_store_ps((float*)pDst + 16, vDst[4]); + _mm_store_ps((float*)pDst + 20, vDst[5]); + _mm_store_ps((float*)pDst + 24, vDst[6]); + _mm_store_ps((float*)pDst + 28, vDst[7]); +#else +#error Unsupported vector width +#endif + } +#if ENABLE_AVX512_SIMD16 + + INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + { + simd16scalar src0 = _simd16_load_ps(reinterpret_cast(pSrc)); + simd16scalar src1 = _simd16_load_ps(reinterpret_cast(pSrc) + 16); + simd16scalar src2 = _simd16_load_ps(reinterpret_cast(pSrc) + 32); + simd16scalar src3 = _simd16_setzero_ps(); + + simd16scalar dst[4]; + + vTranspose4x16(dst, src0, src1, src2, src3); + + _simd16_store_ps(reinterpret_cast(pDst) + 0, dst[0]); + _simd16_store_ps(reinterpret_cast(pDst) + 16, dst[1]); + _simd16_store_ps(reinterpret_cast(pDst) + 32, dst[2]); + _simd16_store_ps(reinterpret_cast(pDst) + 48, dst[3]); + } +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose32_32 +////////////////////////////////////////////////////////////////////////// +struct Transpose32_32 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 32_32 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) + { +#if KNOB_SIMD_WIDTH == 8 + const float* pfSrc = (const float*)pSrc; + __m128 src_r0 = _mm_load_ps(pfSrc + 0); + __m128 src_r1 = _mm_load_ps(pfSrc + 4); + __m128 src_g0 = _mm_load_ps(pfSrc + 8); + __m128 src_g1 = _mm_load_ps(pfSrc + 12); + + __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0); + __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0); + __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1); + __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1); + + float* pfDst = (float*)pDst; + _mm_store_ps(pfDst + 0, dst0); + _mm_store_ps(pfDst + 4, dst1); + _mm_store_ps(pfDst + 8, dst2); + _mm_store_ps(pfDst + 12, dst3); +#else +#error Unsupported vector width +#endif + } +#if ENABLE_AVX512_SIMD16 + + INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + { + simd16scalar src0 = _simd16_load_ps(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + simd16scalar src1 = _simd16_load_ps(reinterpret_cast(pSrc) + 16); // gggggggggggggggg + + simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD + simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF + + simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7 + simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF + + simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7 + simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF + + _simd16_store_ps(reinterpret_cast(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg + _simd16_store_ps(reinterpret_cast(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg + } +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose16_16_16_16 +////////////////////////////////////////////////////////////////////////// +struct Transpose16_16_16_16 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) + { +#if KNOB_SIMD_WIDTH == 8 + simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); + simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari))); + + __m128i src_r = _mm256_extractf128_si256(src_rg, 0); + __m128i src_g = _mm256_extractf128_si256(src_rg, 1); + __m128i src_b = _mm256_extractf128_si256(src_ba, 0); + __m128i src_a = _mm256_extractf128_si256(src_ba, 1); + + __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); + __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); + __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); + __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); + + __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); + __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); + __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); + __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); + + _mm_store_si128(((__m128i*)pDst) + 0, dst0); + _mm_store_si128(((__m128i*)pDst) + 1, dst1); + _mm_store_si128(((__m128i*)pDst) + 2, dst2); + _mm_store_si128(((__m128i*)pDst) + 3, dst3); +#else +#error Unsupported vector width +#endif + } +#if ENABLE_AVX512_SIMD16 + + INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + { + simdscalari src0 = _simd_load_si(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + simdscalari src1 = _simd_load_si(reinterpret_cast(pSrc) + 1); // gggggggggggggggg + simdscalari src2 = _simd_load_si(reinterpret_cast(pSrc) + 2); // bbbbbbbbbbbbbbbb + simdscalari src3 = _simd_load_si(reinterpret_cast(pSrc) + 3); // aaaaaaaaaaaaaaaa + + simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB + simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF + simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB + simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF + + simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 + simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB + simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD + simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF + + simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 + simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 + simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB + simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF + + _simd_store_si(reinterpret_cast(pDst) + 0, dst0); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 1, dst1); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 2, dst2); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 3, dst3); // rgbargbargbargba + } +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose16_16_16 +////////////////////////////////////////////////////////////////////////// +struct Transpose16_16_16 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) + { +#if KNOB_SIMD_WIDTH == 8 + simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); + + __m128i src_r = _mm256_extractf128_si256(src_rg, 0); + __m128i src_g = _mm256_extractf128_si256(src_rg, 1); + __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari))); + __m128i src_a = _mm_undefined_si128(); + + __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); + __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); + __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); + __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); + + __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); + __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); + __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); + __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); + + _mm_store_si128(((__m128i*)pDst) + 0, dst0); + _mm_store_si128(((__m128i*)pDst) + 1, dst1); + _mm_store_si128(((__m128i*)pDst) + 2, dst2); + _mm_store_si128(((__m128i*)pDst) + 3, dst3); +#else +#error Unsupported vector width +#endif + } +#if ENABLE_AVX512_SIMD16 + + INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + { + simdscalari src0 = _simd_load_si(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + simdscalari src1 = _simd_load_si(reinterpret_cast(pSrc) + 1); // gggggggggggggggg + simdscalari src2 = _simd_load_si(reinterpret_cast(pSrc) + 2); // bbbbbbbbbbbbbbbb + simdscalari src3 = _simd_setzero_si(); // aaaaaaaaaaaaaaaa + + simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB + simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF + simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB + simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF + + simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 + simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB + simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD + simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF + + simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 + simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 + simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB + simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF + + _simd_store_si(reinterpret_cast(pDst) + 0, dst0); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 1, dst1); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 2, dst2); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 3, dst3); // rgbargbargbargba + } +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose16_16 +////////////////////////////////////////////////////////////////////////// +struct Transpose16_16 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 16_16 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) + { +#if KNOB_SIMD_WIDTH == 8 + simdscalar src = _simd_load_ps((const float*)pSrc); + + __m128 comp0 = _mm256_castps256_ps128(src); + __m128 comp1 = _mm256_extractf128_ps(src, 1); + + __m128i comp0i = _mm_castps_si128(comp0); + __m128i comp1i = _mm_castps_si128(comp1); + + __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i); + __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i); + + _mm_store_si128((__m128i*)pDst, resLo); + _mm_store_si128((__m128i*)pDst + 1, resHi); +#else +#error Unsupported vector width +#endif + } +#if ENABLE_AVX512_SIMD16 + + INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + { + simdscalari src0 = _simd_load_si(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + simdscalari src1 = _simd_load_si(reinterpret_cast(pSrc) + 1); // gggggggggggggggg + + simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB + simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF + + simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7 + simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF + + _simd_store_si(reinterpret_cast(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg + _simd_store_si(reinterpret_cast(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg + } +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose24_8 +////////////////////////////////////////////////////////////////////////// +struct Transpose24_8 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 24_8 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose32_8_24 +////////////////////////////////////////////////////////////////////////// +struct Transpose32_8_24 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose4_4_4_4 +////////////////////////////////////////////////////////////////////////// +struct Transpose4_4_4_4 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose5_6_5 +////////////////////////////////////////////////////////////////////////// +struct Transpose5_6_5 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose9_9_9_5 +////////////////////////////////////////////////////////////////////////// +struct Transpose9_9_9_5 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose5_5_5_1 +////////////////////////////////////////////////////////////////////////// +struct Transpose5_5_5_1 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose1_5_5_5 +////////////////////////////////////////////////////////////////////////// +struct Transpose1_5_5_5 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose10_10_10_2 +////////////////////////////////////////////////////////////////////////// +struct Transpose10_10_10_2 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose11_11_10 +////////////////////////////////////////////////////////////////////////// +struct Transpose11_11_10 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose64 +////////////////////////////////////////////////////////////////////////// +struct Transpose64 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose64_64 +////////////////////////////////////////////////////////////////////////// +struct Transpose64_64 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose64_64_64 +////////////////////////////////////////////////////////////////////////// +struct Transpose64_64_64 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose64_64_64_64 +////////////////////////////////////////////////////////////////////////// +struct Transpose64_64_64_64 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; +#if ENABLE_AVX512_SIMD16 + + static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; +#endif +}; + diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h index 0a2a23d9e0f..65b7f02813f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.h +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h @@ -28,6 +28,7 @@ ******************************************************************************/ #pragma once #include "context.h" +#include "common/simdintrin.h" #include // Calculates the A and B coefficients for the 3 edges of the triangle diff --git a/src/gallium/drivers/swr/rasterizer/core/multisample.h b/src/gallium/drivers/swr/rasterizer/core/multisample.h index 19a5a80715e..2ca8c1b3e8d 100644 --- a/src/gallium/drivers/swr/rasterizer/core/multisample.h +++ b/src/gallium/drivers/swr/rasterizer/core/multisample.h @@ -33,14 +33,6 @@ /// @brief convenience typedef for testing for single sample case typedef std::integral_constant SingleSampleT; -INLINE -uint32_t GetNumSamples(SWR_MULTISAMPLE_COUNT sampleCount) -{ - static const uint32_t sampleCountLUT[SWR_MULTISAMPLE_TYPE_COUNT] {1, 2, 4, 8, 16}; - assert(sampleCount < SWR_MULTISAMPLE_TYPE_COUNT); - return sampleCountLUT[sampleCount]; -} - INLINE SWR_MULTISAMPLE_COUNT GetSampleCount(uint32_t numSamples) { @@ -302,4 +294,4 @@ bool isNonStandardPattern(const SWR_MULTISAMPLE_COUNT sampleCount, const SWR_MUL } } return !bIsStandard; -} \ No newline at end of file +} diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index 7609d51ed49..94a507139c1 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -28,7 +28,7 @@ #pragma once #include "common/formats.h" -#include "common/simdintrin.h" +#include "common/intrin.h" #include #include @@ -798,6 +798,13 @@ enum SWR_MULTISAMPLE_COUNT SWR_MULTISAMPLE_TYPE_COUNT }; +INLINE uint32_t GetNumSamples(SWR_MULTISAMPLE_COUNT sampleCount) // @llvm_func_start +{ + static const uint32_t sampleCountLUT[SWR_MULTISAMPLE_TYPE_COUNT] {1, 2, 4, 8, 16}; + assert(sampleCount < SWR_MULTISAMPLE_TYPE_COUNT); + return sampleCountLUT[sampleCount]; +} // @llvm_func_end + struct SWR_BLEND_STATE { // constant blend factor color in RGBA float @@ -951,43 +958,13 @@ public: INLINE const __m128i& TileSampleOffsetsX() const { return tileSampleOffsetsX; }; // @llvm_func INLINE const __m128i& TileSampleOffsetsY() const { return tileSampleOffsetsY; }; // @llvm_func - INLINE void PrecalcSampleData(int numSamples) // @llvm_func_start - { - for(int i = 0; i < numSamples; i++) - { - _vXi[i] = _mm_set1_epi32(_xi[i]); - _vYi[i] = _mm_set1_epi32(_yi[i]); - _vX[i] = _simd_set1_ps(_x[i]); - _vY[i] = _simd_set1_ps(_y[i]); - } - // precalculate the raster tile BB for the rasterizer. - CalcTileSampleOffsets(numSamples); - } // @llvm_func_end - + INLINE void PrecalcSampleData(int numSamples); //@llvm_func private: template - INLINE __m128i expandThenBlend4(uint32_t* min, uint32_t* max) // @llvm_func_start - { - __m128i vMin = _mm_set1_epi32(*min); - __m128i vMax = _mm_set1_epi32(*max); - return _simd_blend4_epi32(vMin, vMax); - } // @llvm_func_end + INLINE __m128i expandThenBlend4(uint32_t* min, uint32_t* max); // @llvm_func + INLINE void CalcTileSampleOffsets(int numSamples); // @llvm_func - INLINE void CalcTileSampleOffsets(int numSamples) // @llvm_func_start - { - auto minXi = std::min_element(std::begin(_xi), &_xi[numSamples]); - auto maxXi = std::max_element(std::begin(_xi), &_xi[numSamples]); - using xMask = std::integral_constant; - // BR(max), BL(min), UR(max), UL(min) - tileSampleOffsetsX = expandThenBlend4(minXi, maxXi); - - auto minYi = std::min_element(std::begin(_yi), &_yi[numSamples]); - auto maxYi = std::max_element(std::begin(_yi), &_yi[numSamples]); - using yMask = std::integral_constant; - // BR(max), BL(min), UR(max), UL(min) - tileSampleOffsetsY = expandThenBlend4(minYi, maxYi); - }; // @llvm_func_end // scalar sample values uint32_t _xi[SWR_MAX_NUM_MULTISAMPLES]; uint32_t _yi[SWR_MAX_NUM_MULTISAMPLES]; @@ -1000,8 +977,7 @@ private: simdscalar _vX[SWR_MAX_NUM_MULTISAMPLES]; simdscalar _vY[SWR_MAX_NUM_MULTISAMPLES]; __m128i tileSampleOffsetsX; - __m128i tileSampleOffsetsY; - + __m128i tileSampleOffsetsY; }; ////////////////////////////////////////////////////////////////////////// diff --git a/src/gallium/drivers/swr/rasterizer/core/state_funcs.h b/src/gallium/drivers/swr/rasterizer/core/state_funcs.h new file mode 100644 index 00000000000..eaf0094b626 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/state_funcs.h @@ -0,0 +1,68 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file state.h +* +* @brief Definitions for API state - complex function implementation. +* +******************************************************************************/ +#pragma once + +#include "core/state.h" +#include "common/simdintrin.h" + + +template +INLINE __m128i SWR_MULTISAMPLE_POS::expandThenBlend4(uint32_t* min, uint32_t* max) +{ + __m128i vMin = _mm_set1_epi32(*min); + __m128i vMax = _mm_set1_epi32(*max); + return _simd_blend4_epi32(vMin, vMax); +} + +INLINE void SWR_MULTISAMPLE_POS::PrecalcSampleData(int numSamples) +{ + for(int i = 0; i < numSamples; i++) + { + _vXi[i] = _mm_set1_epi32(_xi[i]); + _vYi[i] = _mm_set1_epi32(_yi[i]); + _vX[i] = _simd_set1_ps(_x[i]); + _vY[i] = _simd_set1_ps(_y[i]); + } + // precalculate the raster tile BB for the rasterizer. + CalcTileSampleOffsets(numSamples); +} + +INLINE void SWR_MULTISAMPLE_POS::CalcTileSampleOffsets(int numSamples) +{ + auto minXi = std::min_element(std::begin(_xi), &_xi[numSamples]); + auto maxXi = std::max_element(std::begin(_xi), &_xi[numSamples]); + using xMask = std::integral_constant; + // BR(max), BL(min), UR(max), UL(min) + tileSampleOffsetsX = expandThenBlend4(minXi, maxXi); + + auto minYi = std::min_element(std::begin(_yi), &_yi[numSamples]); + auto maxYi = std::max_element(std::begin(_yi), &_yi[numSamples]); + using yMask = std::integral_constant; + // BR(max), BL(min), UR(max), UL(min) + tileSampleOffsetsY = expandThenBlend4(minYi, maxYi); +}; diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h index 28d10c755ef..a8c58d9d4ef 100644 --- a/src/gallium/drivers/swr/rasterizer/core/utils.h +++ b/src/gallium/drivers/swr/rasterizer/core/utils.h @@ -31,50 +31,10 @@ #include #include #include "common/os.h" -#include "common/simdintrin.h" +#include "common/intrin.h" #include "common/swr_assert.h" #include "core/api.h" -#if defined(_WIN64) || defined(__x86_64__) -#define _MM_INSERT_EPI64 _mm_insert_epi64 -#define _MM_EXTRACT_EPI64 _mm_extract_epi64 -#else -INLINE int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx) -{ - OSALIGNLINE(uint32_t) elems[4]; - _mm_store_si128((__m128i*)elems, a); - if (ndx == 0) - { - uint64_t foo = elems[0]; - foo |= (uint64_t)elems[1] << 32; - return foo; - } - else - { - uint64_t foo = elems[2]; - foo |= (uint64_t)elems[3] << 32; - return foo; - } -} - -INLINE __m128i _MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx) -{ - OSALIGNLINE(int64_t) elems[2]; - _mm_store_si128((__m128i*)elems, a); - if (ndx == 0) - { - elems[0] = b; - } - else - { - elems[1] = b; - } - __m128i out; - out = _mm_load_si128((const __m128i*)elems); - return out; -} -#endif - struct simdBBox { simdscalari ymin; @@ -91,857 +51,8 @@ struct simd16BBox simd16scalari xmin; simd16scalari xmax; }; - #endif -INLINE -void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3) -{ - __m128i row0i = _mm_castps_si128(row0); - __m128i row1i = _mm_castps_si128(row1); - __m128i row2i = _mm_castps_si128(row2); - __m128i row3i = _mm_castps_si128(row3); - - __m128i vTemp = row2i; - row2i = _mm_unpacklo_epi32(row2i, row3i); - vTemp = _mm_unpackhi_epi32(vTemp, row3i); - - row3i = row0i; - row0i = _mm_unpacklo_epi32(row0i, row1i); - row3i = _mm_unpackhi_epi32(row3i, row1i); - - row1i = row0i; - row0i = _mm_unpacklo_epi64(row0i, row2i); - row1i = _mm_unpackhi_epi64(row1i, row2i); - - row2i = row3i; - row2i = _mm_unpacklo_epi64(row2i, vTemp); - row3i = _mm_unpackhi_epi64(row3i, vTemp); - - row0 = _mm_castsi128_ps(row0i); - row1 = _mm_castsi128_ps(row1i); - row2 = _mm_castsi128_ps(row2i); - row3 = _mm_castsi128_ps(row3i); -} - -INLINE -void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3) -{ - __m128i vTemp = row2; - row2 = _mm_unpacklo_epi32(row2, row3); - vTemp = _mm_unpackhi_epi32(vTemp, row3); - - row3 = row0; - row0 = _mm_unpacklo_epi32(row0, row1); - row3 = _mm_unpackhi_epi32(row3, row1); - - row1 = row0; - row0 = _mm_unpacklo_epi64(row0, row2); - row1 = _mm_unpackhi_epi64(row1, row2); - - row2 = row3; - row2 = _mm_unpacklo_epi64(row2, vTemp); - row3 = _mm_unpackhi_epi64(row3, vTemp); -} - -#if KNOB_SIMD_WIDTH == 8 -INLINE -void vTranspose3x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2) -{ - simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 - simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps()); //y0w0y1w1 y4w4y5w5 - simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4 - simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5 - - r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7 - r1rx = _simd_unpackhi_ps(vSrc1, _simd_setzero_ps()); //y2w2y3w3 y6w6yw77 - simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6 - simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7 - - vDst[0] = _mm256_castps256_ps128(r02r1xlolo); - vDst[1] = _mm256_castps256_ps128(r02r1xlohi); - vDst[2] = _mm256_castps256_ps128(r02r1xhilo); - vDst[3] = _mm256_castps256_ps128(r02r1xhihi); - - vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1); - vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1); - vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1); - vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1); -} - -INLINE -void vTranspose4x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3) -{ - simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 - simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5 - simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4 - simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5 - - r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7 - r1rx = _simd_unpackhi_ps(vSrc1, vSrc3); //y2w2y3w3 y6w6yw77 - simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6 - simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7 - - vDst[0] = _mm256_castps256_ps128(r02r1xlolo); - vDst[1] = _mm256_castps256_ps128(r02r1xlohi); - vDst[2] = _mm256_castps256_ps128(r02r1xhilo); - vDst[3] = _mm256_castps256_ps128(r02r1xhihi); - - vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1); - vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1); - vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1); - vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1); -} - -#if ENABLE_AVX512_SIMD16 -INLINE -void vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3) -{ - const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); // pre-permute input to setup the right order after all the unpacking - - simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r - simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g - simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b - simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a - - simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2); - simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3); - simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2); - simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3); - - dst[0] = _simd16_unpacklo_ps(rblo, galo); - dst[1] = _simd16_unpackhi_ps(rblo, galo); - dst[2] = _simd16_unpacklo_ps(rbhi, gahi); - dst[3] = _simd16_unpackhi_ps(rbhi, gahi); -} - -#endif -INLINE -void vTranspose8x8(simdscalar (&vDst)[8], const simdscalar &vMask0, const simdscalar &vMask1, const simdscalar &vMask2, const simdscalar &vMask3, const simdscalar &vMask4, const simdscalar &vMask5, const simdscalar &vMask6, const simdscalar &vMask7) -{ - simdscalar __t0 = _simd_unpacklo_ps(vMask0, vMask1); - simdscalar __t1 = _simd_unpackhi_ps(vMask0, vMask1); - simdscalar __t2 = _simd_unpacklo_ps(vMask2, vMask3); - simdscalar __t3 = _simd_unpackhi_ps(vMask2, vMask3); - simdscalar __t4 = _simd_unpacklo_ps(vMask4, vMask5); - simdscalar __t5 = _simd_unpackhi_ps(vMask4, vMask5); - simdscalar __t6 = _simd_unpacklo_ps(vMask6, vMask7); - simdscalar __t7 = _simd_unpackhi_ps(vMask6, vMask7); - simdscalar __tt0 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0)); - simdscalar __tt1 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2)); - simdscalar __tt2 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0)); - simdscalar __tt3 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2)); - simdscalar __tt4 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0)); - simdscalar __tt5 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2)); - simdscalar __tt6 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0)); - simdscalar __tt7 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2)); - vDst[0] = _simd_permute2f128_ps(__tt0, __tt4, 0x20); - vDst[1] = _simd_permute2f128_ps(__tt1, __tt5, 0x20); - vDst[2] = _simd_permute2f128_ps(__tt2, __tt6, 0x20); - vDst[3] = _simd_permute2f128_ps(__tt3, __tt7, 0x20); - vDst[4] = _simd_permute2f128_ps(__tt0, __tt4, 0x31); - vDst[5] = _simd_permute2f128_ps(__tt1, __tt5, 0x31); - vDst[6] = _simd_permute2f128_ps(__tt2, __tt6, 0x31); - vDst[7] = _simd_permute2f128_ps(__tt3, __tt7, 0x31); -} - -INLINE -void vTranspose8x8(simdscalar (&vDst)[8], const simdscalari &vMask0, const simdscalari &vMask1, const simdscalari &vMask2, const simdscalari &vMask3, const simdscalari &vMask4, const simdscalari &vMask5, const simdscalari &vMask6, const simdscalari &vMask7) -{ - vTranspose8x8(vDst, _simd_castsi_ps(vMask0), _simd_castsi_ps(vMask1), _simd_castsi_ps(vMask2), _simd_castsi_ps(vMask3), - _simd_castsi_ps(vMask4), _simd_castsi_ps(vMask5), _simd_castsi_ps(vMask6), _simd_castsi_ps(vMask7)); -} -#endif - -////////////////////////////////////////////////////////////////////////// -/// TranposeSingleComponent -////////////////////////////////////////////////////////////////////////// -template -struct TransposeSingleComponent -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Pass-thru for single component. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) - { - memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8); - } -#if ENABLE_AVX512_SIMD16 - - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) - { - memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8); - } -#endif -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose8_8_8_8 -////////////////////////////////////////////////////////////////////////// -struct Transpose8_8_8_8 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) - { - simdscalari src = _simd_load_si((const simdscalari*)pSrc); - -#if KNOB_SIMD_WIDTH == 8 -#if KNOB_ARCH <= KNOB_ARCH_AVX - __m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg - __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa - __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb - __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa - __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg - __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3); // babababababababa - __m128i c0123lo = _mm_unpacklo_epi16(c01, c23); // rgbargbargbargba - __m128i c0123hi = _mm_unpackhi_epi16(c01, c23); // rgbargbargbargba - _mm_store_si128((__m128i*)pDst, c0123lo); - _mm_store_si128((__m128i*)(pDst + 16), c0123hi); -#else - simdscalari dst01 = _simd_shuffle_epi8(src, - _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800)); - simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01); - dst23 = _simd_shuffle_epi8(dst23, - _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080)); - simdscalari dst = _simd_or_si(dst01, dst23); - _simd_store_si((simdscalari*)pDst, dst); -#endif -#else -#error Unsupported vector width -#endif - } -#if ENABLE_AVX512_SIMD16 - - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) - { - __m128i src0 = _mm_load_si128(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr - __m128i src1 = _mm_load_si128(reinterpret_cast(pSrc) + 1); // gggggggggggggggg - __m128i src2 = _mm_load_si128(reinterpret_cast(pSrc) + 2); // bbbbbbbbbbbbbbbb - __m128i src3 = _mm_load_si128(reinterpret_cast(pSrc) + 3); // aaaaaaaaaaaaaaaa - - simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0); - simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1); - simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2); - simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3); - - simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8); - simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16); - simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24); - - simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3)); - - _simd16_store_si(reinterpret_cast(pDst), dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba - } -#endif -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose8_8_8 -////////////////////////////////////////////////////////////////////////// -struct Transpose8_8_8 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose8_8 -////////////////////////////////////////////////////////////////////////// -struct Transpose8_8 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 8_8 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) - { -#if KNOB_SIMD_WIDTH == 8 - simdscalari src = _simd_load_si((const simdscalari*)pSrc); - - __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg - __m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg - rg = _mm_unpacklo_epi8(rg, g); - _mm_store_si128((__m128i*)pDst, rg); -#else -#error Unsupported vector width -#endif - } -#if ENABLE_AVX512_SIMD16 - - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) - { - __m128i src0 = _mm_load_si128(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr - __m128i src1 = _mm_load_si128(reinterpret_cast(pSrc) + 1); // gggggggggggggggg - - simdscalari cvt0 = _simd_cvtepu8_epi16(src0); - simdscalari cvt1 = _simd_cvtepu8_epi16(src1); - - simdscalari shl1 = _simd_slli_epi32(cvt1, 8); - - simdscalari dst = _simd_or_si(cvt0, shl1); - - _simd_store_si(reinterpret_cast(pDst), dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg - } -#endif -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose32_32_32_32 -////////////////////////////////////////////////////////////////////////// -struct Transpose32_32_32_32 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) - { -#if KNOB_SIMD_WIDTH == 8 - simdscalar src0 = _simd_load_ps((const float*)pSrc); - simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); - simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); - simdscalar src3 = _simd_load_ps((const float*)pSrc + 24); - - __m128 vDst[8]; - vTranspose4x8(vDst, src0, src1, src2, src3); - _mm_store_ps((float*)pDst, vDst[0]); - _mm_store_ps((float*)pDst+4, vDst[1]); - _mm_store_ps((float*)pDst+8, vDst[2]); - _mm_store_ps((float*)pDst+12, vDst[3]); - _mm_store_ps((float*)pDst+16, vDst[4]); - _mm_store_ps((float*)pDst+20, vDst[5]); - _mm_store_ps((float*)pDst+24, vDst[6]); - _mm_store_ps((float*)pDst+28, vDst[7]); -#else -#error Unsupported vector width -#endif - } -#if ENABLE_AVX512_SIMD16 - - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) - { - simd16scalar src0 = _simd16_load_ps(reinterpret_cast(pSrc)); - simd16scalar src1 = _simd16_load_ps(reinterpret_cast(pSrc) + 16); - simd16scalar src2 = _simd16_load_ps(reinterpret_cast(pSrc) + 32); - simd16scalar src3 = _simd16_load_ps(reinterpret_cast(pSrc) + 48); - - simd16scalar dst[4]; - - vTranspose4x16(dst, src0, src1, src2, src3); - - _simd16_store_ps(reinterpret_cast(pDst) + 0, dst[0]); - _simd16_store_ps(reinterpret_cast(pDst) + 16, dst[1]); - _simd16_store_ps(reinterpret_cast(pDst) + 32, dst[2]); - _simd16_store_ps(reinterpret_cast(pDst) + 48, dst[3]); - } -#endif -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose32_32_32 -////////////////////////////////////////////////////////////////////////// -struct Transpose32_32_32 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) - { -#if KNOB_SIMD_WIDTH == 8 - simdscalar src0 = _simd_load_ps((const float*)pSrc); - simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); - simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); - - __m128 vDst[8]; - vTranspose3x8(vDst, src0, src1, src2); - _mm_store_ps((float*)pDst, vDst[0]); - _mm_store_ps((float*)pDst + 4, vDst[1]); - _mm_store_ps((float*)pDst + 8, vDst[2]); - _mm_store_ps((float*)pDst + 12, vDst[3]); - _mm_store_ps((float*)pDst + 16, vDst[4]); - _mm_store_ps((float*)pDst + 20, vDst[5]); - _mm_store_ps((float*)pDst + 24, vDst[6]); - _mm_store_ps((float*)pDst + 28, vDst[7]); -#else -#error Unsupported vector width -#endif - } -#if ENABLE_AVX512_SIMD16 - - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) - { - simd16scalar src0 = _simd16_load_ps(reinterpret_cast(pSrc)); - simd16scalar src1 = _simd16_load_ps(reinterpret_cast(pSrc) + 16); - simd16scalar src2 = _simd16_load_ps(reinterpret_cast(pSrc) + 32); - simd16scalar src3 = _simd16_setzero_ps(); - - simd16scalar dst[4]; - - vTranspose4x16(dst, src0, src1, src2, src3); - - _simd16_store_ps(reinterpret_cast(pDst) + 0, dst[0]); - _simd16_store_ps(reinterpret_cast(pDst) + 16, dst[1]); - _simd16_store_ps(reinterpret_cast(pDst) + 32, dst[2]); - _simd16_store_ps(reinterpret_cast(pDst) + 48, dst[3]); - } -#endif -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose32_32 -////////////////////////////////////////////////////////////////////////// -struct Transpose32_32 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 32_32 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) - { -#if KNOB_SIMD_WIDTH == 8 - const float* pfSrc = (const float*)pSrc; - __m128 src_r0 = _mm_load_ps(pfSrc + 0); - __m128 src_r1 = _mm_load_ps(pfSrc + 4); - __m128 src_g0 = _mm_load_ps(pfSrc + 8); - __m128 src_g1 = _mm_load_ps(pfSrc + 12); - - __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0); - __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0); - __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1); - __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1); - - float* pfDst = (float*)pDst; - _mm_store_ps(pfDst + 0, dst0); - _mm_store_ps(pfDst + 4, dst1); - _mm_store_ps(pfDst + 8, dst2); - _mm_store_ps(pfDst + 12, dst3); -#else -#error Unsupported vector width -#endif - } -#if ENABLE_AVX512_SIMD16 - - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) - { - simd16scalar src0 = _simd16_load_ps(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr - simd16scalar src1 = _simd16_load_ps(reinterpret_cast(pSrc) + 16); // gggggggggggggggg - simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD - simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF - - simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7 - simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF - - simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7 - simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF - - _simd16_store_ps(reinterpret_cast(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg - _simd16_store_ps(reinterpret_cast(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg - } -#endif -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose16_16_16_16 -////////////////////////////////////////////////////////////////////////// -struct Transpose16_16_16_16 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) - { -#if KNOB_SIMD_WIDTH == 8 - simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); - simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari))); - - __m128i src_r = _mm256_extractf128_si256(src_rg, 0); - __m128i src_g = _mm256_extractf128_si256(src_rg, 1); - __m128i src_b = _mm256_extractf128_si256(src_ba, 0); - __m128i src_a = _mm256_extractf128_si256(src_ba, 1); - - __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); - __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); - __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); - __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); - - __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); - __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); - __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); - __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); - - _mm_store_si128(((__m128i*)pDst) + 0, dst0); - _mm_store_si128(((__m128i*)pDst) + 1, dst1); - _mm_store_si128(((__m128i*)pDst) + 2, dst2); - _mm_store_si128(((__m128i*)pDst) + 3, dst3); -#else -#error Unsupported vector width -#endif - } -#if ENABLE_AVX512_SIMD16 - - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) - { - simdscalari src0 = _simd_load_si(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr - simdscalari src1 = _simd_load_si(reinterpret_cast(pSrc) + 1); // gggggggggggggggg - simdscalari src2 = _simd_load_si(reinterpret_cast(pSrc) + 2); // bbbbbbbbbbbbbbbb - simdscalari src3 = _simd_load_si(reinterpret_cast(pSrc) + 3); // aaaaaaaaaaaaaaaa - - simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB - simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF - simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB - simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF - - simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 - simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB - simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD - simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF - - simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 - simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 - simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB - simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF - - _simd_store_si(reinterpret_cast(pDst) + 0, dst0); // rgbargbargbargba - _simd_store_si(reinterpret_cast(pDst) + 1, dst1); // rgbargbargbargba - _simd_store_si(reinterpret_cast(pDst) + 2, dst2); // rgbargbargbargba - _simd_store_si(reinterpret_cast(pDst) + 3, dst3); // rgbargbargbargba - } -#endif -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose16_16_16 -////////////////////////////////////////////////////////////////////////// -struct Transpose16_16_16 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) - { -#if KNOB_SIMD_WIDTH == 8 - simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); - - __m128i src_r = _mm256_extractf128_si256(src_rg, 0); - __m128i src_g = _mm256_extractf128_si256(src_rg, 1); - __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari))); - __m128i src_a = _mm_undefined_si128(); - - __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); - __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); - __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); - __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); - - __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); - __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); - __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); - __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); - - _mm_store_si128(((__m128i*)pDst) + 0, dst0); - _mm_store_si128(((__m128i*)pDst) + 1, dst1); - _mm_store_si128(((__m128i*)pDst) + 2, dst2); - _mm_store_si128(((__m128i*)pDst) + 3, dst3); -#else -#error Unsupported vector width -#endif - } -#if ENABLE_AVX512_SIMD16 - - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) - { - simdscalari src0 = _simd_load_si(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr - simdscalari src1 = _simd_load_si(reinterpret_cast(pSrc) + 1); // gggggggggggggggg - simdscalari src2 = _simd_load_si(reinterpret_cast(pSrc) + 2); // bbbbbbbbbbbbbbbb - simdscalari src3 = _simd_setzero_si(); // aaaaaaaaaaaaaaaa - - simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB - simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF - simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB - simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF - - simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 - simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB - simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD - simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF - - simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 - simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 - simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB - simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF - - _simd_store_si(reinterpret_cast(pDst) + 0, dst0); // rgbargbargbargba - _simd_store_si(reinterpret_cast(pDst) + 1, dst1); // rgbargbargbargba - _simd_store_si(reinterpret_cast(pDst) + 2, dst2); // rgbargbargbargba - _simd_store_si(reinterpret_cast(pDst) + 3, dst3); // rgbargbargbargba - } -#endif -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose16_16 -////////////////////////////////////////////////////////////////////////// -struct Transpose16_16 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 16_16 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) - { -#if KNOB_SIMD_WIDTH == 8 - simdscalar src = _simd_load_ps((const float*)pSrc); - - __m128 comp0 = _mm256_castps256_ps128(src); - __m128 comp1 = _mm256_extractf128_ps(src, 1); - - __m128i comp0i = _mm_castps_si128(comp0); - __m128i comp1i = _mm_castps_si128(comp1); - - __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i); - __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i); - - _mm_store_si128((__m128i*)pDst, resLo); - _mm_store_si128((__m128i*)pDst + 1, resHi); -#else -#error Unsupported vector width -#endif - } -#if ENABLE_AVX512_SIMD16 - - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) - { - simdscalari src0 = _simd_load_si(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr - simdscalari src1 = _simd_load_si(reinterpret_cast(pSrc) + 1); // gggggggggggggggg - - simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB - simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF - - simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7 - simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF - - _simd_store_si(reinterpret_cast(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg - _simd_store_si(reinterpret_cast(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg - } -#endif -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose24_8 -////////////////////////////////////////////////////////////////////////// -struct Transpose24_8 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 24_8 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose32_8_24 -////////////////////////////////////////////////////////////////////////// -struct Transpose32_8_24 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose4_4_4_4 -////////////////////////////////////////////////////////////////////////// -struct Transpose4_4_4_4 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose5_6_5 -////////////////////////////////////////////////////////////////////////// -struct Transpose5_6_5 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose9_9_9_5 -////////////////////////////////////////////////////////////////////////// -struct Transpose9_9_9_5 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose5_5_5_1 -////////////////////////////////////////////////////////////////////////// -struct Transpose5_5_5_1 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose1_5_5_5 -////////////////////////////////////////////////////////////////////////// -struct Transpose1_5_5_5 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose10_10_10_2 -////////////////////////////////////////////////////////////////////////// -struct Transpose10_10_10_2 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose11_11_10 -////////////////////////////////////////////////////////////////////////// -struct Transpose11_11_10 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data. - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose64 -////////////////////////////////////////////////////////////////////////// -struct Transpose64 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose64_64 -////////////////////////////////////////////////////////////////////////// -struct Transpose64_64 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose64_64_64 -////////////////////////////////////////////////////////////////////////// -struct Transpose64_64_64 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif -}; - -////////////////////////////////////////////////////////////////////////// -/// Transpose64_64_64_64 -////////////////////////////////////////////////////////////////////////// -struct Transpose64_64_64_64 -{ - ////////////////////////////////////////////////////////////////////////// - /// @brief Performs an SOA to AOS conversion - /// @param pSrc - source data in SOA form - /// @param pDst - output data in AOS form - static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif -}; // helper function to unroll loops template @@ -1029,7 +140,7 @@ template INLINE static bool IsPow2(T value) { - return value == (value & (0 - value)); + return value == (value & (T(0) - value)); } ////////////////////////////////////////////////////////////////////////// diff --git a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h index 226d7dc66b7..c2a87d85dd1 100644 --- a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h +++ b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h @@ -28,7 +28,7 @@ #pragma once #include "core/state.h" -#include "common/simdintrin.h" +#include "common/intrin.h" template struct TilingTraits -- 2.30.2