From 97dab87a22ef3be7d6b35200204dfbca3e7b4399 Mon Sep 17 00:00:00 2001
From: Tim Rowley
Date: Tue, 28 Mar 2017 16:18:21 -0500
Subject: [PATCH] swr: [rasterizer core/memory] Move intrinsics to _simd functions

Reviewed-by: Bruce Cherniak
---
 .../drivers/swr/rasterizer/core/backend.cpp   |   8 +-
 .../drivers/swr/rasterizer/core/backend.h     |  56 ++++----
 .../swr/rasterizer/core/format_types.h        |   7 +
 .../drivers/swr/rasterizer/core/utils.h       | 130 ++++++++----------
 .../drivers/swr/rasterizer/memory/StoreTile.h |  28 ++--
 5 files changed, 112 insertions(+), 117 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index b76b36fcbcb..e3ed52430af 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -419,10 +419,10 @@ void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint3
 }
 
 #if KNOB_SIMD_WIDTH == 8
-const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
-const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
-const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
-const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
+const simdscalar vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
+const simdscalar vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
+const simdscalar vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
+const simdscalar vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
 #else
 #error Unsupported vector width
 #endif
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h
index 82765c2e877..ade9afccd95 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.h
@@ -64,10 +64,10 @@ enum SWR_BACKEND_FUNCS
 };
 
 #if KNOB_SIMD_WIDTH == 8
-extern const __m256 vCenterOffsetsX;
-extern const __m256 vCenterOffsetsY;
-extern const __m256 vULOffsetsX;
-extern const __m256 vULOffsetsY;
+extern const simdscalar vCenterOffsetsX;
+extern const simdscalar vCenterOffsetsY;
+extern const simdscalar vULOffsetsX;
+extern const simdscalar vULOffsetsY;
 #define MASK 0xff
 #endif
@@ -151,8 +151,8 @@ struct generateInputCoverage
         // will need to update for avx512
         assert(KNOB_SIMD_WIDTH == 8);
 
-        __m256i mask[2];
-        __m256i sampleCoverage[2];
+        simdscalari mask[2];
+        simdscalari sampleCoverage[2];
 
         if(T::bIsCenterPattern)
         {
@@ -220,9 +220,9 @@ struct generateInputCoverage
             mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
 
         // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
-        __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
+        simdscalari packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
 
-        __m256i packedCoverage1;
+        simdscalari packedCoverage1;
         if(T::MultisampleT::numSamples > 8)
         {
             // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
@@ -231,11 +231,11 @@ struct generateInputCoverage
 #if (KNOB_ARCH == KNOB_ARCH_AVX)
         // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
-        __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
-        __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+        simdscalari hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
+        simdscalar shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
         packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
 
-        __m256i packedSampleCoverage;
+        simdscalari packedSampleCoverage;
         if(T::MultisampleT::numSamples > 8)
         {
             // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
@@ -250,11 +250,11 @@ struct generateInputCoverage
             packedSampleCoverage = packedCoverage0;
         }
 
 #else
-        __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
+        simdscalari permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
         // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
         packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
 
-        __m256i packedSampleCoverage;
+        simdscalari packedSampleCoverage;
         if(T::MultisampleT::numSamples > 8)
         {
             permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
@@ -286,7 +286,7 @@ struct generateInputCoverage
         }
     }
 
-    INLINE generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
+    INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
     {
         uint32_t inputMask[KNOB_SIMD_WIDTH];
         generateInputCoverage(coverageMask, inputMask, sampleMask);
@@ -298,12 +298,12 @@ struct generateInputCoverage
 template
 struct generateInputCoverage
 {
-    INLINE generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
+    INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
     {
         // will need to update for avx512
         assert(KNOB_SIMD_WIDTH == 8);
-        __m256i vec = _mm256_set1_epi32(coverageMask[0]);
-        const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+        simdscalari vec = _mm256_set1_epi32(coverageMask[0]);
+        const simdscalari bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
         vec = _simd_and_si(vec, bit);
         vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
         vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
@@ -376,29 +376,29 @@ INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS
     vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
 
     // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
-    static const __m256i vFullyCoveredMask = T::MultisampleT::FullSampleMask();
-    __m256i vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
-    __m256i vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
+    static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask();
+    simdscalari vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
+    simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
 
-    static const __m256i vZero = _simd_setzero_si();
-    const __m256i vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
-    __m256i vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
-    __m256i vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
-    __m256i vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
+    static const simdscalari vZero = _simd_setzero_si();
+    const simdscalari vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
+    simdscalari vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
+    simdscalari vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
+    simdscalari vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
 
-    __m256i vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
+    simdscalari vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
 
     // set the centroid position based on results from above
     psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
     psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
 
     // Case (3a) No samples covered and partial sample mask
-    __m256i vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
+    simdscalari vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
     // sample mask should never be all 0's for this case, but handle it anyways
     unsigned long firstCoveredSampleMaskSample = 0;
     (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
 
-    __m256i vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
+    simdscalari vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
 
     vXSample = _simd_set1_ps(samplePos.X(firstCoveredSampleMaskSample));
     vYSample = _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample));
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h
index 4c94c312ba2..f33988aefb7 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_types.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h
@@ -1289,6 +1289,13 @@ struct ComponentTraits
         return CompType[comp];
     }
 
+    INLINE static constexpr uint32_t GetConstBPC(uint32_t comp)
+    {
+        return (comp == 3) ? NumBitsW :
+               ((comp == 2) ? NumBitsZ :
+               ((comp == 1) ? NumBitsY : NumBitsX) );
+    }
+
     INLINE static uint32_t GetBPC(uint32_t comp)
     {
         static const uint32_t MyBpc[4]{ NumBitsX, NumBitsY, NumBitsZ, NumBitsW };
diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h
index 3a0eb257165..9dfa16a529e 100644
--- a/src/gallium/drivers/swr/rasterizer/core/utils.h
+++ b/src/gallium/drivers/swr/rasterizer/core/utils.h
@@ -133,65 +133,53 @@ void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
     row3 = _mm_unpackhi_epi64(row3, vTemp);
 }
 
-#define GCC_VERSION (__GNUC__ * 10000 \
-                     + __GNUC_MINOR__ * 100 \
-                     + __GNUC_PATCHLEVEL__)
-
-#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900))
-#define _mm_undefined_ps _mm_setzero_ps
-#define _mm_undefined_si128 _mm_setzero_si128
-#if KNOB_SIMD_WIDTH == 8
-#define _mm256_undefined_ps _mm256_setzero_ps
-#endif
-#endif
-
 #if KNOB_SIMD_WIDTH == 8
 INLINE
-void vTranspose3x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2)
+void vTranspose3x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2)
 {
-    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                    //x0z0x1z1 x4z4x5z5
-    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps());    //y0w0y1w1 y4w4y5w5
-    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);                //x0y0z0w0 x4y4z4w4
-    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);                //x1y1z1w1 x5y5z5w5
+    simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2);                  //x0z0x1z1 x4z4x5z5
+    simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps());     //y0w0y1w1 y4w4y5w5
+    simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx);              //x0y0z0w0 x4y4z4w4
+    simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx);              //x1y1z1w1 x5y5z5w5
 
-    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                           //x2z2x3z3 x6z6x7z7
-    r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps());           //y2w2y3w3 y6w6yw77
-    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);                //x2y2z2w2 x6y6z6w6
-    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);                //x3y3z3w3 x7y7z7w7
+    r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2);                             //x2z2x3z3 x6z6x7z7
+    r1rx = _simd_unpackhi_ps(vSrc1, _simd_setzero_ps());                //y2w2y3w3 y6w6yw77
+    simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);              //x2y2z2w2 x6y6z6w6
+    simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);              //x3y3z3w3 x7y7z7w7
 
     vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
     vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
     vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
     vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
 
-    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
-    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
-    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
-    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
+    vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
+    vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
+    vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
+    vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
 }
 
 INLINE
-void vTranspose4x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2, const __m256 &vSrc3)
+void vTranspose4x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3)
 {
-    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);        //x0z0x1z1 x4z4x5z5
-    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3);        //y0w0y1w1 y4w4y5w5
-    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);    //x0y0z0w0 x4y4z4w4
-    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);    //x1y1z1w1 x5y5z5w5
+    simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2);      //x0z0x1z1 x4z4x5z5
+    simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3);      //y0w0y1w1 y4w4y5w5
+    simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx);  //x0y0z0w0 x4y4z4w4
+    simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx);  //x1y1z1w1 x5y5z5w5
 
-    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);               //x2z2x3z3 x6z6x7z7
-    r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3) ;              //y2w2y3w3 y6w6yw77
-    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);    //x2y2z2w2 x6y6z6w6
-    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);    //x3y3z3w3 x7y7z7w7
+    r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2);                 //x2z2x3z3 x6z6x7z7
+    r1rx = _simd_unpackhi_ps(vSrc1, vSrc3);                 //y2w2y3w3 y6w6yw77
+    simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);  //x2y2z2w2 x6y6z6w6
+    simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);  //x3y3z3w3 x7y7z7w7
 
     vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
     vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
     vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
     vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
 
-    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
-    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
-    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
-    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
+    vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
+    vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
+    vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
+    vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
 }
 
 #if ENABLE_AVX512_SIMD16
@@ -218,39 +206,39 @@ void vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd1
 #endif
 
 INLINE
-void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
+void vTranspose8x8(simdscalar (&vDst)[8], const simdscalar &vMask0, const simdscalar &vMask1, const simdscalar &vMask2, const simdscalar &vMask3, const simdscalar &vMask4, const simdscalar &vMask5, const simdscalar &vMask6, const simdscalar &vMask7)
 {
-    __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
-    __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
-    __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
-    __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
-    __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
-    __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
-    __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
-    __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
-    __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
-    __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
-    __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
-    __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
-    __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
-    __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
-    __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
-    __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
-    vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
-    vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
-    vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
-    vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
-    vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
-    vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
-    vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
-    vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
+    simdscalar __t0 = _simd_unpacklo_ps(vMask0, vMask1);
+    simdscalar __t1 = _simd_unpackhi_ps(vMask0, vMask1);
+    simdscalar __t2 = _simd_unpacklo_ps(vMask2, vMask3);
+    simdscalar __t3 = _simd_unpackhi_ps(vMask2, vMask3);
+    simdscalar __t4 = _simd_unpacklo_ps(vMask4, vMask5);
+    simdscalar __t5 = _simd_unpackhi_ps(vMask4, vMask5);
+    simdscalar __t6 = _simd_unpacklo_ps(vMask6, vMask7);
+    simdscalar __t7 = _simd_unpackhi_ps(vMask6, vMask7);
+    simdscalar __tt0 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
+    simdscalar __tt1 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
+    simdscalar __tt2 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
+    simdscalar __tt3 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
+    simdscalar __tt4 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
+    simdscalar __tt5 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
+    simdscalar __tt6 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
+    simdscalar __tt7 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
+    vDst[0] = _simd_permute2f128_ps(__tt0, __tt4, 0x20);
+    vDst[1] = _simd_permute2f128_ps(__tt1, __tt5, 0x20);
+    vDst[2] = _simd_permute2f128_ps(__tt2, __tt6, 0x20);
+    vDst[3] = _simd_permute2f128_ps(__tt3, __tt7, 0x20);
+    vDst[4] = _simd_permute2f128_ps(__tt0, __tt4, 0x31);
+    vDst[5] = _simd_permute2f128_ps(__tt1, __tt5, 0x31);
+    vDst[6] = _simd_permute2f128_ps(__tt2, __tt6, 0x31);
+    vDst[7] = _simd_permute2f128_ps(__tt3, __tt7, 0x31);
 }
 
 INLINE
-void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
+void vTranspose8x8(simdscalar (&vDst)[8], const simdscalari &vMask0, const simdscalari &vMask1, const simdscalari &vMask2, const simdscalari &vMask3, const simdscalari &vMask4, const simdscalari &vMask5, const simdscalari &vMask6, const simdscalari &vMask7)
 {
-    vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
-        _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
+    vTranspose8x8(vDst, _simd_castsi_ps(vMask0), _simd_castsi_ps(vMask1), _simd_castsi_ps(vMask2), _simd_castsi_ps(vMask3),
+        _simd_castsi_ps(vMask4), _simd_castsi_ps(vMask5), _simd_castsi_ps(vMask6), _simd_castsi_ps(vMask7));
 }
 #endif
@@ -303,12 +291,12 @@ struct Transpose8_8_8_8
         _mm_store_si128((__m128i*)pDst, c0123lo);
         _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
 #else
-        simdscalari dst01 = _mm256_shuffle_epi8(src,
-            _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
+        simdscalari dst01 = _simd_shuffle_epi8(src,
+            _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
         simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
-        dst23 = _mm256_shuffle_epi8(dst23,
-            _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
-        simdscalari dst = _mm256_or_si256(dst01, dst23);
+        dst23 = _simd_shuffle_epi8(dst23,
+            _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
+        simdscalari dst = _simd_or_si(dst01, dst23);
 
         _simd_store_si((simdscalari*)pDst, dst);
 #endif
 #else
diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
index f0fdc8c7083..ffde574c035 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
@@ -509,9 +509,9 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
         // pack
         simdscalari packed = _simd_castps_si(dst.x);
-        packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits::GetBPC(0)));
-        packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits::GetBPC(0) +
-                                                                              FormatTraits::GetBPC(1)));
+        packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits::GetConstBPC(0)));
+        packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits::GetConstBPC(0) +
+                                                                              FormatTraits::GetConstBPC(1)));
 
         // pack low 16 bits of each 32 bit lane to low 128 bits of dst
         uint32_t *pPacked = (uint32_t*)&packed;
@@ -727,10 +727,10 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst
         vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits::fromFloat(3)));
 
     // moving to 8 wide integer vector types
-    __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
-    __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
-    __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
-    __m256i src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
+    simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
+    simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
+    simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
+    simdscalari src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
 
 #if KNOB_ARCH <= KNOB_ARCH_AVX
 
@@ -766,7 +766,7 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst
     __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0);   // abgrabgrabgrabgrabgrabgrabgrabgr
     __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
 
-    __m256i final = _mm256_castsi128_si256(vRow00);
+    simdscalari final = _mm256_castsi128_si256(vRow00);
     final = _mm256_insertf128_si256(final, vRow10, 1);
 
 #else
@@ -779,7 +779,7 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst
     src0 = _mm256_or_si256(src0, src1);
     src2 = _mm256_or_si256(src2, src3);
 
-    __m256i final = _mm256_or_si256(src0, src2);
+    simdscalari final = _mm256_or_si256(src0, src2);
 
     // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
     final = _mm256_permute4x64_epi64(final, 0xD8);
@@ -886,9 +886,9 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_
         vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits::fromFloat(2)));
 
     // moving to 8 wide integer vector types
-    __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
-    __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
-    __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
+    simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
+    simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
+    simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
 
 #if KNOB_ARCH <= KNOB_ARCH_AVX
 
@@ -918,7 +918,7 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_
    __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0);   // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
    __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
 
-    __m256i final = _mm256_castsi128_si256(vRow00);
+    simdscalari final = _mm256_castsi128_si256(vRow00);
     final = _mm256_insertf128_si256(final, vRow10, 1);
 
 #else
@@ -929,7 +929,7 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_
     src0 = _mm256_or_si256(src0, src1);
 
-    __m256i final = _mm256_or_si256(src0, src2);
+    simdscalari final = _mm256_or_si256(src0, src2);
 
     // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
    final = _mm256_permute4x64_epi64(final, 0xD8);
-- 
2.30.2
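
For context on the conversion target: simdscalar, simdscalari and the _simd_* calls used throughout this patch are SWR's width-agnostic aliases over the native vector types and intrinsics, so backend code can later be retargeted (for example to AVX-512) without touching every call site. The sketch below is illustrative only; it assumes an AVX build with KNOB_SIMD_WIDTH == 8 and is not the actual SWR simdintrin.h header, whose names and definitions differ in detail.

// Illustrative sketch, not the real simdintrin.h. Build with: g++ -mavx sketch.cpp
#include <immintrin.h>
#include <cstdio>

// Width-agnostic names for the native 8-wide AVX vector types.
typedef __m256  simdscalar;   // 8 x float
typedef __m256i simdscalari;  // 8 x int32

// Thin wrappers mapping the _simd_* names used by the backend onto raw intrinsics.
static inline simdscalar _simd_set1_ps(float f)                        { return _mm256_set1_ps(f); }
static inline simdscalar _simd_unpacklo_ps(simdscalar a, simdscalar b) { return _mm256_unpacklo_ps(a, b); }
static inline simdscalar _simd_unpackhi_ps(simdscalar a, simdscalar b) { return _mm256_unpackhi_ps(a, b); }
static inline simdscalar _simd_castsi_ps(simdscalari a)                { return _mm256_castsi256_ps(a); }

int main()
{
    // Code written against the aliases never names __m256 or _mm256_* directly.
    simdscalar a  = _simd_set1_ps(1.0f);
    simdscalar b  = _simd_set1_ps(2.0f);
    simdscalar lo = _simd_unpacklo_ps(a, b);   // interleaved lanes: 1 2 1 2 | 1 2 1 2

    float out[8];
    _mm256_storeu_ps(out, lo);
    std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
    return 0;
}

Because call sites only ever name the aliases, re-pointing the typedefs and wrappers at a wider ISA recompiles the backend unchanged, which is the motivation for the mechanical renames in this patch.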