From d08493f9cef236af57538d4dd3087277f3a65ad2 Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Tue, 18 Jul 2017 23:52:38 -0500 Subject: [PATCH] swr/rast: fix USE_SIMD16_FRONTEND issues Fix problems found when enabling USE_SIMD16_FRONTEND, mostly related to vMask / movemask_ps(pd). Reviewed-by: Bruce Cherniak --- .../swr/rasterizer/common/simd16intrin.h | 14 ++----------- .../swr/rasterizer/common/simdintrin.h | 21 +------------------ .../swr/rasterizer/common/simdlib_128_avx.inl | 15 +++++++++++++ .../swr/rasterizer/common/simdlib_256_avx.inl | 10 +++++++++ .../rasterizer/common/simdlib_512_avx512.inl | 4 ++-- .../common/simdlib_512_avx512_knights.inl | 21 ------------------- .../swr/rasterizer/common/simdlib_512_emu.inl | 12 +++++------ .../drivers/swr/rasterizer/core/backend.cpp | 2 +- .../swr/rasterizer/core/backend_impl.h | 8 +++---- .../swr/rasterizer/core/backend_sample.cpp | 2 +- .../rasterizer/core/backend_singlesample.cpp | 2 +- .../drivers/swr/rasterizer/core/clip.h | 6 +++--- .../drivers/swr/rasterizer/core/frontend.cpp | 2 +- src/gallium/drivers/swr/rasterizer/core/pa.h | 4 +++- 14 files changed, 49 insertions(+), 74 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h index a160ca2c5ed..019b26d8cfb 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h @@ -159,20 +159,10 @@ typedef SIMD512 SIMD16; #define _simd16_packus_epi32 SIMD16::packus_epi32 #define _simd16_packs_epi32 SIMD16::packs_epi32 #define _simd16_cmplt_ps_mask SIMD16::cmp_ps_mask +#define _simd16_cmpeq_ps_mask SIMD16::cmp_ps_mask #define _simd16_int2mask(mask) simd16mask(mask) #define _simd16_mask2int(mask) int(mask) - -// convert bitmask to vector mask -SIMDINLINE simd16scalar vMask16(int32_t mask) -{ - simd16scalari temp = _simd16_set1_epi32(mask); - - simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001); - - simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits)); - - return _simd16_castsi_ps(result); -} +#define _simd16_vmask_ps SIMD16::vmask_ps #endif//ENABLE_AVX512_SIMD16 diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h index f95c109e6fe..f4b9e1055ce 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h @@ -181,6 +181,7 @@ typedef SIMD256 SIMD; #define _simd_storeu2_si SIMD::storeu2_si #define _simd_blendv_epi32 SIMD::blendv_epi32 +#define _simd_vmask_ps SIMD::vmask_ps template SIMDINLINE SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer a, SIMD128::Integer b) @@ -188,26 +189,6 @@ SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer a, SIMD128::Integer b) return SIMD128::castps_si(SIMD128::blend_ps(SIMD128::castsi_ps(a), SIMD128::castsi_ps(b))); } -// convert bitmask to vector mask -SIMDINLINE -SIMD256::Float vMask(int32_t mask) -{ - SIMD256::Integer vec = SIMD256::set1_epi32(mask); - const SIMD256::Integer bit = SIMD256::set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); - vec = SIMD256::and_si(vec, bit); - vec = SIMD256::cmplt_epi32(SIMD256::setzero_si(), vec); - return SIMD256::castsi_ps(vec); -} - -SIMDINLINE -SIMD256::Integer vMaski(int32_t mask) -{ - SIMD256::Integer vec = SIMD256::set1_epi32(mask); - const SIMD256::Integer bit = SIMD256::set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); - vec = SIMD256::and_si(vec, bit); - return SIMD256::cmplt_epi32(SIMD256::setzero_si(), vec); -} - SIMDINLINE void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int slane) { diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl index 5bcedf39713..72327918937 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl @@ -519,6 +519,11 @@ static SIMDINLINE Float SIMDCALL set_ps(float in3, float in2, float in1, float i return _mm_set_ps(in3, in2, in1, in0); } +static SIMDINLINE Integer SIMDCALL set_epi32(int in3, int in2, int in1, int in0) +{ + return _mm_set_epi32(in3, in2, in1, in0); +} + template static SIMDINLINE float SIMDCALL extract_ps(Float a) { @@ -526,6 +531,16 @@ static SIMDINLINE float SIMDCALL extract_ps(Float a) return *reinterpret_cast(&tmp); } +static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask) +{ + Integer vec = set1_epi32(mask); + const Integer bit = set_epi32( + 0x08, 0x04, 0x02, 0x01); + vec = and_si(vec, bit); + vec = cmplt_epi32(setzero_si(), vec); + return castsi_ps(vec); +} + #undef SIMD_WRAPPER_1 #undef SIMD_WRAPPER_2 #undef SIMD_DWRAPPER_2 diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl index 16eb5217cba..77086119e2f 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl @@ -741,6 +741,16 @@ static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer *phi, SIMD128Imp _mm256_storeu2_m128i(&phi->v, &plo->v, src); } +static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask) +{ + Integer vec = set1_epi32(mask); + const Integer bit = set_epi32( + 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); + vec = and_si(vec, bit); + vec = cmplt_epi32(setzero_si(), vec); + return castsi_ps(vec); +} + #undef SIMD_WRAPPER_1 #undef SIMD_WRAPPER_2 #undef SIMD_DWRAPPER_2 diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl index 1f93da7345f..1001417704d 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl @@ -554,12 +554,12 @@ static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a) static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a) { - __mmask8 m = _mm512_cmplt_pd_mask(a, setzero_pd()); + __mmask8 m = _mm512_test_epi64_mask(castpd_si(a), set1_epi32(-1)); return static_cast(m); } static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a) { - __mmask16 m = _mm512_cmplt_ps_mask(a, setzero_ps()); + __mmask16 m = _mm512_test_epi32_mask(castps_si(a), set1_epi32(-1)); return static_cast(m); } diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl index 310f1540065..17001be0674 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl @@ -29,9 +29,6 @@ // //============================================================================ -static const int TARGET_SIMD_WIDTH = 16; -using SIMD256T = SIMD256Impl::AVX2Impl; - #define SIMD_WRAPPER_1_(op, intrin) \ static SIMDINLINE Float SIMDCALL op(Float a) \ {\ @@ -135,24 +132,6 @@ using SIMD256T = SIMD256Impl::AVX2Impl; } #define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op) -private: - static SIMDINLINE Integer vmask(__mmask8 m) - { - return _mm512_maskz_set1_epi64(m, -1LL); - } - static SIMDINLINE Integer vmask(__mmask16 m) - { - return _mm512_maskz_set1_epi32(m, -1); - } - static SIMDINLINE Integer vmask(__mmask32 m) - { - return _mm512_maskz_set1_epi16(m, -1); - } - static SIMDINLINE Integer vmask(__mmask64 m) - { - return _mm512_maskz_set1_epi8(m, -1); - } - public: SIMD_WRAPPERI_2_(and_ps, and_epi32); // return a & b (float treated as int) SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32); // return (~a) & b (float treated as int) diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl index a45429f4b6b..c414d75d42e 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl @@ -821,13 +821,11 @@ static SIMDINLINE Float SIMDCALL set_ps( static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask) { - Integer vec = set1_epi32(mask); - const Integer bit = set_epi32( - 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, - 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); - vec = and_si(vec, bit); - vec = cmplt_epi32(setzero_si(), vec); - return castsi_ps(vec); + return Float + { + SIMD256T::vmask_ps(mask), + SIMD256T::vmask_ps(mask >> TARGET_SIMD_WIDTH) + }; } #undef SIMD_WRAPPER_1 diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp index fe11cdfd2f9..363349f6c83 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp @@ -277,7 +277,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample); } - simdscalar vCoverageMask = vMask(coverageMask); + simdscalar vCoverageMask = _simd_vmask_ps(coverageMask); simdscalar stencilPassMask = vCoverageMask; AR_BEGIN(BEEarlyDepthTest, pDC->drawId); diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h index b6a86b59ecb..97ca0ef1ef9 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h @@ -576,7 +576,7 @@ struct PixelRateZTestLoop for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++) { const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample]; - vCoverageMask[sample] = _simd_and_ps(activeLanes, vMask(pCoverageMask[currentSimdIn8x8] & MASK)); + vCoverageMask[sample] = _simd_and_ps(activeLanes, _simd_vmask_ps(pCoverageMask[currentSimdIn8x8] & MASK)); if(!_simd_movemask_ps(vCoverageMask[sample])) { @@ -597,7 +597,7 @@ struct PixelRateZTestLoop const float minz = state.depthBoundsState.depthBoundsTestMinValue; const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; - vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(CalcDepthBoundsAcceptMask(z, minz, maxz))); + vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(CalcDepthBoundsAcceptMask(z, minz, maxz))); } AR_BEGIN(BEBarycentric, pDC->drawId); @@ -630,7 +630,7 @@ struct PixelRateZTestLoop { uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample); - vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask)); + vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(~clipMask)); } // ZTest for this sample @@ -907,7 +907,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t #endif simdscalar activeLanes; if(!(work.anyCoveredSamples & MASK)) {goto Endtile;}; - activeLanes = vMask(work.anyCoveredSamples & MASK); + activeLanes = _simd_vmask_ps(work.anyCoveredSamples & MASK); if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) { diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp index d81352aee57..bb2e9a9f631 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp @@ -133,7 +133,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample); } - simdscalar vCoverageMask = vMask(coverageMask); + simdscalar vCoverageMask = _simd_vmask_ps(coverageMask); simdscalar depthPassMask = vCoverageMask; simdscalar stencilPassMask = vCoverageMask; diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp index 34875d342d5..18f4299f514 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp @@ -117,7 +117,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center); } - simdscalar vCoverageMask = vMask(coverageMask); + simdscalar vCoverageMask = _simd_vmask_ps(coverageMask); simdscalar depthPassMask = vCoverageMask; simdscalar stencilPassMask = vCoverageMask; diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index 36c84025957..bf16792a0a6 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -1013,7 +1013,7 @@ public: AR_BEGIN(FEGuardbandClip, pa.pDC->drawId); // we have to clip tris, execute the clipper, which will also // call the binner - ClipSimd(vMask(primMask), vMask(clipMask), pa, primId); + ClipSimd(_simd_vmask_ps(primMask), _simd_vmask_ps(clipMask), pa, primId); AR_END(FEGuardbandClip, 1); } else if (validMask) @@ -1081,7 +1081,7 @@ public: // cull prims outside view frustum simd16scalar clipIntersection = ComputeClipCodeIntersection_simd16(); - int validMask = primMask & _simd16_movemask_ps(_simd16_cmpeq_ps(clipIntersection, _simd16_setzero_ps())); + int validMask = primMask & _simd16_cmpeq_ps_mask(clipIntersection, _simd16_setzero_ps()); // skip clipping for points uint32_t clipMask = 0; @@ -1095,7 +1095,7 @@ public: AR_BEGIN(FEGuardbandClip, pa.pDC->drawId); // we have to clip tris, execute the clipper, which will also // call the binner - ClipSimd(vMask(primMask), vMask(clipMask), pa, primId); + ClipSimd(_simd16_vmask_ps(primMask), _simd16_vmask_ps(clipMask), pa, primId); AR_END(FEGuardbandClip, 1); } else if (validMask) diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index 8796878c586..f9eda839cc3 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -481,7 +481,7 @@ static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining) { uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining; uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0; - return _simd_castps_si(vMask(mask)); + return _simd_castps_si(_simd_vmask_ps(mask)); } ////////////////////////////////////////////////////////////////////////// diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h index d2e61096d14..4bb3236a638 100644 --- a/src/gallium/drivers/swr/rasterizer/core/pa.h +++ b/src/gallium/drivers/swr/rasterizer/core/pa.h @@ -703,7 +703,9 @@ struct PA_STATE_CUT : public PA_STATE #if USE_SIMD16_FRONTEND simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1); - verts[v].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0); + // Assigning to a temporary first to avoid an MSVC 2017 compiler bug + simdscalar t = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0); + verts[v].v[c] = t; #else verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1); #endif -- 2.30.2