From a7fa0cc0a5b812c0732a0a7e05888b4dd37d55b3 Mon Sep 17 00:00:00 2001 From: Alok Hota Date: Thu, 13 Sep 2018 16:12:12 -0500 Subject: [PATCH] swr/rast: simdlib cleanup, clipper stack space fixes Reduce stack space used by clipper, which had lead to crashes in some versions for MSVC Reviewed-by: Bruce Cherniak --- .../swr/rasterizer/common/simdlib_128_avx.inl | 4 +- .../rasterizer/common/simdlib_128_avx2.inl | 4 +- .../rasterizer/common/simdlib_128_avx512.inl | 4 +- .../swr/rasterizer/common/simdlib_256_avx.inl | 18 +- .../rasterizer/common/simdlib_256_avx2.inl | 4 +- .../rasterizer/common/simdlib_256_avx512.inl | 4 +- .../rasterizer/common/simdlib_512_avx512.inl | 6 +- .../swr/rasterizer/common/simdlib_512_emu.inl | 24 ++- .../rasterizer/common/simdlib_interface.hpp | 2 +- .../drivers/swr/rasterizer/core/clip.cpp | 6 - .../drivers/swr/rasterizer/core/clip.h | 175 +++++++----------- .../drivers/swr/rasterizer/core/frontend.cpp | 8 +- .../drivers/swr/rasterizer/core/threads.h | 3 +- 13 files changed, 127 insertions(+), 135 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl index 0c5795cf136..9d190bc6941 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl @@ -415,7 +415,7 @@ SIMD_WRAPPER_2(unpacklo_ps); //----------------------------------------------------------------------- // Load / store operations //----------------------------------------------------------------------- -template +template static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) { @@ -462,7 +462,7 @@ static SIMDINLINE Integer SIMDCALL } // for each element: (mask & (1 << 31)) ? (i32gather_ps(p, idx), mask = 0) : old -template +template static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask) { diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl index 35f9175ea46..0da66ebb56c 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl @@ -48,7 +48,7 @@ static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return return _mm_srlv_epi32(vA, vB); } -template +template static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) { @@ -56,7 +56,7 @@ static SIMDINLINE Float SIMDCALL } // for each element: (mask & (1 << 31)) ? (i32gather_ps(p, idx), mask = 0) : old -template +template static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask) { diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl index 2ce3caa582f..b076daa080a 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl @@ -276,7 +276,7 @@ static SIMDINLINE Integer SIMDCALL return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xf), p)); } -template +template static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) { @@ -285,7 +285,7 @@ static SIMDINLINE Float SIMDCALL } // for each element: (mask & (1 << 31)) ? (i32gather_ps(p, idx), mask = 0) : old -template +template static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask) { diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl index 4ac0f95a468..232f43faec7 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl @@ -574,7 +574,7 @@ SIMD_WRAPPER_2(unpacklo_ps); //----------------------------------------------------------------------- // Load / store operations //----------------------------------------------------------------------- -template +template static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) { @@ -591,6 +591,13 @@ static SIMDINLINE Float SIMDCALL return vResult; } +template +static SIMDINLINE Float SIMDCALL +sw_i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) +{ + return i32gather_ps(p, idx); +} + static SIMDINLINE Float SIMDCALL load1_ps(float const* p) // return *p (broadcast 1 value to all elements) { @@ -621,7 +628,7 @@ static SIMDINLINE Integer SIMDCALL } // for each element: (mask & (1 << 31)) ? (i32gather_ps(p, idx), mask = 0) : old -template +template static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask) { @@ -641,6 +648,13 @@ static SIMDINLINE Float SIMDCALL return vResult; } +template +static SIMDINLINE Float SIMDCALL +sw_mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask) +{ + return mask_i32gather_ps(old, p, idx, mask); +} + static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src) { _mm256_maskstore_ps(p, mask, src); diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl index 59a61cf9263..49650d52442 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl @@ -206,7 +206,7 @@ SIMD_IWRAPPER_2(unpacklo_epi8); //----------------------------------------------------------------------- // Load / store operations //----------------------------------------------------------------------- -template +template static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) { @@ -214,7 +214,7 @@ static SIMDINLINE Float SIMDCALL } // for each element: (mask & (1 << 31)) ? (i32gather_ps(p, idx), mask = 0) : old -template +template static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask) { diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl index 790609861e5..4c883b11a25 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl @@ -277,7 +277,7 @@ static SIMDINLINE Integer SIMDCALL return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xff), p)); } -template +template static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) { @@ -286,7 +286,7 @@ static SIMDINLINE Float SIMDCALL } // for each element: (mask & (1 << 31)) ? (i32gather_ps(p, idx), mask = 0) : old -template +template static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask) { diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl index e9e908ac3c6..5053275e8d6 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl @@ -492,7 +492,7 @@ SIMD_WRAPPER_2(unpacklo_ps); //----------------------------------------------------------------------- // Load / store operations //----------------------------------------------------------------------- -template +template static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) { @@ -529,11 +529,11 @@ static SIMDINLINE Integer SIMDCALL } // for each element: (mask & (1 << 31)) ? (i32gather_ps(p, idx), mask = 0) : old -template +template static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask) { - __mmask16 k = _mm512_cmpneq_ps_mask(mask, setzero_ps()); + __mmask16 k = _mm512_test_epi32_mask(castps_si(mask), set1_epi32(0x80000000)); return _mm512_mask_i32gather_ps(old, k, idx, p, static_cast(ScaleT)); } diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl index 91705f2646d..f25d834725c 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl @@ -631,7 +631,7 @@ SIMD_WRAPPER_2(unpacklo_ps); //----------------------------------------------------------------------- // Load / store operations //----------------------------------------------------------------------- -template +template static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) { @@ -641,6 +641,16 @@ static SIMDINLINE Float SIMDCALL }; } +template +static SIMDINLINE Float SIMDCALL + sw_i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) +{ + return Float{ + SIMD256T::template sw_i32gather_ps(p, idx.v8[0]), + SIMD256T::template sw_i32gather_ps(p, idx.v8[1]), + }; +} + static SIMDINLINE Float SIMDCALL load1_ps(float const* p) // return *p (broadcast 1 value to all elements) { @@ -677,7 +687,7 @@ static SIMDINLINE Integer SIMDCALL } // for each element: (mask & (1 << 31)) ? (i32gather_ps(p, idx), mask = 0) : old -template +template static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask) { @@ -687,6 +697,16 @@ static SIMDINLINE Float SIMDCALL }; } +template +static SIMDINLINE Float SIMDCALL + sw_mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask) +{ + return Float{ + SIMD256T::template sw_mask_i32gather_ps(old.v8[0], p, idx.v8[0], mask.v8[0]), + SIMD256T::template sw_mask_i32gather_ps(old.v8[1], p, idx.v8[1], mask.v8[1]), + }; +} + static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src) { SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]); diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp index 7902bcb2b64..85c722c92c0 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp @@ -273,7 +273,7 @@ struct SIMD256 // or SIMD4 or SIMD16 SF_8, // Scale offset by 8 }; - template + template static Float i32gather_ps(float const* p, Integer idx); // return *(float*)(((int8*)p) + (idx * ScaleT)) static Float load1_ps(float const *p); // return *p (broadcast 1 value to all elements) static Float load_ps(float const *p); // return *p (loads SIMD width elements from memory) diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp index 8c53fca6432..87be5bc119b 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp @@ -31,12 +31,6 @@ #include "common/os.h" #include "core/clip.h" -// Temp storage used by the clipper -THREAD SIMDVERTEX_T tlsTempVertices[7]; -#if USE_SIMD16_FRONTEND -THREAD SIMDVERTEX_T tlsTempVertices_simd16[7]; -#endif - float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1) { return (boundaryCoord0 / (boundaryCoord0 - boundaryCoord1)); diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index 7b4ed58c3fa..33c16538fd9 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -32,12 +32,6 @@ #include "core/pa.h" #include "rdtsc_core.h" -// Temp storage used by the clipper -extern THREAD SIMDVERTEX_T tlsTempVertices[7]; -#if USE_SIMD16_FRONTEND -extern THREAD SIMDVERTEX_T tlsTempVertices_simd16[7]; -#endif - enum SWR_CLIPCODES { // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare. @@ -314,41 +308,36 @@ struct SimdHelper return SIMD512::cmp_ps_mask(a, b); } }; - #endif -// Temp storage used by the clipper -template -struct ClipHelper -{ -}; - -template <> -struct ClipHelper -{ - static SIMDVERTEX_T* GetTempVertices() { return tlsTempVertices; } -}; - -#if USE_SIMD16_FRONTEND -template <> -struct ClipHelper -{ - static SIMDVERTEX_T* GetTempVertices() { return tlsTempVertices_simd16; } -}; -#endif -template +template class Clipper { public: INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) : workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC)) { - static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim"); + static_assert(NumVertsPerPrimT >= 1 && NumVertsPerPrimT <= 3, "Invalid NumVertsPerPrim"); + THREAD_DATA &thread_data = in_pDC->pContext->threadPool.pThreadData[workerId]; + + if (thread_data.clipperData == nullptr) + { + // 7 vertex temp data + // 7 post-clipped vertices + // 2 transposed verts for binning + size_t alloc_size = sizeof(SIMDVERTEX_T) * (7 + 7 + 2); + thread_data.clipperData = AlignedMalloc(alloc_size, KNOB_SIMD16_BYTES); + } + SWR_ASSERT(thread_data.clipperData); + + this->clippedVerts = (SIMDVERTEX_T*)thread_data.clipperData; + this->tmpVerts = this->clippedVerts + 7; + this->transposedVerts = this->tmpVerts + 7; } void ComputeClipCodes(Vec4 vertex[], const Integer& viewportIndexes) { - for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + for (uint32_t i = 0; i < NumVertsPerPrimT; ++i) { ::ComputeClipCodes(state, vertex[i], clipCodes[i], viewportIndexes); } @@ -358,7 +347,7 @@ public: { Float result = clipCodes[0]; - for (uint32_t i = 1; i < NumVertsPerPrim; ++i) + for (uint32_t i = 1; i < NumVertsPerPrimT; ++i) { result = SIMD_T::and_ps(result, clipCodes[i]); } @@ -370,7 +359,7 @@ public: { Float result = clipCodes[0]; - for (uint32_t i = 1; i < NumVertsPerPrim; ++i) + for (uint32_t i = 1; i < NumVertsPerPrimT; ++i) { result = SIMD_T::or_ps(result, clipCodes[i]); } @@ -393,7 +382,7 @@ public: { Float vNanMask = SIMD_T::setzero_ps(); - for (uint32_t e = 0; e < NumVertsPerPrim; ++e) + for (uint32_t e = 0; e < NumVertsPerPrimT; ++e) { Float vNan01 = SIMD_T::template cmp_ps(prim[e].v[0], prim[e].v[1]); @@ -428,7 +417,7 @@ public: uint32_t component = index & 0x3; Float vCullMaskElem = SIMD_T::set1_ps(-1.0f); - for (uint32_t e = 0; e < NumVertsPerPrim; ++e) + for (uint32_t e = 0; e < NumVertsPerPrimT; ++e) { Float vCullComp; if (slot == 0) @@ -457,7 +446,7 @@ public: uint32_t component = index & 0x3; Float vCullMaskElem = SIMD_T::set1_ps(-1.0f); - for (uint32_t e = 0; e < NumVertsPerPrim; ++e) + for (uint32_t e = 0; e < NumVertsPerPrimT; ++e) { Float vClipComp; if (slot == 0) @@ -491,7 +480,7 @@ public: const Integer& vRtIdx) { // input/output vertex store for clipper - SIMDVERTEX_T vertices[7]; // maximum 7 verts generated per triangle + SIMDVERTEX_T* vertices = this->clippedVerts; uint32_t constantInterpMask = state.backendState.constantInterpolationMask; uint32_t provokingVertex = 0; @@ -502,8 +491,8 @@ public: ///@todo: line topology for wireframe? // assemble pos - Vec4 tmpVector[NumVertsPerPrim]; - for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + Vec4 tmpVector[NumVertsPerPrimT]; + for (uint32_t i = 0; i < NumVertsPerPrimT; ++i) { vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i]; } @@ -526,14 +515,14 @@ public: // vertex values to all edges if (CheckBit(constantInterpMask, slot)) { - for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + for (uint32_t i = 0; i < NumVertsPerPrimT; ++i) { vertices[i].attrib[inputSlot] = tmpVector[provokingVertex]; } } else { - for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + for (uint32_t i = 0; i < NumVertsPerPrimT; ++i) { vertices[i].attrib[inputSlot] = tmpVector[i]; } @@ -545,7 +534,7 @@ public: if (state.backendState.clipDistanceMask & 0xf) { pa.Assemble(vertexClipCullSlot, tmpVector); - for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + for (uint32_t i = 0; i < NumVertsPerPrimT; ++i) { vertices[i].attrib[vertexClipCullSlot] = tmpVector[i]; } @@ -554,7 +543,7 @@ public: if (state.backendState.clipDistanceMask & 0xf0) { pa.Assemble(vertexClipCullSlot + 1, tmpVector); - for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + for (uint32_t i = 0; i < NumVertsPerPrimT; ++i) { vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i]; } @@ -565,12 +554,12 @@ public: Integer vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs); - BinnerChooser binner(NumVertsPerPrim, + BinnerChooser binner(NumVertsPerPrimT, pa.pDC->pState->state.rastState.conservativeRast); // set up new PA for binning clipped primitives PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN; - if (NumVertsPerPrim == 3) + if (NumVertsPerPrimT == 3) { clipTopology = TOP_TRIANGLE_FAN; @@ -584,7 +573,7 @@ public: clipTopology = TOP_RECT_LIST; } } - else if (NumVertsPerPrim == 2) + else if (NumVertsPerPrimT == 2) { clipTopology = TOP_LINE_LIST; } @@ -614,25 +603,16 @@ public: uint32_t numClippedPrims = 0; - // tranpose clipper output so that each lane's vertices are in SIMD order + // transpose clipper output so that each lane's vertices are in SIMD order // set aside space for 2 vertices, as the PA will try to read up to 16 verts // for triangle fan + SIMDVERTEX_T* transposedPrims = this->transposedVerts; -#if defined(_DEBUG) - // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack - // overflow in debug builds - SIMDVERTEX_T* transposedPrims = reinterpret_cast*>( - AlignedMalloc(sizeof(SIMDVERTEX_T) * 2, 64)); - -#else - SIMDVERTEX_T transposedPrims[2]; - -#endif uint32_t numInputPrims = pa.NumPrims(); for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim) { uint32_t numEmittedVerts = pVertexCount[inputPrim]; - if (numEmittedVerts < NumVertsPerPrim) + if (numEmittedVerts < NumVertsPerPrimT) { continue; } @@ -648,27 +628,23 @@ public: // for triangle fan // transpose pos - uint8_t* pBase = reinterpret_cast(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + - sizeof(float) * inputPrim; - -#if 0 - // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug - static const float *dummy = reinterpret_cast(pBase); + float const* pBase = + reinterpret_cast(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + + inputPrim; -#endif for (uint32_t c = 0; c < 4; ++c) { - SIMD256::Float temp = SIMD256::template mask_i32gather_ps(1)>( - SIMD256::setzero_ps(), reinterpret_cast(pBase), vOffsets, vMask); + SIMD256::Float temp = + SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask); transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = SimdHelper::insert_lo_ps(temp); - pBase += sizeof(Float); + pBase = PtrAdd(pBase, sizeof(Float)); } // transpose attribs - pBase = - reinterpret_cast(&vertices[0].attrib[backendState.vertexAttribOffset]) + - sizeof(float) * inputPrim; + pBase = reinterpret_cast( + &vertices[0].attrib[backendState.vertexAttribOffset]) + + inputPrim; for (uint32_t attrib = 0; attrib < numAttribs; ++attrib) { @@ -677,14 +653,10 @@ public: for (uint32_t c = 0; c < 4; ++c) { SIMD256::Float temp = - SIMD256::template mask_i32gather_ps(1)>( - SIMD256::setzero_ps(), - reinterpret_cast(pBase), - vOffsets, - vMask); + SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask); transposedPrims[0].attrib[attribSlot][c] = SimdHelper::insert_lo_ps(temp); - pBase += sizeof(Float); + pBase = PtrAdd(pBase, sizeof(Float)); } } @@ -692,39 +664,32 @@ public: uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset; if (state.backendState.clipDistanceMask & 0x0f) { - pBase = reinterpret_cast(&vertices[0].attrib[vertexClipCullSlot]) + - sizeof(float) * inputPrim; + pBase = reinterpret_cast(&vertices[0].attrib[vertexClipCullSlot]) + + inputPrim; for (uint32_t c = 0; c < 4; ++c) { SIMD256::Float temp = - SIMD256::template mask_i32gather_ps(1)>( - SIMD256::setzero_ps(), - reinterpret_cast(pBase), - vOffsets, - vMask); + SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask); transposedPrims[0].attrib[vertexClipCullSlot][c] = SimdHelper::insert_lo_ps(temp); - pBase += sizeof(Float); + pBase = PtrAdd(pBase, sizeof(Float)); } } if (state.backendState.clipDistanceMask & 0xf0) { - pBase = reinterpret_cast(&vertices[0].attrib[vertexClipCullSlot + 1]) + - sizeof(float) * inputPrim; + pBase = + reinterpret_cast(&vertices[0].attrib[vertexClipCullSlot + 1]) + + inputPrim; for (uint32_t c = 0; c < 4; ++c) { SIMD256::Float temp = - SIMD256::template mask_i32gather_ps(1)>( - SIMD256::setzero_ps(), - reinterpret_cast(pBase), - vOffsets, - vMask); + SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask); transposedPrims[0].attrib[vertexClipCullSlot + 1][c] = SimdHelper::insert_lo_ps(temp); - pBase += sizeof(Float); + pBase = PtrAdd(pBase, sizeof(Float)); } } @@ -734,7 +699,7 @@ public: numEmittedVerts, SWR_VTX_NUM_SLOTS, true, - NumVertsPerPrim, + NumVertsPerPrimT, clipTopology); clipPA.viewportArrayActive = pa.viewportArrayActive; clipPA.rtArrayActive = pa.rtArrayActive; @@ -751,7 +716,7 @@ public: { do { - Vec4 attrib[NumVertsPerPrim]; + Vec4 attrib[NumVertsPerPrimT]; bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib); @@ -765,10 +730,6 @@ public: } } -#if defined(_DEBUG) - AlignedFree(transposedPrims); - -#endif // update global pipeline stat UPDATE_STAT_FE(CPrimitives, numClippedPrims); } @@ -811,7 +772,7 @@ public: // skip clipping for points uint32_t clipMask = 0; - if (NumVertsPerPrim != 1) + if (NumVertsPerPrimT != 1) { clipMask = validMask & ComputeClipMask(); } @@ -905,8 +866,7 @@ private: Integer vOffsets = ComputeOffsets(attrib, vIndices, component); Float vSrc = SIMD_T::setzero_ps(); - return SIMD_T::template mask_i32gather_ps(1)>( - vSrc, pBuffer, vOffsets, vMask); + return SIMD_T::mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask); } void ScatterComponent(const float* pBuffer, @@ -1278,15 +1238,15 @@ private: int numAttribs) { // temp storage - float* pTempVerts = reinterpret_cast(ClipHelper::GetTempVertices()); + float* pTempVerts = reinterpret_cast(this->tmpVerts); // zero out num input verts for non-active lanes - Integer vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim); + Integer vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrimT); vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask); // clip prims to frustum Integer vNumOutPts; - if (NumVertsPerPrim == 3) + if (NumVertsPerPrimT == 3) { vNumOutPts = ClipTriToPlane(pVertices, vNumInPts, numAttribs, pTempVerts); vNumOutPts = ClipTriToPlane(pTempVerts, vNumOutPts, numAttribs, pVertices); @@ -1300,7 +1260,7 @@ private: } else { - SWR_ASSERT(NumVertsPerPrim == 2); + SWR_ASSERT(NumVertsPerPrimT == 2); vNumOutPts = ClipLineToPlane(pVertices, vNumInPts, numAttribs, pTempVerts); vNumOutPts = @@ -1318,7 +1278,7 @@ private: // restore num verts for non-clipped, active lanes Float vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask); vNumOutPts = - SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrim), vNonClippedMask); + SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrimT), vNonClippedMask); return vNumOutPts; } @@ -1326,7 +1286,10 @@ private: const uint32_t workerId{0}; DRAW_CONTEXT* pDC{nullptr}; const API_STATE& state; - Float clipCodes[NumVertsPerPrim]; + Float clipCodes[NumVertsPerPrimT]; + SIMDVERTEX_T* clippedVerts; + SIMDVERTEX_T* tmpVerts; + SIMDVERTEX_T* transposedVerts; }; // pipeline stage functions diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index 24db5275795..6ba6784f518 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -782,19 +782,19 @@ void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t for (uint32_t a = 0; a < numAttribs; ++a) { - auto attribGatherX = SIMD_T::template mask_i32gather_ps(1)>( + auto attribGatherX = SIMD_T::mask_i32gather_ps( SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask); - auto attribGatherY = SIMD_T::template mask_i32gather_ps(1)>( + auto attribGatherY = SIMD_T::mask_i32gather_ps( SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float)), vGatherOffsets, vMask); - auto attribGatherZ = SIMD_T::template mask_i32gather_ps(1)>( + auto attribGatherZ = SIMD_T::mask_i32gather_ps( SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 2), vGatherOffsets, vMask); - auto attribGatherW = SIMD_T::template mask_i32gather_ps(1)>( + auto attribGatherW = SIMD_T::mask_i32gather_ps( SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 3), vGatherOffsets, diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h index d0f4b30dca0..3072bbc835d 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.h +++ b/src/gallium/drivers/swr/rasterizer/core/threads.h @@ -45,7 +45,8 @@ struct THREAD_DATA uint32_t numaId; // NUMA node id uint32_t coreId; // Core id uint32_t htId; // Hyperthread id - uint32_t workerId; + uint32_t workerId; // index of worker in total thread data + void* clipperData; // pointer to hang clipper-private data on SWR_CONTEXT* pContext; bool forceBindProcGroup; // Only useful when MAX_WORKER_THREADS is set. }; -- 2.30.2