From 0b4db4370544459fcd47499d9f8663e421fcae34 Mon Sep 17 00:00:00 2001 From: Alok Hota Date: Tue, 28 Aug 2018 12:23:31 -0500 Subject: [PATCH] swr/rast: FP consistency between POSH/RENDER pipes - Ensure all threads have optimal floating-point control state - Disable auto-generation of fused FP ops for VERTEX shader stage - Disable "fast" FP ops for VERTEX shader stage Reviewed-by: Bruce Cherniak --- .../drivers/swr/rasterizer/common/os.h | 21 +++++++++++++++++++ .../drivers/swr/rasterizer/core/api.cpp | 6 ++---- .../drivers/swr/rasterizer/core/frontend.cpp | 8 +++---- .../drivers/swr/rasterizer/core/threads.cpp | 9 +++++--- 4 files changed, 33 insertions(+), 11 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h index 314d8184374..b00beeb36dd 100644 --- a/src/gallium/drivers/swr/rasterizer/common/os.h +++ b/src/gallium/drivers/swr/rasterizer/common/os.h @@ -294,4 +294,25 @@ int SWR_API std::string* pOptStdErr = nullptr, ///< (Optional Out) Standard Error text const std::string* pOptStdIn = nullptr); ///< (Optional In) Standard Input text + +/// Helper for setting up FP state +/// @returns old csr state +static INLINE uint32_t SetOptimalVectorCSR() +{ + uint32_t oldCSR = _mm_getcsr(); + + uint32_t newCSR = (oldCSR & ~(_MM_ROUND_MASK | _MM_DENORMALS_ZERO_MASK | _MM_FLUSH_ZERO_MASK)); + newCSR |= (_MM_ROUND_NEAREST | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); + _mm_setcsr(newCSR); + + return oldCSR; +} + +/// Set Vector CSR state. +/// @param csrState - should be value returned from SetOptimalVectorCSR() +static INLINE void RestoreVectorCSR(uint32_t csrState) +{ + _mm_setcsr(csrState); +} + #endif //__SWR_OS_H__ diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index acbc7e077b1..203a74bdb24 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -250,9 +250,7 @@ void QueueWork(SWR_CONTEXT* pContext) if (pContext->threadInfo.SINGLE_THREADED) { - // flush denormals to 0 - uint32_t mxcsr = _mm_getcsr(); - _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); + uint32_t mxcsr = SetOptimalVectorCSR(); if (IsDraw) { @@ -274,7 +272,7 @@ void QueueWork(SWR_CONTEXT* pContext) } // restore csr - _mm_setcsr(mxcsr); + RestoreVectorCSR(mxcsr); } else { diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index 8bc97c743eb..24db5275795 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -1840,10 +1840,10 @@ void ProcessDraw(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, vo { vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale); - fetchInfo_lo.xpIndices = - pDC->pContext->pfnMakeGfxPtr(GetPrivateState(pDC), &vIndex); - fetchInfo_hi.xpIndices = - pDC->pContext->pfnMakeGfxPtr(GetPrivateState(pDC), &vIndex + KNOB_SIMD_WIDTH * sizeof(int32_t)); // 1/2 of KNOB_SIMD16_WIDTH + fetchInfo_lo.xpIndices = pDC->pContext->pfnMakeGfxPtr(GetPrivateState(pDC), &vIndex); + fetchInfo_hi.xpIndices = pDC->pContext->pfnMakeGfxPtr( + GetPrivateState(pDC), + &vIndex + KNOB_SIMD_WIDTH * sizeof(int32_t)); // 1/2 of KNOB_SIMD16_WIDTH } fetchInfo_lo.CurInstance = instanceNum; diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index 4523616cba0..e30c1170568 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -421,9 +421,9 @@ INLINE void UpdateClientStats(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CON for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) { stats.DepthPassCount += dynState.pStats[i].DepthPassCount; - stats.PsInvocations += dynState.pStats[i].PsInvocations; stats.CsInvocations += dynState.pStats[i].CsInvocations; + } @@ -439,6 +439,10 @@ INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONT pDC->retireCallback.pfnCallbackFunc(pDC->retireCallback.userData, pDC->retireCallback.userData2, pDC->retireCallback.userData3); + + // Callbacks to external code *could* change floating point control state + // Reset our optimal flags + SetOptimalVectorCSR(); } } @@ -870,8 +874,7 @@ DWORD workerThreadMain(LPVOID pData) uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE; uint32_t numaMask = pContext->threadPool.numaMask; - // flush denormals to 0 - _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); + SetOptimalVectorCSR(); // Track tiles locked by other threads. If we try to lock a macrotile and find its already // locked then we'll add it to this list so that we don't try and lock it again. -- 2.30.2