swr/rast: FP consistency between POSH/RENDER pipes
author Alok Hota <alok.hota@intel.com>
Tue, 28 Aug 2018 17:23:31 +0000 (12:23 -0500)
committer Alok Hota <alok.hota@intel.com>
Fri, 15 Feb 2019 20:54:09 +0000 (14:54 -0600)
- Ensure all threads have optimal floating-point control state
- Disable auto-generation of fused FP ops for VERTEX shader stage
- Disable "fast" FP ops for VERTEX shader stage

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
src/gallium/drivers/swr/rasterizer/common/os.h
src/gallium/drivers/swr/rasterizer/core/api.cpp
src/gallium/drivers/swr/rasterizer/core/frontend.cpp
src/gallium/drivers/swr/rasterizer/core/threads.cpp

index 314d8184374707659828ffb261fe2c09e4609eca..b00beeb36ddbcbe1c982492161ccea504652dbbd 100644 (file)
@@ -294,4 +294,25 @@ int SWR_API
             std::string* pOptStdErr     = nullptr,   ///< (Optional Out) Standard Error text
             const std::string* pOptStdIn = nullptr); ///< (Optional In) Standard Input text
 
+
+/// Helper for setting up FP state: round-to-nearest, flush-to-zero on, denormals-are-zero on.
+/// @returns old csr state (the value read by _mm_getcsr() before any change was made)
+static INLINE uint32_t SetOptimalVectorCSR()
+{
+    uint32_t oldCSR = _mm_getcsr();
+    // Clear the rounding, DAZ and FTZ fields so the values OR'd in below fully replace them.
+    uint32_t newCSR = (oldCSR & ~(_MM_ROUND_MASK | _MM_DENORMALS_ZERO_MASK | _MM_FLUSH_ZERO_MASK));
+    newCSR |= (_MM_ROUND_NEAREST | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
+    _mm_setcsr(newCSR);
+    // Hand back the previous value so the caller can pass it to RestoreVectorCSR().
+    return oldCSR;
+}
+
+/// Set Vector CSR state by writing csrState back with _mm_setcsr().
+/// @param csrState - should be value returned from SetOptimalVectorCSR()
+static INLINE void RestoreVectorCSR(uint32_t csrState)
+{
+    _mm_setcsr(csrState);
+}
+
 #endif //__SWR_OS_H__
index acbc7e077b186900693254019af10b6bb8f894a7..203a74bdb246a36facaf93d20551cc6826e59db8 100644 (file)
@@ -250,9 +250,7 @@ void QueueWork(SWR_CONTEXT* pContext)
 
     if (pContext->threadInfo.SINGLE_THREADED)
     {
-        // flush denormals to 0
-        uint32_t mxcsr = _mm_getcsr();
-        _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
+        uint32_t mxcsr = SetOptimalVectorCSR();
 
         if (IsDraw)
         {
@@ -274,7 +272,7 @@ void QueueWork(SWR_CONTEXT* pContext)
         }
 
         // restore csr
-        _mm_setcsr(mxcsr);
+        RestoreVectorCSR(mxcsr);
     }
     else
     {
index 8bc97c743eb6838cb7c216e2adfc348ce60ce690..24db5275795b4396a4938eb149429775ae703fcc 100644 (file)
@@ -1840,10 +1840,10 @@ void ProcessDraw(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, vo
         {
             vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);
 
-            fetchInfo_lo.xpIndices = 
-                pDC->pContext->pfnMakeGfxPtr(GetPrivateState(pDC), &vIndex);
-            fetchInfo_hi.xpIndices = 
-                pDC->pContext->pfnMakeGfxPtr(GetPrivateState(pDC), &vIndex + KNOB_SIMD_WIDTH * sizeof(int32_t)); // 1/2 of KNOB_SIMD16_WIDTH
+            fetchInfo_lo.xpIndices = pDC->pContext->pfnMakeGfxPtr(GetPrivateState(pDC), &vIndex);
+            fetchInfo_hi.xpIndices = pDC->pContext->pfnMakeGfxPtr(
+                GetPrivateState(pDC),
+                &vIndex + KNOB_SIMD_WIDTH * sizeof(int32_t)); // 1/2 of KNOB_SIMD16_WIDTH
         }
 
         fetchInfo_lo.CurInstance = instanceNum;
index 4523616cba0bc8742d3b3aa3af8419b6e9cba481..e30c11705688c2d6279d88499052e4556d3fa3bf 100644 (file)
@@ -421,9 +421,9 @@ INLINE void UpdateClientStats(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CON
     for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
     {
         stats.DepthPassCount += dynState.pStats[i].DepthPassCount;
-
         stats.PsInvocations += dynState.pStats[i].PsInvocations;
         stats.CsInvocations += dynState.pStats[i].CsInvocations;
+
     }
 
 
@@ -439,6 +439,10 @@ INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONT
         pDC->retireCallback.pfnCallbackFunc(pDC->retireCallback.userData,
                                             pDC->retireCallback.userData2,
                                             pDC->retireCallback.userData3);
+
+        // Callbacks to external code *could* change floating point control state
+        // Reset our optimal flags
+        SetOptimalVectorCSR();
     }
 }
 
@@ -870,8 +874,7 @@ DWORD workerThreadMain(LPVOID pData)
     uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
     uint32_t numaMask = pContext->threadPool.numaMask;
 
-    // flush denormals to 0
-    _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
+    SetOptimalVectorCSR();
 
     // Track tiles locked by other threads. If we try to lock a macrotile and find its already
     // locked then we'll add it to this list so that we don't try and lock it again.