From 73a9154bdef807330ec3d75a79610532e33edb75 Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Tue, 14 Jun 2016 17:02:11 -0600 Subject: [PATCH] swr: [rasterizer core] use wrap-around safe compares for dependency checking Move drawIDs from 64-bit to 32-bit to increase perf. Reviewed-by: Bruce Cherniak --- .../drivers/swr/rasterizer/core/api.cpp | 4 +- .../drivers/swr/rasterizer/core/context.h | 4 +- .../drivers/swr/rasterizer/core/ringbuffer.h | 8 +-- .../drivers/swr/rasterizer/core/threads.cpp | 54 +++++++++++-------- .../drivers/swr/rasterizer/core/threads.h | 6 +-- .../swr/rasterizer/scripts/knob_defs.py | 5 +- 6 files changed, 45 insertions(+), 36 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index cec451929a3..b63d5474cf3 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -215,13 +215,13 @@ void QueueWork(SWR_CONTEXT *pContext) if (IsDraw) { - uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId }; + uint32_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId }; WorkOnFifoFE(pContext, 0, curDraw[0]); WorkOnFifoBE(pContext, 0, curDraw[1], gSingleThreadLockedTiles, 0, 0); } else { - uint64_t curDispatch = pContext->pCurDrawContext->drawId; + uint32_t curDispatch = pContext->pCurDrawContext->drawId; WorkOnCompute(pContext, 0, curDispatch); } diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index 320435281d8..08eadf41134 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -381,13 +381,13 @@ struct DRAW_STATE struct DRAW_CONTEXT { SWR_CONTEXT* pContext; - uint64_t drawId; + uint32_t drawId; + uint32_t dependency; union { MacroTileMgr* pTileMgr; DispatchQueue* pDispatch; // Queue for thread groups. 
(isCompute) }; - uint64_t dependency; DRAW_STATE* pState; CachingArena* pArena; diff --git a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h index b9076de65fe..97f75c6550e 100644 --- a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h +++ b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h @@ -90,13 +90,13 @@ public: return (numEnqueued == mNumEntries); } - INLINE uint64_t GetTail() volatile { return mRingTail; } - INLINE uint64_t GetHead() volatile { return mRingHead; } + INLINE uint32_t GetTail() volatile { return mRingTail; } + INLINE uint32_t GetHead() volatile { return mRingHead; } protected: T* mpRingBuffer; uint32_t mNumEntries; - OSALIGNLINE(volatile uint64_t) mRingHead; // Consumer Counter - OSALIGNLINE(volatile uint64_t) mRingTail; // Producer Counter + OSALIGNLINE(volatile uint32_t) mRingHead; // Consumer Counter + OSALIGNLINE(volatile uint32_t) mRingTail; // Producer Counter }; diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index 17bf6163bd8..fe164a06fb4 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -294,22 +294,30 @@ void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup= } INLINE -uint64_t GetEnqueuedDraw(SWR_CONTEXT *pContext) +uint32_t GetEnqueuedDraw(SWR_CONTEXT *pContext) { return pContext->dcRing.GetHead(); } INLINE -DRAW_CONTEXT *GetDC(SWR_CONTEXT *pContext, uint64_t drawId) +DRAW_CONTEXT *GetDC(SWR_CONTEXT *pContext, uint32_t drawId) { return &pContext->dcRing[(drawId-1) % KNOB_MAX_DRAWS_IN_FLIGHT]; } +INLINE +bool IDComparesLess(uint32_t a, uint32_t b) +{ + // Use signed delta to ensure that wrap-around to 0 is correctly handled. 
+ int32_t delta = int32_t(a - b); + return (delta < 0); +} + // returns true if dependency not met INLINE -bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint64_t lastRetiredDraw) +bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t lastRetiredDraw) { - return (pDC->dependency > lastRetiredDraw); + return IDComparesLess(lastRetiredDraw, pDC->dependency); } // inlined-only version @@ -345,11 +353,11 @@ int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC) return CompleteDrawContextInl(pContext, pDC); } -INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE, uint64_t& drawEnqueued) +INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint32_t& curDrawBE, uint32_t& drawEnqueued) { // increment our current draw id to the first incomplete draw drawEnqueued = GetEnqueuedDraw(pContext); - while (curDrawBE < drawEnqueued) + while (IDComparesLess(curDrawBE, drawEnqueued)) { DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT]; @@ -372,7 +380,7 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE, } // If there are no more incomplete draws then return false. - return (curDrawBE >= drawEnqueued) ? false : true; + return IDComparesLess(curDrawBE, drawEnqueued); } ////////////////////////////////////////////////////////////////////////// @@ -392,20 +400,20 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE, void WorkOnFifoBE( SWR_CONTEXT *pContext, uint32_t workerId, - uint64_t &curDrawBE, + uint32_t &curDrawBE, TileSet& lockedTiles, uint32_t numaNode, uint32_t numaMask) { // Find the first incomplete draw that has pending work. If no such draw is found then // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE. 
- uint64_t drawEnqueued = 0; + uint32_t drawEnqueued = 0; if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false) { return; } - uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1; + uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1; // Reset our history for locked tiles. We'll have to re-learn which tiles are locked. lockedTiles.clear(); @@ -415,7 +423,7 @@ void WorkOnFifoBE( // 2. If we're trying to work on draws after curDrawBE, we are restricted to // working on those macrotiles that are known to be complete in the prior draw to // maintain order. The locked tiles provides the history to ensures this. - for (uint64_t i = curDrawBE; i < drawEnqueued; ++i) + for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i) { DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT]; @@ -508,11 +516,11 @@ void WorkOnFifoBE( } } -void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE) +void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE) { // Try to grab the next DC from the ring - uint64_t drawEnqueued = GetEnqueuedDraw(pContext); - while (curDrawFE < drawEnqueued) + uint32_t drawEnqueued = GetEnqueuedDraw(pContext); + while (IDComparesLess(curDrawFE, drawEnqueued)) { uint32_t dcSlot = curDrawFE % KNOB_MAX_DRAWS_IN_FLIGHT; DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot]; @@ -527,8 +535,8 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE) } } - uint64_t curDraw = curDrawFE; - while (curDraw < drawEnqueued) + uint32_t curDraw = curDrawFE; + while (IDComparesLess(curDraw, drawEnqueued)) { uint32_t dcSlot = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT; DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot]; @@ -559,17 +567,17 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE) void WorkOnCompute( SWR_CONTEXT *pContext, uint32_t workerId, - uint64_t& curDrawBE) + 
uint32_t& curDrawBE) { - uint64_t drawEnqueued = 0; + uint32_t drawEnqueued = 0; if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false) { return; } - uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1; + uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1; - for (uint64_t i = curDrawBE; curDrawBE < drawEnqueued; ++i) + for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i) { DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT]; if (pDC->isCompute == false) return; @@ -639,10 +647,10 @@ DWORD workerThreadMain(LPVOID pData) // the worker can safely increment its oldestDraw counter and move on to the next draw. std::unique_lock lock(pContext->WaitLock, std::defer_lock); - auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->dcRing.GetHead(); }; + auto threadHasWork = [&](uint32_t curDraw) { return curDraw != pContext->dcRing.GetHead(); }; - uint64_t curDrawBE = 0; - uint64_t curDrawFE = 0; + uint32_t curDrawBE = 0; + uint32_t curDrawFE = 0; while (pContext->threadPool.inThreadShutdown == false) { diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h index 3aba6323a95..e7b4924f0e8 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.h +++ b/src/gallium/drivers/swr/rasterizer/core/threads.h @@ -64,7 +64,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); // Expose FE and BE worker functions to the API thread if single threaded -void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE); -void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask); -void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE); +void WorkOnFifoFE(SWR_CONTEXT *pContext, 
uint32_t workerId, uint32_t &curDrawFE); +void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask); +void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE); int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC); \ No newline at end of file diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py index ab6ec565804..56c3144bfa6 100644 --- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py +++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py @@ -133,8 +133,9 @@ KNOBS = [ ['MAX_DRAWS_IN_FLIGHT', { 'type' : 'uint32_t', - 'default' : '96', - 'desc' : ['Maximum number of draws outstanding before API thread blocks.'], + 'default' : '128', + 'desc' : ['Maximum number of draws outstanding before API thread blocks.', + 'This value MUST be evenly divisible into 2^32'], 'category' : 'perf', }], -- 2.30.2