From b990483de21cd99a94a098bb0eb397ba86b7b2a6 Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Wed, 30 Mar 2016 15:54:48 -0600 Subject: [PATCH] swr: [rasterizer core] Put DRAW_CONTEXT on a diet No need for 256 pointers per DC. Acked-by: Brian Paul --- .../drivers/swr/rasterizer/core/api.cpp | 31 +++++++++++++------ .../drivers/swr/rasterizer/core/backend.cpp | 8 ++--- .../drivers/swr/rasterizer/core/backend.h | 2 +- .../drivers/swr/rasterizer/core/context.h | 16 ++++++---- .../drivers/swr/rasterizer/core/threads.cpp | 8 +++-- .../drivers/swr/rasterizer/core/tilemgr.cpp | 21 ------------- .../drivers/swr/rasterizer/core/tilemgr.h | 6 ---- 7 files changed, 43 insertions(+), 49 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index 665b6c0453f..d0738a7e2e0 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include "core/api.h" #include "core/backend.h" @@ -65,11 +66,14 @@ HANDLE SwrCreateContext( pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT); pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT); + pContext->pMacroTileManagerArray = (MacroTileMgr*)_aligned_malloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64); + pContext->pDispatchQueueArray = (DispatchQueue*)_aligned_malloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64); + for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) { pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); - pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena)); - pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen. + new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena); + new (&pContext->pDispatchQueueArray[dc]) DispatchQueue(); pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); } @@ -143,10 +147,13 @@ void SwrDestroyContext(HANDLE hContext) { delete pContext->dcRing[i].pArena; delete pContext->dsRing[i].pArena; - delete(pContext->dcRing[i].pTileMgr); - delete(pContext->dcRing[i].pDispatch); + pContext->pMacroTileManagerArray[i].~MacroTileMgr(); + pContext->pDispatchQueueArray[i].~DispatchQueue(); } + _aligned_free(pContext->pDispatchQueueArray); + _aligned_free(pContext->pMacroTileManagerArray); + // Free scratch space. for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) { @@ -176,6 +183,15 @@ void WakeAllThreads(SWR_CONTEXT *pContext) template void QueueWork(SWR_CONTEXT *pContext) { + DRAW_CONTEXT* pDC = pContext->pCurDrawContext; + uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT; + + if (IsDraw) + { + pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex]; + pDC->pTileMgr->initialize(); + } + // Each worker thread looks at a DC for both FE and BE work at different times and so we // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and @@ -299,8 +315,6 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) pCurDrawContext->FeLock = 0; pCurDrawContext->threadsDone = 0; - pCurDrawContext->pTileMgr->initialize(); - // Assign unique drawId for this DC pCurDrawContext->drawId = pContext->dcRing.GetHead(); @@ -1368,9 +1382,6 @@ void SwrDispatch( pDC->isCompute = true; // This is a compute context. - // Ensure spill fill pointers are initialized to nullptr. - memset(pDC->pSpillFill, 0, sizeof(pDC->pSpillFill)); - COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64); pTaskData->threadGroupCountX = threadGroupCountX; @@ -1378,6 +1389,8 @@ void SwrDispatch( pTaskData->threadGroupCountZ = threadGroupCountZ; uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ; + uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT; + pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex]; pDC->pDispatch->initialize(totalThreadGroups, pTaskData); QueueDispatch(pContext); diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp index 842ea326e68..b2d3d9ef4f4 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp @@ -70,7 +70,7 @@ static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS]; /// @param pDC - pointer to draw context (dispatch). /// @param workerId - The unique worker ID that is assigned to this thread. /// @param threadGroupId - the linear index for the thread group within the dispatch. -void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId) +void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer) { RDTSC_START(BEDispatch); @@ -80,10 +80,10 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup SWR_ASSERT(pTaskData != nullptr); // Ensure spill fill memory has been allocated. - if (pDC->pSpillFill[workerId] == nullptr) + if (pSpillFillBuffer == nullptr) { ///@todo Add state which indicates the spill fill size. - pDC->pSpillFill[workerId] = (uint8_t*)pDC->pArena->AllocAlignedSync(4 * sizeof(MEGABYTE), sizeof(float) * 8); + pSpillFillBuffer = pDC->pArena->AllocAlignedSync(4 * sizeof(MEGABYTE), sizeof(float) * 8); } const API_STATE& state = GetApiState(pDC); @@ -94,7 +94,7 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup csContext.dispatchDims[1] = pTaskData->threadGroupCountY; csContext.dispatchDims[2] = pTaskData->threadGroupCountZ; csContext.pTGSM = pContext->pScratch[workerId]; - csContext.pSpillFillBuffer = pDC->pSpillFill[workerId]; + csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer; state.pfnCsFunc(GetPrivateState(pDC), &csContext); diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h index 2fa18953cad..d0626b997af 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend.h @@ -32,7 +32,7 @@ #include "core/context.h" #include "core/multisample.h" -void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId); +void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer); void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index 2c28286b5ad..660c86e1194 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -384,8 +384,11 @@ struct DRAW_CONTEXT { SWR_CONTEXT* pContext; uint64_t drawId; - MacroTileMgr* pTileMgr; - DispatchQueue* pDispatch; // Queue for thread groups. (isCompute) + union + { + MacroTileMgr* pTileMgr; + DispatchQueue* pDispatch; // Queue for thread groups. (isCompute) + }; uint64_t dependency; DRAW_STATE* pState; CachingArena* pArena; @@ -394,12 +397,10 @@ struct DRAW_CONTEXT bool cleanupState; // True if this is the last draw using an entry in the state ring. volatile bool doneFE; // Is FE work done for this draw? + FE_WORK FeWork; + volatile OSALIGNLINE(uint32_t) FeLock; volatile int64_t threadsDone; - - OSALIGNLINE(FE_WORK) FeWork; - uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS]; // Scratch space used for spill fills. - }; static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT"); @@ -445,6 +446,9 @@ struct SWR_CONTEXT DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw. DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from. + MacroTileMgr* pMacroTileManagerArray; + DispatchQueue* pDispatchQueueArray; + // Draw State Ring // When draw are very large (lots of primitives) then the API thread will break these up. // These split draws all have identical state. So instead of storing the state directly diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index bee1e138002..4b7a207f366 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -291,7 +291,10 @@ INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC) { // Cleanup memory allocations pDC->pArena->Reset(true); - pDC->pTileMgr->initialize(); + if (!pDC->isCompute) + { + pDC->pTileMgr->initialize(); + } if (pDC->cleanupState) { pDC->pState->pArena->Reset(true); @@ -546,10 +549,11 @@ void WorkOnCompute( // Is there any work remaining? if (queue.getNumQueued() > 0) { + void* pSpillFillBuffer = nullptr; uint32_t threadGroupId = 0; while (queue.getWork(threadGroupId)) { - ProcessComputeBE(pDC, workerId, threadGroupId); + ProcessComputeBE(pDC, workerId, threadGroupId, pSpillFillBuffer); queue.finishedWork(); } diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp index 794577270cf..c053e27f9a7 100644 --- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp @@ -35,27 +35,6 @@ #define TILE_ID(x,y) ((x << 16 | y)) -// override new/delete for alignment -void *MacroTileMgr::operator new(size_t size) -{ - return _aligned_malloc(size, 64); -} - -void MacroTileMgr::operator delete(void *p) -{ - _aligned_free(p); -} - -void* DispatchQueue::operator new(size_t size) -{ - return _aligned_malloc(size, 64); -} - -void DispatchQueue::operator delete(void *p) -{ - _aligned_free(p); -} - MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena) { } diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h index 34992aaea75..82a15e16a33 100644 --- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h +++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h @@ -140,9 +140,6 @@ public: x = (tileID >> 16) & 0xffff; } - void *operator new(size_t size); - void operator delete (void *p); - private: CachingArena& mArena; std::unordered_map mTiles; @@ -229,9 +226,6 @@ public: return mpTaskData; } - void *operator new(size_t size); - void operator delete (void *p); - void* mpTaskData{ nullptr }; // The API thread will set this up and the callback task function will interpet this. OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 }; -- 2.30.2