From 92ec820244710e1b13267d8e93f3a81d7114080e Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Mon, 12 Sep 2016 13:08:12 -0500 Subject: [PATCH] swr: [rasterizer core] Better thread destruction Signed-off-by: Tim Rowley --- .../drivers/swr/rasterizer/core/api.cpp | 88 ++++++++++--------- .../drivers/swr/rasterizer/core/backend.cpp | 10 +++ .../drivers/swr/rasterizer/core/backend.h | 1 + .../drivers/swr/rasterizer/core/context.h | 1 + .../drivers/swr/rasterizer/core/frontend.cpp | 30 +++++++ .../drivers/swr/rasterizer/core/frontend.h | 1 + .../drivers/swr/rasterizer/core/threads.cpp | 61 +++++++------ .../drivers/swr/rasterizer/core/threads.h | 3 +- 8 files changed, 126 insertions(+), 69 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index df87d14ca3e..703f239cc01 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -157,46 +157,6 @@ HANDLE SwrCreateContext( return (HANDLE)pContext; } -void SwrDestroyContext(HANDLE hContext) -{ - SWR_CONTEXT *pContext = GetContext(hContext); - DestroyThreadPool(pContext, &pContext->threadPool); - - // free the fifos - for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i) - { - delete [] pContext->dcRing[i].dynState.pStats; - delete pContext->dcRing[i].pArena; - delete pContext->dsRing[i].pArena; - pContext->pMacroTileManagerArray[i].~MacroTileMgr(); - pContext->pDispatchQueueArray[i].~DispatchQueue(); - } - - AlignedFree(pContext->pDispatchQueueArray); - AlignedFree(pContext->pMacroTileManagerArray); - - // Free scratch space. 
- for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) - { -#if defined(_WIN32) - VirtualFree(pContext->ppScratch[i], 0, MEM_RELEASE); -#else - AlignedFree(pContext->ppScratch[i]); -#endif - - ArchRast::DestroyThreadContext(pContext->pArContext[i]); - } - - delete [] pContext->ppScratch; - delete [] pContext->pArContext; - delete [] pContext->pStats; - - delete(pContext->pHotTileMgr); - - pContext->~SWR_CONTEXT(); - AlignedFree(GetContext(hContext)); -} - void CopyState(DRAW_STATE& dst, const DRAW_STATE& src) { memcpy(&dst.state, &src.state, sizeof(API_STATE)); @@ -382,6 +342,54 @@ API_STATE* GetDrawState(SWR_CONTEXT *pContext) return &pDC->pState->state; } +void SwrDestroyContext(HANDLE hContext) +{ + SWR_CONTEXT *pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + + pDC->FeWork.type = SHUTDOWN; + pDC->FeWork.pfnWork = ProcessShutdown; + + //enqueue + QueueDraw(pContext); + + DestroyThreadPool(pContext, &pContext->threadPool); + + // free the fifos + for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i) + { + delete[] pContext->dcRing[i].dynState.pStats; + delete pContext->dcRing[i].pArena; + delete pContext->dsRing[i].pArena; + pContext->pMacroTileManagerArray[i].~MacroTileMgr(); + pContext->pDispatchQueueArray[i].~DispatchQueue(); + } + + AlignedFree(pContext->pDispatchQueueArray); + AlignedFree(pContext->pMacroTileManagerArray); + + // Free scratch space. 
+ for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) + { +#if defined(_WIN32) + VirtualFree(pContext->ppScratch[i], 0, MEM_RELEASE); +#else + AlignedFree(pContext->ppScratch[i]); +#endif + + ArchRast::DestroyThreadContext(pContext->pArContext[i]); + } + + delete[] pContext->ppScratch; + delete[] pContext->pArContext; + delete[] pContext->pStats; + + delete(pContext->pHotTileMgr); + + pContext->~SWR_CONTEXT(); + AlignedFree(GetContext(hContext)); +} + void SWR_API SwrSaveState( HANDLE hContext, void* pOutputStateBlock, diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp index d3d114ecdb0..0a0001d0776 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp @@ -78,6 +78,16 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup AR_END(BEDispatch, 1); } +////////////////////////////////////////////////////////////////////////// +/// @brief Process shutdown. +/// @param pDC - pointer to draw context (dispatch). +/// @param workerId - The unique worker ID that is assigned to this thread. +/// @param macroTile - macrotile ID (unused; this handler is a no-op). 
+void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) +{ + // Dummy function +} + void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) { uint32_t x, y; diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h index 9d2f317f316..e19a53d6b04 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend.h @@ -38,6 +38,7 @@ void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); +void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers); void InitClearTilesTable(); simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ); diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index a4dbbc5280b..dfcc1c0d39a 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -158,6 +158,7 @@ enum WORK_TYPE CLEAR, DISCARDINVALIDATETILES, STORETILES, + SHUTDOWN, }; struct BE_WORK diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index decc161f1f5..5d549873f36 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -80,6 +80,36 @@ void ProcessSync( pTileMgr->enqueue(0, 0, &work); } 
+////////////////////////////////////////////////////////////////////////// +/// @brief FE handler for SwrDestroyContext. +/// @param pContext - pointer to SWR context. +/// @param pDC - pointer to draw context. +/// @param workerId - thread's worker id. Every thread has a unique id. +/// @param pUserData - Pointer to user data; unused by the shutdown handler. +void ProcessShutdown( + SWR_CONTEXT *pContext, + DRAW_CONTEXT *pDC, + uint32_t workerId, + void *pUserData) +{ + BE_WORK work; + work.type = SHUTDOWN; + work.pfnWork = ProcessShutdownBE; + + MacroTileMgr *pTileMgr = pDC->pTileMgr; + // Enqueue at least 1 work item for each worker thread + // account for number of numa nodes + uint32_t numNumaNodes = pContext->threadPool.numaMask + 1; + + for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i) + { + for (uint32_t n = 0; n < numNumaNodes; ++n) + { + pTileMgr->enqueue(i, n, &work); + } + } +} + ////////////////////////////////////////////////////////////////////////// /// @brief FE handler for SwrClearRenderTarget. /// @param pContext - pointer to SWR context. 
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h index 6316156bfd0..46924947a73 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.h +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h @@ -304,6 +304,7 @@ void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, v void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); void ProcessDiscardInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +void ProcessShutdown(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative); diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index 446e795fb2b..b1a27f34c29 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -428,7 +428,8 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint32_t& curDrawBE, /// still have work pending in a previous draw. Additionally, the lockedTiles is /// hueristic that can steer a worker back to the same macrotile that it had been /// working on in a previous draw. -void WorkOnFifoBE( +/// @returns true if worker thread should shutdown +bool WorkOnFifoBE( SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE, @@ -436,12 +437,14 @@ void WorkOnFifoBE( uint32_t numaNode, uint32_t numaMask) { + bool bShutdown = false; + // Find the first incomplete draw that has pending work. If no such draw is found then // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE. 
uint32_t drawEnqueued = 0; if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false) { - return; + return false; } uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1; @@ -458,17 +461,17 @@ void WorkOnFifoBE( { DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT]; - if (pDC->isCompute) return; // We don't look at compute work. + if (pDC->isCompute) return false; // We don't look at compute work. // First wait for FE to be finished with this draw. This keeps threading model simple // but if there are lots of bubbles between draws then serializing FE and BE may // need to be revisited. - if (!pDC->doneFE) return; + if (!pDC->doneFE) return false; // If this draw is dependent on a previous draw then we need to bail. if (CheckDependency(pContext, pDC, lastRetiredDraw)) { - return; + return false; } // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it. @@ -512,6 +515,10 @@ void WorkOnFifoBE( { pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, workerId, tileID); } + else if (pWork->type == SHUTDOWN) + { + bShutdown = true; + } while ((pWork = tile->peek()) != nullptr) { @@ -526,7 +533,7 @@ void WorkOnFifoBE( // Optimization: If the draw is complete and we're the last one to have worked on it then // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete. - if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete()) + if ((curDrawBE == i) && (bShutdown || pDC->pTileMgr->isWorkComplete())) { // We can increment the current BE and safely move to next draw since we know this draw is complete. 
curDrawBE++; @@ -537,6 +544,11 @@ void WorkOnFifoBE( lockedTiles.clear(); break; } + + if (bShutdown) + { + break; + } } else { @@ -545,6 +557,8 @@ void WorkOnFifoBE( } } } + + return bShutdown; } ////////////////////////////////////////////////////////////////////////// @@ -710,8 +724,15 @@ DWORD workerThreadMain(LPVOID pData) uint32_t curDrawBE = 0; uint32_t curDrawFE = 0; - while (pContext->threadPool.inThreadShutdown == false) + bool bShutdown = false; + + while (true) { + if (bShutdown && !threadHasWork(curDrawBE)) + { + break; + } + uint32_t loop = 0; while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE)) { @@ -729,29 +750,18 @@ DWORD workerThreadMain(LPVOID pData) continue; } - if (pContext->threadPool.inThreadShutdown) - { - lock.unlock(); - break; - } - AR_BEGIN(WorkerWaitForThreadEvent, 0); pContext->FifosNotEmpty.wait(lock); lock.unlock(); AR_END(WorkerWaitForThreadEvent, 0); - - if (pContext->threadPool.inThreadShutdown) - { - break; - } } if (IsBEThread) { AR_BEGIN(WorkerWorkOnFifoBE, 0); - WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask); + bShutdown |= WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask); AR_END(WorkerWorkOnFifoBE, 0); WorkOnCompute(pContext, workerId, curDrawBE); @@ -918,7 +928,6 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) pPool->numThreads = numThreads; pContext->NumWorkerThreads = pPool->numThreads; - pPool->inThreadShutdown = false; pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA)); pPool->numaMask = 0; @@ -1001,17 +1010,15 @@ void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) { if (!pContext->threadInfo.SINGLE_THREADED) { - // Inform threads to finish up - std::unique_lock lock(pContext->WaitLock); - pPool->inThreadShutdown = true; - _mm_mfence(); - pContext->FifosNotEmpty.notify_all(); - lock.unlock(); + // Wait for all threads to finish + SwrWaitForIdle(pContext); // Wait for threads 
to finish and destroy them for (uint32_t t = 0; t < pPool->numThreads; ++t) { - pPool->pThreads[t]->join(); + // Detach from thread. Cannot join() due to possibility (in Windows) of code + // in some DllMain (DLL_THREAD_DETACH case) blocking the thread until after this returns. + pPool->pThreads[t]->detach(); delete(pPool->pThreads[t]); } diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h index 05231c5a38f..c802c576fc3 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.h +++ b/src/gallium/drivers/swr/rasterizer/core/threads.h @@ -54,7 +54,6 @@ struct THREAD_POOL THREAD_PTR* pThreads; uint32_t numThreads; uint32_t numaMask; - volatile bool inThreadShutdown; THREAD_DATA *pThreadData; }; @@ -65,6 +64,6 @@ void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); // Expose FE and BE worker functions to the API thread if single threaded void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE); -void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask); +bool WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask); void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE); int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC); -- 2.30.2