From 7472a8ee75ec3f2b401fa211edb26c2161eb4a6a Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Mon, 22 Aug 2016 11:49:48 -0500 Subject: [PATCH] swr: [rasterizer core] remove KNOB_MAX_NUM_THREADS Use dynamic memory allocation for per-thread data Signed-off-by: Tim Rowley --- .../drivers/swr/rasterizer/core/api.cpp | 48 ++++++++------- .../drivers/swr/rasterizer/core/backend.cpp | 2 +- .../drivers/swr/rasterizer/core/context.h | 15 +++-- .../drivers/swr/rasterizer/core/knobs.h | 2 - .../drivers/swr/rasterizer/core/threads.cpp | 59 ++++++++++++------- .../drivers/swr/rasterizer/core/threads.h | 2 +- 6 files changed, 77 insertions(+), 51 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index 599d1f4ed9d..7108a83d0d3 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -77,6 +77,15 @@ HANDLE SwrCreateContext( pContext->pMacroTileManagerArray = (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64); pContext->pDispatchQueueArray = (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64); + for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) + { + pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); + new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena); + new (&pContext->pDispatchQueueArray[dc]) DispatchQueue(); + + pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); + } + pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS; pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES; pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE; @@ -88,24 +97,12 @@ HANDLE SwrCreateContext( pContext->threadInfo = *pCreateInfo->pThreadInfo; } - for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) - { - pContext->dcRing[dc].pArena = new 
CachingArena(pContext->cachingArenaAllocator); - new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena); - new (&pContext->pDispatchQueueArray[dc]) DispatchQueue(); - - pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); - } - - if (!pContext->threadInfo.SINGLE_THREADED) - { - memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock)); - memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty)); - new (&pContext->WaitLock) std::mutex(); - new (&pContext->FifosNotEmpty) std::condition_variable(); + memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock)); + memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty)); + new (&pContext->WaitLock) std::mutex(); + new (&pContext->FifosNotEmpty) std::condition_variable(); - CreateThreadPool(pContext, &pContext->threadPool); - } + CreateThreadPool(pContext, &pContext->threadPool); // Calling createThreadPool() above can set SINGLE_THREADED if (pContext->threadInfo.SINGLE_THREADED) @@ -115,6 +112,9 @@ HANDLE SwrCreateContext( pContext->NumBEThreads = 1; } + pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads]; + pContext->pStats = new SWR_STATS[pContext->NumWorkerThreads]; + // Allocate scratch space for workers. ///@note We could lazily allocate this but its rather small amount of memory. for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) @@ -122,12 +122,12 @@ HANDLE SwrCreateContext( #if defined(_WIN32) uint32_t numaNode = pContext->threadPool.pThreadData ? 
pContext->threadPool.pThreadData[i].numaId : 0; - pContext->pScratch[i] = (uint8_t*)VirtualAllocExNuma( + pContext->ppScratch[i] = (uint8_t*)VirtualAllocExNuma( GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE), MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE, numaNode); #else - pContext->pScratch[i] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4); + pContext->ppScratch[i] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4); #endif } @@ -166,6 +166,7 @@ void SwrDestroyContext(HANDLE hContext) // free the fifos for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i) { + delete [] pContext->dcRing[i].dynState.pStats; delete pContext->dcRing[i].pArena; delete pContext->dsRing[i].pArena; pContext->pMacroTileManagerArray[i].~MacroTileMgr(); @@ -179,12 +180,15 @@ void SwrDestroyContext(HANDLE hContext) for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) { #if defined(_WIN32) - VirtualFree(pContext->pScratch[i], 0, MEM_RELEASE); + VirtualFree(pContext->ppScratch[i], 0, MEM_RELEASE); #else - AlignedFree(pContext->pScratch[i]); + AlignedFree(pContext->ppScratch[i]); #endif } + delete [] pContext->ppScratch; + delete [] pContext->pStats; + delete(pContext->pHotTileMgr); pContext->~SWR_CONTEXT(); @@ -352,7 +356,7 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) pCurDrawContext->threadsDone = 0; pCurDrawContext->retireCallback.pfnCallbackFunc = nullptr; - memset(&pCurDrawContext->dynState, 0, sizeof(pCurDrawContext->dynState)); + pCurDrawContext->dynState.Reset(pContext->threadPool.numThreads); // Assign unique drawId for this DC pCurDrawContext->drawId = pContext->dcRing.GetHead(); diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp index 7dd6c0db3de..0e92ccf2c88 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp @@ -68,7 +68,7 @@ void ProcessComputeBE(DRAW_CONTEXT* 
pDC, uint32_t workerId, uint32_t threadGroup csContext.dispatchDims[0] = pTaskData->threadGroupCountX; csContext.dispatchDims[1] = pTaskData->threadGroupCountY; csContext.dispatchDims[2] = pTaskData->threadGroupCountZ; - csContext.pTGSM = pContext->pScratch[workerId]; + csContext.pTGSM = pContext->ppScratch[workerId]; csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer; state.pfnCsFunc(GetPrivateState(pDC), &csContext); diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index 0848264f8fa..fe78cd6dc93 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -368,12 +368,19 @@ struct DRAW_STATE struct DRAW_DYNAMIC_STATE { + void Reset(uint32_t numThreads) + { + SWR_STATS* pSavePtr = pStats; + memset(this, 0, sizeof(*this)); + pStats = pSavePtr; + memset(pStats, 0, sizeof(SWR_STATS) * (numThreads ? numThreads : 1)); + } ///@todo Currently assumes only a single FE can do stream output for a draw. uint32_t SoWriteOffset[4]; bool SoWriteOffsetDirty[4]; SWR_STATS_FE statsFE; // Only one FE thread per DC. - SWR_STATS stats[KNOB_MAX_NUM_THREADS]; + SWR_STATS* pStats; }; // Draw Context @@ -486,10 +493,10 @@ struct SWR_CONTEXT PFN_UPDATE_STATS_FE pfnUpdateStatsFE; // Global Stats - SWR_STATS stats[KNOB_MAX_NUM_THREADS]; + SWR_STATS* pStats; // Scratch space for workers. 
- uint8_t* pScratch[KNOB_MAX_NUM_THREADS]; + uint8_t** ppScratch; volatile int32_t drawsOutstandingFE; @@ -501,5 +508,5 @@ struct SWR_CONTEXT TileSet singleThreadLockedTiles; }; -#define UPDATE_STAT(name, count) if (GetApiState(pDC).enableStats) { pDC->dynState.stats[workerId].name += count; } +#define UPDATE_STAT(name, count) if (GetApiState(pDC).enableStats) { pDC->dynState.pStats[workerId].name += count; } #define UPDATE_STAT_FE(name, count) if (GetApiState(pDC).enableStats) { pDC->dynState.statsFE.name += count; } diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h index 740613118a6..c01ad67f7c4 100644 --- a/src/gallium/drivers/swr/rasterizer/core/knobs.h +++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h @@ -92,8 +92,6 @@ /////////////////////////////////////////////////////////////////////////////// // Configuration knobs /////////////////////////////////////////////////////////////////////////////// -#define KNOB_MAX_NUM_THREADS 256 // Supports up to dual-HSW-Xeon. 
- // Maximum supported number of active vertex buffer streams #define KNOB_NUM_STREAMS 32 diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index 9665f09e2c8..ed03d70a1f9 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -73,14 +73,19 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread static std::mutex m; std::lock_guard l(m); - static SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS]; - DWORD bufSize = sizeof(buffer); + DWORD bufSize = 0; - BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, buffer, &bufSize); + BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize); + SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER); + + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize); + SWR_ASSERT(pBufferMem); + + ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize); SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information"); - uint32_t count = bufSize / buffer->Size; - PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = buffer; + uint32_t count = bufSize / pBufferMem->Size; + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem; for (uint32_t i = 0; i < count; ++i) { @@ -150,6 +155,8 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread pBuffer = PtrAdd(pBuffer, pBuffer->Size); } + free(pBufferMem); + #elif defined(__linux__) || defined (__gnu_linux__) @@ -321,10 +328,10 @@ INLINE void UpdateClientStats(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC) // Sum up stats across all workers before sending to client. 
for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) { - stats.DepthPassCount += dynState.stats[i].DepthPassCount; + stats.DepthPassCount += dynState.pStats[i].DepthPassCount; - stats.PsInvocations += dynState.stats[i].PsInvocations; - stats.CsInvocations += dynState.stats[i].CsInvocations; + stats.PsInvocations += dynState.pStats[i].PsInvocations; + stats.CsInvocations += dynState.pStats[i].CsInvocations; } pContext->pfnUpdateStats(GetPrivateState(pDC), &stats); @@ -849,13 +856,6 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, maxHWThreads); } - if (numThreads > KNOB_MAX_NUM_THREADS) - { - printf("WARNING: system thread count %u exceeds max %u, " - "performance will be degraded\n", - numThreads, KNOB_MAX_NUM_THREADS); - } - uint32_t numAPIReservedThreads = 1; @@ -878,8 +878,8 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) else { pPool->numThreads = 0; - SET_KNOB(SINGLE_THREADED, true); - return; + numThreads = 1; + pContext->threadInfo.SINGLE_THREADED = true; } } else @@ -895,6 +895,19 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) } } + // Initialize DRAW_CONTEXT's per-thread stats + for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) + { + pContext->dcRing[dc].dynState.pStats = new SWR_STATS[numThreads]; + memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads); + } + + if (pContext->threadInfo.SINGLE_THREADED) + { + return; + } + + pPool->numThreads = numThreads; pContext->NumWorkerThreads = pPool->numThreads; @@ -902,6 +915,8 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA)); pPool->numaMask = 0; + pPool->pThreads = new THREAD_PTR[pPool->numThreads]; + if (pContext->threadInfo.MAX_WORKER_THREADS) { bool bForceBindProcGroup = (numThreads > numThreadsPerProcGroup); @@ -918,7 +933,7 @@ void 
CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) pPool->pThreadData[workerId].htId = 0; pPool->pThreadData[workerId].pContext = pContext; pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup; - pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]); + pPool->pThreads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]); pContext->NumBEThreads++; pContext->NumFEThreads++; @@ -964,7 +979,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) pPool->pThreadData[workerId].htId = t; pPool->pThreadData[workerId].pContext = pContext; - pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]); + pPool->pThreads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]); pContext->NumBEThreads++; pContext->NumFEThreads++; @@ -989,10 +1004,12 @@ void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) // Wait for threads to finish and destroy them for (uint32_t t = 0; t < pPool->numThreads; ++t) { - pPool->threads[t]->join(); - delete(pPool->threads[t]); + pPool->pThreads[t]->join(); + delete(pPool->pThreads[t]); } + delete [] pPool->pThreads; + // Clean up data used by threads free(pPool->pThreadData); } diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h index 157f46aff70..05231c5a38f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.h +++ b/src/gallium/drivers/swr/rasterizer/core/threads.h @@ -51,7 +51,7 @@ struct THREAD_DATA struct THREAD_POOL { - THREAD_PTR threads[KNOB_MAX_NUM_THREADS]; + THREAD_PTR* pThreads; uint32_t numThreads; uint32_t numaMask; volatile bool inThreadShutdown; -- 2.30.2