From a939a58881063c092a95bd7f1426b8fae1d8a44d Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Wed, 30 Mar 2016 14:59:40 -0600 Subject: [PATCH] swr: [rasterizer core] Add experimental support for hyper-threaded front-end Acked-by: Brian Paul --- .../drivers/swr/rasterizer/core/api.cpp | 8 +- .../drivers/swr/rasterizer/core/context.h | 38 +++--- .../drivers/swr/rasterizer/core/threads.cpp | 126 +++++++++++++----- .../drivers/swr/rasterizer/core/threads.h | 4 +- .../swr/rasterizer/scripts/knob_defs.py | 19 +++ 5 files changed, 139 insertions(+), 56 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index 9e13ee142a5..665b6c0453f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -87,7 +87,10 @@ HANDLE SwrCreateContext( // Calling createThreadPool() above can set SINGLE_THREADED if (KNOB_SINGLE_THREADED) { + SET_KNOB(HYPERTHREADED_FE, false); pContext->NumWorkerThreads = 1; + pContext->NumFEThreads = 1; + pContext->NumBEThreads = 1; } // Allocate scratch space for workers. @@ -177,8 +180,7 @@ void QueueWork(SWR_CONTEXT *pContext) // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and // then moved on if all work is done.) - pContext->pCurDrawContext->threadsDone = - pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2; + pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads; _ReadWriteBarrier(); { @@ -196,7 +198,7 @@ void QueueWork(SWR_CONTEXT *pContext) { static TileSet lockedTiles; uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId }; - WorkOnFifoFE(pContext, 0, curDraw[0], 0); + WorkOnFifoFE(pContext, 0, curDraw[0]); WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0); } else diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index 27abe437718..2c28286b5ad 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -382,32 +382,28 @@ struct DRAW_STATE // This draw context maintains all of the state needed for the draw operation. struct DRAW_CONTEXT { - SWR_CONTEXT *pContext; + SWR_CONTEXT* pContext; + uint64_t drawId; + MacroTileMgr* pTileMgr; + DispatchQueue* pDispatch; // Queue for thread groups. (isCompute) + uint64_t dependency; + DRAW_STATE* pState; + CachingArena* pArena; - uint64_t drawId; + bool isCompute; // Is this DC a compute context? + bool cleanupState; // True if this is the last draw using an entry in the state ring. + volatile bool doneFE; // Is FE work done for this draw? - bool isCompute; // Is this DC a compute context? + volatile OSALIGNLINE(uint32_t) FeLock; + volatile int64_t threadsDone; - FE_WORK FeWork; - volatile OSALIGNLINE(uint32_t) FeLock; - volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw? - volatile OSALIGNLINE(int64_t) threadsDone; + OSALIGNLINE(FE_WORK) FeWork; + uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS]; // Scratch space used for spill fills. - uint64_t dependency; - - MacroTileMgr* pTileMgr; - - // The following fields are valid if isCompute is true. - DispatchQueue* pDispatch; // Queue for thread groups. (isCompute) - - DRAW_STATE* pState; - CachingArena* pArena; - - uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS]; // Scratch space used for spill fills. - - bool cleanupState; // True if this is the last draw using an entry in the state ring. }; +static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT"); + INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC) { SWR_ASSERT(pDC != nullptr); @@ -459,6 +455,8 @@ struct SWR_CONTEXT uint32_t curStateId; // Current index to the next available entry in the DS ring. uint32_t NumWorkerThreads; + uint32_t NumFEThreads; + uint32_t NumBEThreads; THREAD_POOL threadPool; // Thread pool associated with this context diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index 056003e467c..bee1e138002 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -305,10 +305,10 @@ INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC) return result; } -INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE) +INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE, uint64_t& drawEnqueued) { // increment our current draw id to the first incomplete draw - uint64_t drawEnqueued = GetEnqueuedDraw(pContext); + drawEnqueued = GetEnqueuedDraw(pContext); while (curDrawBE < drawEnqueued) { DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT]; @@ -316,8 +316,9 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE) // If its not compute and FE is not done then break out of loop. if (!pDC->doneFE && !pDC->isCompute) break; - bool isWorkComplete = (pDC->isCompute) ? - pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete(); + bool isWorkComplete = pDC->isCompute ? + pDC->pDispatch->isWorkComplete() : + pDC->pTileMgr->isWorkComplete(); if (isWorkComplete) { @@ -358,7 +359,8 @@ void WorkOnFifoBE( { // Find the first incomplete draw that has pending work. If no such draw is found then // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE. - if (FindFirstIncompleteDraw(pContext, curDrawBE) == false) + uint64_t drawEnqueued = 0; + if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false) { return; } @@ -373,7 +375,7 @@ void WorkOnFifoBE( // 2. If we're trying to work on draws after curDrawBE, we are restricted to // working on those macrotiles that are known to be complete in the prior draw to // maintain order. The locked tiles provides the history to ensures this. - for (uint64_t i = curDrawBE; i < GetEnqueuedDraw(pContext); ++i) + for (uint64_t i = curDrawBE; i < drawEnqueued; ++i) { DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT]; @@ -466,7 +468,7 @@ void WorkOnFifoBE( } } -void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode) +void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE) { // Try to grab the next DC from the ring uint64_t drawEnqueued = GetEnqueuedDraw(pContext); @@ -519,38 +521,43 @@ void WorkOnCompute( uint32_t workerId, uint64_t& curDrawBE) { - if (FindFirstIncompleteDraw(pContext, curDrawBE) == false) + uint64_t drawEnqueued = 0; + if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false) { return; } uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1; - DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT]; - if (pDC->isCompute == false) return; - - // check dependencies - if (CheckDependency(pContext, pDC, lastRetiredDraw)) + for (uint64_t i = curDrawBE; curDrawBE < drawEnqueued; ++i) { - return; - } + DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT]; + if (pDC->isCompute == false) return; + + // check dependencies + if (CheckDependency(pContext, pDC, lastRetiredDraw)) + { + return; + } - SWR_ASSERT(pDC->pDispatch != nullptr); - DispatchQueue& queue = *pDC->pDispatch; + SWR_ASSERT(pDC->pDispatch != nullptr); + DispatchQueue& queue = *pDC->pDispatch; - // Is there any work remaining? - if (queue.getNumQueued() > 0) - { - uint32_t threadGroupId = 0; - while (queue.getWork(threadGroupId)) + // Is there any work remaining? + if (queue.getNumQueued() > 0) { - ProcessComputeBE(pDC, workerId, threadGroupId); + uint32_t threadGroupId = 0; + while (queue.getWork(threadGroupId)) + { + ProcessComputeBE(pDC, workerId, threadGroupId); - queue.finishedWork(); + queue.finishedWork(); + } } } } +template DWORD workerThreadMain(LPVOID pData) { THREAD_DATA *pThreadData = (THREAD_DATA*)pData; @@ -634,25 +641,38 @@ DWORD workerThreadMain(LPVOID pData) } } - RDTSC_START(WorkerWorkOnFifoBE); - WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask); - RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0); + if (IsBEThread) + { + RDTSC_START(WorkerWorkOnFifoBE); + WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask); + RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0); - WorkOnCompute(pContext, workerId, curDrawBE); + WorkOnCompute(pContext, workerId, curDrawBE); + } + + if (IsFEThread) + { + WorkOnFifoFE(pContext, workerId, curDrawFE); - WorkOnFifoFE(pContext, workerId, curDrawFE, numaNode); + if (!IsBEThread) + { + curDrawBE = curDrawFE; + } + } } return 0; } +template<> DWORD workerThreadMain(LPVOID) = delete; +template DWORD workerThreadInit(LPVOID pData) { #if defined(_WIN32) __try #endif // _WIN32 { - return workerThreadMain(pData); + return workerThreadMain(pData); } #if defined(_WIN32) @@ -664,6 +684,7 @@ DWORD workerThreadInit(LPVOID pData) return 1; } +template<> DWORD workerThreadInit(LPVOID pData) = delete; void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) { @@ -681,6 +702,16 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) uint32_t numCoresPerNode = numHWCoresPerNode; uint32_t numHyperThreads = numHWHyperThreads; + if (KNOB_MAX_WORKER_THREADS) + { + SET_KNOB(HYPERTHREADED_FE, false); + } + + if (KNOB_HYPERTHREADED_FE) + { + SET_KNOB(MAX_THREADS_PER_CORE, 0); + } + if (KNOB_MAX_NUMA_NODES) { numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES); @@ -696,6 +727,11 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE); } + if (numHyperThreads < 2) + { + SET_KNOB(HYPERTHREADED_FE, false); + } + // Calculate numThreads uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads; @@ -770,9 +806,14 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) pPool->pThreadData[workerId].procGroupId = workerId % numProcGroups; pPool->pThreadData[workerId].threadId = 0; pPool->pThreadData[workerId].numaId = 0; + pPool->pThreadData[workerId].coreId = 0; + pPool->pThreadData[workerId].htId = 0; pPool->pThreadData[workerId].pContext = pContext; pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup; - pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]); + pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]); + + pContext->NumBEThreads++; + pContext->NumFEThreads++; } } else @@ -804,8 +845,29 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) pPool->pThreadData[workerId].procGroupId = core.procGroup; pPool->pThreadData[workerId].threadId = core.threadIds[t]; pPool->pThreadData[workerId].numaId = n; + pPool->pThreadData[workerId].coreId = c; + pPool->pThreadData[workerId].htId = t; pPool->pThreadData[workerId].pContext = pContext; - pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]); + + if (KNOB_HYPERTHREADED_FE) + { + if (t == 0) + { + pContext->NumBEThreads++; + pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]); + } + else + { + pContext->NumFEThreads++; + pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]); + } + } + else + { + pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]); + pContext->NumBEThreads++; + pContext->NumFEThreads++; + } ++workerId; } diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h index 821d7dcb16e..3aba6323a95 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.h +++ b/src/gallium/drivers/swr/rasterizer/core/threads.h @@ -41,6 +41,8 @@ struct THREAD_DATA uint32_t procGroupId; // Will always be 0 for non-Windows OS uint32_t threadId; // within the procGroup for Windows uint32_t numaId; // NUMA node id + uint32_t coreId; // Core id + uint32_t htId; // Hyperthread id uint32_t workerId; SWR_CONTEXT *pContext; bool forceBindProcGroup; // Only useful when KNOB_MAX_WORKER_THREADS is set. @@ -62,7 +64,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); // Expose FE and BE worker functions to the API thread if single threaded -void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode); +void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE); void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask); void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE); int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC); \ No newline at end of file diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py index 0f3ded68544..3832b91d93e 100644 --- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py +++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py @@ -30,6 +30,18 @@ KNOBS = [ 'category' : 'debug', }], + ['HYPERTHREADED_FE', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['EXPERIMENTAL!!', + 'If enabled will attempt to use secondary threads per core to perform', + 'front-end (VS/GS) work.', + '', + 'Note: Setting this will cause KNOB_MAX_THREADS_PER_CORE to be ignored.'], + 'category' : 'perf', + 'advanced' : 'true', + }], + ['DUMP_SHADER_IR', { 'type' : 'bool', 'default' : 'false', @@ -166,6 +178,7 @@ KNOBS = [ '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], 'category' : 'perf', + 'advanced' : 'true', }], ['TOSS_FETCH', { @@ -175,6 +188,7 @@ KNOBS = [ '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], 'category' : 'perf', + 'advanced' : 'true', }], ['TOSS_IA', { @@ -184,6 +198,7 @@ KNOBS = [ '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], 'category' : 'perf', + 'advanced' : 'true', }], ['TOSS_VS', { @@ -193,6 +208,7 @@ KNOBS = [ '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], 'category' : 'perf', + 'advanced' : 'true', }], ['TOSS_SETUP_TRIS', { @@ -202,6 +218,7 @@ KNOBS = [ '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], 'category' : 'perf', + 'advanced' : 'true', }], ['TOSS_BIN_TRIS', { @@ -211,6 +228,7 @@ KNOBS = [ '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], 'category' : 'perf', + 'advanced' : 'true', }], ['TOSS_RS', { @@ -220,4 +238,5 @@ KNOBS = [ '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], 'category' : 'perf', + 'advanced' : 'true', }],] -- 2.30.2