From 20f9006603139a479b756c593c04a540041e3471 Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Mon, 11 Dec 2017 17:45:58 -0600 Subject: [PATCH] swr/rast: Rework thread binding parameters for machine partitioning Add BASE_NUMA_NODE, BASE_CORE, BASE_THREAD parameters to SwrCreateContext. Add optional SWR_API_THREADING_INFO parameter to SwrCreateContext to control reservation of API threads. Add SwrBindApiThread() function to allow binding of API threads to reserved HW threads. Reviewed-by: Bruce Cherniak --- .../swr/rasterizer/codegen/knob_defs.py | 29 +- .../drivers/swr/rasterizer/core/api.cpp | 40 ++- src/gallium/drivers/swr/rasterizer/core/api.h | 33 ++ .../drivers/swr/rasterizer/core/context.h | 1 + .../drivers/swr/rasterizer/core/threads.cpp | 299 +++++++++++++----- .../drivers/swr/rasterizer/core/threads.h | 4 + .../drivers/swr/rasterizer/core/tilemgr.cpp | 4 +- 7 files changed, 322 insertions(+), 88 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py b/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py index 09e31246021..30803927e3c 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py +++ b/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py @@ -62,15 +62,33 @@ KNOBS = [ 'category' : 'perf', }], - ['MAX_NUMA_NODES', { + ['BASE_NUMA_NODE', { 'type' : 'uint32_t', 'default' : '0', + 'desc' : ['Starting NUMA node index to use when allocating compute resources.', + 'Setting this to a non-zero value will reduce the maximum # of NUMA nodes used.'], + 'category' : 'perf', + 'advanced' : True, + }], + + ['MAX_NUMA_NODES', { + 'type' : 'uint32_t', + 'default' : '1' if sys.platform == 'win32' else '0', 'desc' : ['Maximum # of NUMA-nodes per system used for worker threads', ' 0 == ALL NUMA-nodes in the system', ' N == Use at most N NUMA-nodes for rendering'], 'category' : 'perf', }], + ['BASE_CORE', { + 'type' : 'uint32_t', + 'default' : '0', + 'desc' : ['Starting core index to use when allocating compute resources.', + 'Setting this to a non-zero value will reduce the maximum # of cores used.'], + 'category' : 'perf', + 'advanced' : True, + }], + ['MAX_CORES_PER_NUMA_NODE', { 'type' : 'uint32_t', 'default' : '0', @@ -80,6 +98,15 @@ KNOBS = [ 'category' : 'perf', }], + ['BASE_THREAD', { + 'type' : 'uint32_t', + 'default' : '0', + 'desc' : ['Starting thread index to use when allocating compute resources.', + 'Setting this to a non-zero value will reduce the maximum # of threads used.'], + 'category' : 'perf', + 'advanced' : True, + }], + ['MAX_THREADS_PER_CORE', { 'type' : 'uint32_t', 'default' : '1', diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index 9265440904f..25a3f348411 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -95,16 +95,32 @@ HANDLE SwrCreateContext( pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); } - pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS; - pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES; - pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE; - pContext->threadInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE; - pContext->threadInfo.SINGLE_THREADED = KNOB_SINGLE_THREADED; - if (pCreateInfo->pThreadInfo) { pContext->threadInfo = *pCreateInfo->pThreadInfo; } + else + { + pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS; + pContext->threadInfo.BASE_NUMA_NODE = KNOB_BASE_NUMA_NODE; + 
pContext->threadInfo.BASE_CORE            = KNOB_BASE_CORE;
+        pContext->threadInfo.BASE_THREAD          = KNOB_BASE_THREAD;
+        pContext->threadInfo.MAX_NUMA_NODES       = KNOB_MAX_NUMA_NODES;
+        pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE;
+        pContext->threadInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE;
+        pContext->threadInfo.SINGLE_THREADED      = KNOB_SINGLE_THREADED;
+    }
+
+    if (pCreateInfo->pApiThreadInfo)
+    {
+        pContext->apiThreadInfo = *pCreateInfo->pApiThreadInfo;
+    }
+    else
+    {
+        pContext->apiThreadInfo.bindAPIThread0        = true;
+        pContext->apiThreadInfo.numAPIReservedThreads = 1;
+        pContext->apiThreadInfo.numAPIThreadsPerCore  = 1;
+    }
 
     memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
     memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
@@ -113,6 +129,11 @@ HANDLE SwrCreateContext(
 
     CreateThreadPool(pContext, &pContext->threadPool);
 
+    if (pContext->apiThreadInfo.bindAPIThread0)
+    {
+        BindApiThread(pContext, 0);
+    }
+
     pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads];
     pContext->pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64);
 
@@ -407,6 +428,12 @@ void SwrDestroyContext(HANDLE hContext)
     AlignedFree(GetContext(hContext));
 }
 
+void SwrBindApiThread(HANDLE hContext, uint32_t apiThreadId)
+{
+    SWR_CONTEXT *pContext = GetContext(hContext);
+    BindApiThread(pContext, apiThreadId);
+}
+
 void SWR_API SwrSaveState(
     HANDLE hContext,
     void* pOutputStateBlock,
@@ -1688,6 +1715,7 @@ void SwrGetInterface(SWR_INTERFACE &out_funcs)
 {
     out_funcs.pfnSwrCreateContext = SwrCreateContext;
     out_funcs.pfnSwrDestroyContext = SwrDestroyContext;
+    out_funcs.pfnSwrBindApiThread = SwrBindApiThread;
     out_funcs.pfnSwrSaveState = SwrSaveState;
     out_funcs.pfnSwrRestoreState = SwrRestoreState;
     out_funcs.pfnSwrSync = SwrSync;
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h
index c032b0bb103..7247fa4215f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -181,6 +181,9 @@ class BucketManager;
 /////////////////////////////////////////////////////////////////////////
 struct SWR_THREADING_INFO
 {
+    uint32_t BASE_NUMA_NODE;
+    uint32_t BASE_CORE;
+    uint32_t BASE_THREAD;
     uint32_t MAX_WORKER_THREADS;
     uint32_t MAX_NUMA_NODES;
     uint32_t MAX_CORES_PER_NUMA_NODE;
@@ -188,6 +191,24 @@ struct SWR_THREADING_INFO
     bool SINGLE_THREADED;
 };
 
+//////////////////////////////////////////////////////////////////////////
+/// SWR_API_THREADING_INFO
+/// Data used to reserve HW threads for API use.
+/// API threads are reserved from the NUMA nodes / cores used for
+/// SWR worker threads.  Specifying reserved threads here can reduce
+/// the total number of SWR worker threads.
+/////////////////////////////////////////////////////////////////////////
+struct SWR_API_THREADING_INFO
+{
+    uint32_t numAPIReservedThreads; // Default is 1 if SWR_API_THREADING_INFO is not provided
+    uint32_t bindAPIThread0;        // Default is true if numAPIReservedThreads is > 0;
+                                    // binds the thread used in SwrCreateContext to API reserved
+                                    // thread 0
+    uint32_t numAPIThreadsPerCore;  // 0 means use all threads per core, else clamp to this number.
+                                    // Independent of KNOB_MAX_THREADS_PER_CORE.
+};
+
+
 //////////////////////////////////////////////////////////////////////////
 /// SWR_CREATECONTEXT_INFO
 /////////////////////////////////////////////////////////////////////////
@@ -219,6 +240,9 @@ struct SWR_CREATECONTEXT_INFO
     // Input (optional): Threading info that overrides any set KNOB values.
     SWR_THREADING_INFO* pThreadInfo;
 
+    // Input (optional): Info for reserving API threads
+    SWR_API_THREADING_INFO* pApiThreadInfo;
+
     // Input: if set to non-zero value, overrides KNOB value for maximum
     // number of draws in flight
     uint32_t MAX_DRAWS_IN_FLIGHT;
@@ -236,6 +260,14 @@ SWR_FUNC(HANDLE, SwrCreateContext,
 SWR_FUNC(void, SwrDestroyContext,
     HANDLE hContext);
 
+//////////////////////////////////////////////////////////////////////////
+/// @brief Bind current thread to an API reserved HW thread
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param apiThreadId - index of reserved HW thread to bind to.
+SWR_FUNC(void, SwrBindApiThread,
+    HANDLE hContext,
+    uint32_t apiThreadId);
+
 //////////////////////////////////////////////////////////////////////////
 /// @brief Saves API state associated with hContext
 /// @param hContext - Handle passed back from SwrCreateContext
@@ -720,6 +752,7 @@ struct SWR_INTERFACE
 {
     PFNSwrCreateContext pfnSwrCreateContext;
     PFNSwrDestroyContext pfnSwrDestroyContext;
+    PFNSwrBindApiThread pfnSwrBindApiThread;
     PFNSwrSaveState pfnSwrSaveState;
     PFNSwrRestoreState pfnSwrRestoreState;
     PFNSwrSync pfnSwrSync;
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index cba8de999be..6a63838eb5d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -480,6 +480,7 @@ struct SWR_CONTEXT
 
     THREAD_POOL threadPool; // Thread pool associated with this context
     SWR_THREADING_INFO threadInfo;
+    SWR_API_THREADING_INFO apiThreadInfo;
 
     uint32_t MAX_DRAWS_IN_FLIGHT;
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 6242cb3fc7c..d684ffe7278 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -284,13 +284,20 @@ void bindThread(SWR_CONTEXT* pContext, uint32_t threadId, uint32_t procGroupId =
     {
         // If MAX_WORKER_THREADS is set, only bind to the proc group,
        // not the individual HW thread.
-        if (!pContext->threadInfo.MAX_WORKER_THREADS)
+        if (!bindProcGroup && !pContext->threadInfo.MAX_WORKER_THREADS)
         {
             affinity.Mask = KAFFINITY(1) << threadId;
         }
+        else
+        {
+            affinity.Mask = KAFFINITY(0);
+        }
     }
 
-    SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr);
+    if (!SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr))
+    {
+        SWR_INVALID("Failed to set Thread Affinity");
+    }
 
 #elif defined(__linux__) || defined(__gnu_linux__)
 
@@ -727,6 +734,29 @@ void WorkOnCompute(
     }
 }
 
+void BindApiThread(SWR_CONTEXT *pContext, uint32_t apiThreadId)
+{
+    if (nullptr == pContext)
+    {
+        return;
+    }
+
+    if (apiThreadId >= pContext->threadPool.numReservedThreads)
+    {
+        if (pContext->threadPool.numReservedThreads)
+        {
+            const THREAD_DATA &threadData = pContext->threadPool.pApiThreadData[0];
+            // Just bind to the proc group used for API thread 0
+            bindThread(pContext, 0, threadData.procGroupId, true);
+        }
+        return;
+    }
+
+    const THREAD_DATA &threadData = pContext->threadPool.pApiThreadData[apiThreadId];
+
+    bindThread(pContext, threadData.threadId, threadData.procGroupId, threadData.forceBindProcGroup);
+}
+
 template<bool IsFEThread, bool IsBEThread>
 DWORD workerThreadMain(LPVOID pData)
 {
@@ -752,7 +782,8 @@ DWORD workerThreadMain(LPVOID pData)
 
     RDTSC_INIT(threadId);
 
-    uint32_t numaNode = pThreadData->numaId;
+    // Offset the numa index from the base so the masking below is correct
+    uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
     uint32_t numaMask = pContext->threadPool.numaMask;
 
     // flush denormals to 0
@@ -861,28 +892,50 @@ DWORD workerThreadInit(LPVOID pData)
 }
 template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
 
+static void InitPerThreadStats(SWR_CONTEXT* pContext, uint32_t numThreads)
+{
+    // Initialize DRAW_CONTEXT's per-thread stats
+    for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
+    {
+        pContext->dcRing[dc].dynState.pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64);
+        memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads);
+    }
+}
+
 //////////////////////////////////////////////////////////////////////////
 /// @brief Creates thread pool info but doesn't launch threads.
 /// @param pContext - pointer to context
 /// @param pPool - pointer to thread pool object.
 void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
 {
-    bindThread(pContext, 0);
-
     CPUNumaNodes nodes;
     uint32_t numThreadsPerProcGroup = 0;
     CalculateProcessorTopology(nodes, numThreadsPerProcGroup);
 
+    // Assumption: for asymmetric topologies, multi-threaded cores will appear
+    // in the list before single-threaded cores.  This appears to be true for
+    // Windows when the total number of HW threads is limited to 64.
     uint32_t numHWNodes         = (uint32_t)nodes.size();
     uint32_t numHWCoresPerNode  = (uint32_t)nodes[0].cores.size();
     uint32_t numHWHyperThreads  = (uint32_t)nodes[0].cores[0].threadIds.size();
 
+#if defined(_WIN32) && !defined(_WIN64)
+    if (!pContext->threadInfo.MAX_WORKER_THREADS)
+    {
+        // Limit 32-bit windows to bindable HW threads only
+        if ((numHWCoresPerNode * numHWHyperThreads) > 32)
+        {
+            numHWCoresPerNode = 32 / numHWHyperThreads;
+        }
+    }
+#endif
+
     // Calculate num HW threads.  Due to asymmetric topologies, this is not
     // a trivial multiplication.
uint32_t numHWThreads = 0; - for (auto& node : nodes) + for (auto const& node : nodes) { - for (auto& core : node.cores) + for (auto const& core : node.cores) { numHWThreads += (uint32_t)core.threadIds.size(); } @@ -892,14 +945,19 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) uint32_t numCoresPerNode = numHWCoresPerNode; uint32_t numHyperThreads = numHWHyperThreads; - if (pContext->threadInfo.MAX_NUMA_NODES) + // Calc used threads per-core + if (numHyperThreads > pContext->threadInfo.BASE_THREAD) { - numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES); + numHyperThreads -= pContext->threadInfo.BASE_THREAD; } - - if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE) + else { - numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE); + SWR_ASSERT( + false, + "Cannot use BASE_THREAD value: %d, maxThreads: %d, reverting BASE_THREAD to 0", + pContext->threadInfo.BASE_THREAD, + numHyperThreads); + pContext->threadInfo.BASE_THREAD = 0; } if (pContext->threadInfo.MAX_THREADS_PER_CORE) @@ -907,93 +965,139 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) numHyperThreads = std::min(numHyperThreads, pContext->threadInfo.MAX_THREADS_PER_CORE); } -#if defined(_WIN32) && !defined(_WIN64) - if (!pContext->threadInfo.MAX_WORKER_THREADS) + // Prune any cores that don't support the number of threads + if (numHyperThreads > 1) { - // Limit 32-bit windows to bindable HW threads only - if ((numCoresPerNode * numHWHyperThreads) > 32) + for (auto& node : nodes) { - numCoresPerNode = 32 / numHWHyperThreads; + uint32_t numUsableCores = 0; + for (auto& core : node.cores) + { + numUsableCores += (core.threadIds.size() >= numHyperThreads); + } + numCoresPerNode = std::min(numCoresPerNode, numUsableCores); } } -#endif - - // Calculate numThreads - uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads; - numThreads = std::min(numThreads, numHWThreads); - if (pContext->threadInfo.MAX_WORKER_THREADS) + // Calc used cores per NUMA node + if (numCoresPerNode > pContext->threadInfo.BASE_CORE) { - uint32_t maxHWThreads = numHWNodes * numHWCoresPerNode * numHWHyperThreads; - numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, maxHWThreads); + numCoresPerNode -= pContext->threadInfo.BASE_CORE; + } + else + { + SWR_ASSERT( + false, + "Cannot use BASE_CORE value: %d, maxCores: %d, reverting BASE_CORE to 0", + pContext->threadInfo.BASE_CORE, + numCoresPerNode); + pContext->threadInfo.BASE_CORE = 0; } - uint32_t numAPIReservedThreads = 1; - + if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE) + { + numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE); + } - if (numThreads == 1) + // Calc used NUMA nodes + if (numNodes > pContext->threadInfo.BASE_NUMA_NODE) { - // If only 1 worker threads, try to move it to an available - // HW thread. If that fails, use the API thread. 
- if (numCoresPerNode < numHWCoresPerNode) - { - numCoresPerNode++; - } - else if (numHyperThreads < numHWHyperThreads) - { - numHyperThreads++; - } - else if (numNodes < numHWNodes) - { - numNodes++; - } - else - { - pContext->threadInfo.SINGLE_THREADED = true; - } + numNodes -= pContext->threadInfo.BASE_NUMA_NODE; } else { - // Save HW threads for the API if we can - if (numThreads > numAPIReservedThreads) - { - numThreads -= numAPIReservedThreads; - } - else - { - numAPIReservedThreads = 0; - } + SWR_ASSERT( + false, + "Cannot use BASE_NUMA_NODE value: %d, maxNodes: %d, reverting BASE_NUMA_NODE to 0", + pContext->threadInfo.BASE_NUMA_NODE, + numNodes); + pContext->threadInfo.BASE_NUMA_NODE = 0; } - if (pContext->threadInfo.SINGLE_THREADED) + if (pContext->threadInfo.MAX_NUMA_NODES) { - numThreads = 1; + numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES); } - // Initialize DRAW_CONTEXT's per-thread stats - for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc) - { - pContext->dcRing[dc].dynState.pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64); - memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads); - } + // Calculate numThreads - at this point everything should be symmetric + uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads; + SWR_REL_ASSERT(numThreads <= numHWThreads); + + uint32_t& numAPIReservedThreads = pContext->apiThreadInfo.numAPIReservedThreads; + uint32_t& numAPIThreadsPerCore = pContext->apiThreadInfo.numAPIThreadsPerCore; + uint32_t numRemovedThreads = 0; if (pContext->threadInfo.SINGLE_THREADED) { + numAPIReservedThreads = 0; + numThreads = 1; pContext->NumWorkerThreads = 1; pContext->NumFEThreads = 1; pContext->NumBEThreads = 1; pPool->numThreads = 0; + } + else if (pContext->threadInfo.MAX_WORKER_THREADS) + { + numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, numHWThreads); + pContext->threadInfo.BASE_NUMA_NODE = 0; + pContext->threadInfo.BASE_CORE = 0; + pContext->threadInfo.BASE_THREAD = 0; + numAPIReservedThreads = 0; + } + else + { + if (numAPIReservedThreads >= numThreads) + { + numAPIReservedThreads = 0; + } + else if (numAPIReservedThreads) + { + numAPIThreadsPerCore = std::min(numAPIThreadsPerCore, numHWHyperThreads); + + if (0 == numAPIThreadsPerCore) + { + numAPIThreadsPerCore = numHWHyperThreads; + } + + numRemovedThreads = numAPIReservedThreads; + if (numAPIThreadsPerCore == 2 && numHyperThreads == 1) + { + // Adjust removed threads to make logic below work + numRemovedThreads = std::max(1U, (numRemovedThreads + numAPIThreadsPerCore - 1) / 2); + } + + numThreads -= numRemovedThreads; + } + } + InitPerThreadStats(pContext, numThreads); + + if (pContext->threadInfo.SINGLE_THREADED) + { return; } + if (numAPIReservedThreads) + { + pPool->pApiThreadData = new (std::nothrow) THREAD_DATA[numAPIReservedThreads]; + SWR_ASSERT(pPool->pApiThreadData); + if (!pPool->pApiThreadData) + { + numAPIReservedThreads = 0; + } + } + pPool->numReservedThreads = numAPIReservedThreads; + pPool->numThreads = numThreads; pContext->NumWorkerThreads = pPool->numThreads; - pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA)); + pPool->pThreadData = new (std::nothrow) THREAD_DATA[pPool->numThreads]; + SWR_ASSERT(pPool->pThreadData); pPool->numaMask = 0; - pPool->pThreads = new THREAD_PTR[pPool->numThreads]; + + pPool->pThreads = new (std::nothrow) THREAD_PTR[pPool->numThreads]; + SWR_ASSERT(pPool->pThreads); if (pContext->threadInfo.MAX_WORKER_THREADS) { 
@@ -1021,37 +1125,72 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) // numa distribution assumes workers on all nodes bool useNuma = true; if (numCoresPerNode * numHyperThreads == 1) + { useNuma = false; + } - if (useNuma) { + if (useNuma) + { pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.) - } else { + } + else + { pPool->numaMask = 0; } uint32_t workerId = 0; + uint32_t numReservedThreads = numAPIReservedThreads; for (uint32_t n = 0; n < numNodes; ++n) { - auto& node = nodes[n]; + if ((n + pContext->threadInfo.BASE_NUMA_NODE) >= nodes.size()) + { + break; + } + auto& node = nodes[n + pContext->threadInfo.BASE_NUMA_NODE]; uint32_t numCores = numCoresPerNode; for (uint32_t c = 0; c < numCores; ++c) { - if (c >= node.cores.size()) + if ((c + pContext->threadInfo.BASE_CORE) >= node.cores.size()) { break; } - auto& core = node.cores[c]; + auto& core = node.cores[c + pContext->threadInfo.BASE_CORE]; for (uint32_t t = 0; t < numHyperThreads; ++t) { - if (t >= core.threadIds.size()) + if ((t + pContext->threadInfo.BASE_THREAD) >= core.threadIds.size()) { break; } - if (numAPIReservedThreads) + if (numRemovedThreads) { - --numAPIReservedThreads; + --numRemovedThreads; + SWR_REL_ASSERT(numReservedThreads); + --numReservedThreads; + pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU; + pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup; + pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t]; + pPool->pApiThreadData[numReservedThreads].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0; + pPool->pApiThreadData[numReservedThreads].coreId = c + pContext->threadInfo.BASE_CORE; + pPool->pApiThreadData[numReservedThreads].htId = t + pContext->threadInfo.BASE_THREAD; + pPool->pApiThreadData[numReservedThreads].pContext = pContext; + pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false; + + + if (numAPIThreadsPerCore > numHyperThreads && numReservedThreads) + { + --numReservedThreads; + pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU; + pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup; + pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t + 1]; + pPool->pApiThreadData[numReservedThreads].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0; + pPool->pApiThreadData[numReservedThreads].coreId = c + pContext->threadInfo.BASE_CORE; + pPool->pApiThreadData[numReservedThreads].htId = t + pContext->threadInfo.BASE_THREAD; + pPool->pApiThreadData[numReservedThreads].pContext = pContext; + pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false; + } + continue; } @@ -1059,11 +1198,12 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) pPool->pThreadData[workerId].workerId = workerId; pPool->pThreadData[workerId].procGroupId = core.procGroup; - pPool->pThreadData[workerId].threadId = core.threadIds[t]; - pPool->pThreadData[workerId].numaId = useNuma ? n : 0; - pPool->pThreadData[workerId].coreId = c; - pPool->pThreadData[workerId].htId = t; + pPool->pThreadData[workerId].threadId = core.threadIds[t + pContext->threadInfo.BASE_THREAD]; + pPool->pThreadData[workerId].numaId = useNuma ? 
(n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
+            pPool->pThreadData[workerId].coreId   = c + pContext->threadInfo.BASE_CORE;
+            pPool->pThreadData[workerId].htId     = t + pContext->threadInfo.BASE_THREAD;
             pPool->pThreadData[workerId].pContext = pContext;
+            pPool->pThreadData[workerId].forceBindProcGroup = false;
 
             pContext->NumBEThreads++;
             pContext->NumFEThreads++;
@@ -1113,9 +1253,10 @@ void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
             delete(pPool->pThreads[t]);
         }
 
-        delete [] pPool->pThreads;
+        delete[] pPool->pThreads;
 
         // Clean up data used by threads
-        free(pPool->pThreadData);
+        delete[] pPool->pThreadData;
+        delete[] pPool->pApiThreadData;
     }
 }
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h
index dac8f86c1df..2e53265f424 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -55,6 +55,8 @@ struct THREAD_POOL
     uint32_t numThreads;
     uint32_t numaMask;
     THREAD_DATA *pThreadData;
+    uint32_t numReservedThreads; // Number of threads reserved for API use
+    THREAD_DATA *pApiThreadData;
 };
 
 typedef std::unordered_set<uint32_t> TileSet;
@@ -68,3 +70,5 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE)
 bool WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask);
 void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE);
 int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
+
+void BindApiThread(SWR_CONTEXT *pContext, uint32_t apiThreadId);
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
index a6c54ab86e8..3ade6e4333e 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -100,7 +100,7 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
         {
             uint32_t size = numSamples * mHotTileSize[attachment];
             uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
-            hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode);
+            hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
             hotTile.state = HOTTILE_INVALID;
             hotTile.numSamples = numSamples;
             hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
@@ -124,7 +124,7 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
             {
                 uint32_t size = numSamples * mHotTileSize[attachment];
                 uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
-                hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode);
+                hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
                 hotTile.state = HOTTILE_INVALID;
                 hotTile.numSamples = numSamples;
             }
-- 
2.30.2
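
For reviewers, a minimal usage sketch of the new interface.  This is
illustrative only: all SWR_CREATECONTEXT_INFO fields other than the two
new pointers are elided, the values assume a machine with at least two
NUMA nodes, and the second application thread is hypothetical.

    #include "api.h"  // SwrCreateContext, SwrBindApiThread, structs above

    // Partition the workers to start at NUMA node 1, one thread per core.
    SWR_THREADING_INFO threadInfo = {};
    threadInfo.BASE_NUMA_NODE       = 1;
    threadInfo.MAX_NUMA_NODES       = 1;   // workers use node 1 only
    threadInfo.MAX_THREADS_PER_CORE = 1;

    // Reserve two HW threads for API use.
    SWR_API_THREADING_INFO apiThreadInfo = {};
    apiThreadInfo.numAPIReservedThreads = 2;
    apiThreadInfo.bindAPIThread0        = true; // bind the calling thread now
    apiThreadInfo.numAPIThreadsPerCore  = 1;

    SWR_CREATECONTEXT_INFO createInfo = {};
    // ... remaining SWR_CREATECONTEXT_INFO fields set up as usual ...
    createInfo.pThreadInfo    = &threadInfo;
    createInfo.pApiThreadInfo = &apiThreadInfo;

    // Binds the current thread to API reserved HW thread 0, since
    // bindAPIThread0 is set.
    HANDLE hContext = SwrCreateContext(&createInfo);

    // Later, from a second application thread: bind it to reserved HW
    // thread 1.  An out-of-range apiThreadId falls back to binding the
    // proc group of reserved thread 0.
    SwrBindApiThread(hContext, 1);

Note the fallback: passing apiThreadId == 0 with no SWR_API_THREADING_INFO
still works, because one reserved thread is the default.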