From: Tim Rowley Date: Wed, 27 Apr 2016 22:45:13 +0000 (-0600) Subject: swr: [rasterizer core] Fix thread allocation X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=6e8922705435a1649e51ea404b89aaeba37d555a;p=mesa.git swr: [rasterizer core] Fix thread allocation Fix Windows in 32-bit mode when hyperthreading is disabled on Xeons. Some support for asymmetric processor topologies. Reviewed-by: Bruce Cherniak --- diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index 47629e53ada..0b57a3fc836 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -68,6 +68,8 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread #if defined(_WIN32) + std::vector<KAFFINITY> threadMaskPerProcGroup; + static std::mutex m; std::lock_guard<std::mutex> l(m); @@ -96,14 +98,33 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask)) { // clear mask - gmask.Mask &= ~(KAFFINITY(1) << threadId); + KAFFINITY threadMask = KAFFINITY(1) << threadId; + gmask.Mask &= ~threadMask; + + if (procGroup >= threadMaskPerProcGroup.size()) + { + threadMaskPerProcGroup.resize(procGroup + 1); + } + + if (threadMaskPerProcGroup[procGroup] & threadMask) + { + // Already seen this mask. 
This means that we are in 32-bit mode and + // have seen more than 32 HW threads for this procGroup + // Don't use it +#if defined(_WIN64) + SWR_ASSERT(false, "Shouldn't get here in 64-bit mode"); +#endif + continue; + } + + threadMaskPerProcGroup[procGroup] |= (KAFFINITY(1) << threadId); // Find Numa Node + uint32_t numaId = 0; PROCESSOR_NUMBER procNum = {}; procNum.Group = WORD(procGroup); procNum.Number = UCHAR(threadId); - uint32_t numaId = 0; ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId); SWR_ASSERT(ret); @@ -118,16 +139,6 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread numaNode.cores.push_back(Core()); pCore = &numaNode.cores.back(); pCore->procGroup = procGroup; -#if !defined(_WIN64) - coreId = (uint32_t)numaNode.cores.size(); - if ((coreId * numThreads) > 32) - { - // Windows doesn't return threadIds >= 32 for a processor group correctly - // when running a 32-bit application. - // Just save -1 as the threadId - threadId = uint32_t(-1); - } -#endif } pCore->threadIds.push_back(threadId); if (procGroup == 0) @@ -712,6 +723,17 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size(); uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size(); + // Calculate num HW threads. Due to asymmetric topologies, this is not + // a trivial multiplication. 
+ uint32_t numHWThreads = 0; + for (auto& node : nodes) + { + for (auto& core : node.cores) + { + numHWThreads += (uint32_t)core.threadIds.size(); + } + } + uint32_t numNodes = numHWNodes; uint32_t numCoresPerNode = numHWCoresPerNode; uint32_t numHyperThreads = numHWHyperThreads; @@ -759,6 +781,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) // Calculate numThreads uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads; + numThreads = std::min(numThreads, numHWThreads); if (KNOB_MAX_WORKER_THREADS) { @@ -849,23 +872,30 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) for (uint32_t n = 0; n < numNodes; ++n) { auto& node = nodes[n]; - if (node.cores.size() == 0) - { - continue; - } - uint32_t numCores = numCoresPerNode; for (uint32_t c = 0; c < numCores; ++c) { + if (c >= node.cores.size()) + { + break; + } + auto& core = node.cores[c]; for (uint32_t t = 0; t < numHyperThreads; ++t) { + if (t >= core.threadIds.size()) + { + break; + } + if (numAPIReservedThreads) { --numAPIReservedThreads; continue; } + SWR_ASSERT(workerId < numThreads); + pPool->pThreadData[workerId].workerId = workerId; pPool->pThreadData[workerId].procGroupId = core.procGroup; pPool->pThreadData[workerId].threadId = core.threadIds[t];