swr: [rasterizer core] Fix thread allocation

author Tim Rowley <timothy.o.rowley@intel.com>

Wed, 27 Apr 2016 22:45:13 +0000 (16:45 -0600)

committer Tim Rowley <timothy.o.rowley@intel.com>

Thu, 5 May 2016 19:49:11 +0000 (14:49 -0500)
author Tim Rowley <timothy.o.rowley@intel.com>
Wed, 27 Apr 2016 22:45:13 +0000 (16:45 -0600)
committer Tim Rowley <timothy.o.rowley@intel.com>
Thu, 5 May 2016 19:49:11 +0000 (14:49 -0500)
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp

index 47629e53ada756ffc392b895c72529c5c8f3726b..0b57a3fc836c3efd611c0b379d798c30a257c5c6 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -68,6 +68,8 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
  
  #if defined(_WIN32)
  
+    std::vector<KAFFINITY> threadMaskPerProcGroup;
+
      static std::mutex m;
      std::lock_guard<std::mutex> l(m);
  
@@ -96,14 +98,33 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
              while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
              {
                  // clear mask
-                gmask.Mask &= ~(KAFFINITY(1) << threadId);
+                KAFFINITY threadMask = KAFFINITY(1) << threadId;
+                gmask.Mask &= ~threadMask;
+
+                if (procGroup >= threadMaskPerProcGroup.size())
+                {
+                    threadMaskPerProcGroup.resize(procGroup + 1);
+                }
+
+                if (threadMaskPerProcGroup[procGroup] & threadMask)
+                {
+                    // This thread bit is already recorded for this procGroup, which means
+                    // we are in 32-bit mode and have seen more than 32 HW threads for the
+                    // group.  Skip the duplicate instead of adding it to the topology.
+#if defined(_WIN64)
+                    SWR_ASSERT(false, "Shouldn't get here in 64-bit mode");
+#endif
+                    continue;
+                }
+
+                threadMaskPerProcGroup[procGroup] |= (KAFFINITY(1) << threadId);
  
                  // Find Numa Node
+                uint32_t numaId = 0;
                  PROCESSOR_NUMBER procNum = {};
                  procNum.Group = WORD(procGroup);
                  procNum.Number = UCHAR(threadId);
  
-                uint32_t numaId = 0;
                  ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
                  SWR_ASSERT(ret);
  
@@ -118,16 +139,6 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
                      numaNode.cores.push_back(Core());
                      pCore = &numaNode.cores.back();
                      pCore->procGroup = procGroup;
-#if !defined(_WIN64)
-                    coreId = (uint32_t)numaNode.cores.size();
-                    if ((coreId * numThreads) > 32)
-                    {
-                        // Windows doesn't return threadIds >= 32 for a processor group correctly
-                        // when running a 32-bit application.
-                        // Just save -1 as the threadId
-                        threadId = uint32_t(-1);
-                    }
-#endif
                  }
                  pCore->threadIds.push_back(threadId);
                  if (procGroup == 0)
@@ -712,6 +723,17 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
      uint32_t numHWCoresPerNode  = (uint32_t)nodes[0].cores.size();
      uint32_t numHWHyperThreads  = (uint32_t)nodes[0].cores[0].threadIds.size();
  
+    // Calculate num HW threads.  Due to asymmetric topologies, this is not
+    // a trivial multiplication.
+    uint32_t numHWThreads = 0;
+    for (auto& node : nodes)
+    {
+        for (auto& core : node.cores)
+        {
+            numHWThreads += (uint32_t)core.threadIds.size();
+        }
+    }
+
      uint32_t numNodes           = numHWNodes;
      uint32_t numCoresPerNode    = numHWCoresPerNode;
      uint32_t numHyperThreads    = numHWHyperThreads;
@@ -759,6 +781,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
  
      // Calculate numThreads
      uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
+    numThreads = std::min(numThreads, numHWThreads);
  
      if (KNOB_MAX_WORKER_THREADS)
      {
@@ -849,23 +872,30 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
          for (uint32_t n = 0; n < numNodes; ++n)
          {
              auto& node = nodes[n];
-            if (node.cores.size() == 0)
-            {
-               continue;
-            }
-
              uint32_t numCores = numCoresPerNode;
              for (uint32_t c = 0; c < numCores; ++c)
              {
+                if (c >= node.cores.size())
+                {
+                    break;
+                }
+
                  auto& core = node.cores[c];
                  for (uint32_t t = 0; t < numHyperThreads; ++t)
                  {
+                    if (t >= core.threadIds.size())
+                    {
+                        break;
+                    }
+
                      if (numAPIReservedThreads)
                      {
                          --numAPIReservedThreads;
                          continue;
                      }
  
+                    SWR_ASSERT(workerId < numThreads);
+
                      pPool->pThreadData[workerId].workerId = workerId;
                      pPool->pThreadData[workerId].procGroupId = core.procGroup;
                      pPool->pThreadData[workerId].threadId = core.threadIds[t];
author	Tim Rowley <timothy.o.rowley@intel.com>
	Wed, 27 Apr 2016 22:45:13 +0000 (16:45 -0600)
committer	Tim Rowley <timothy.o.rowley@intel.com>
	Thu, 5 May 2016 19:49:11 +0000 (14:49 -0500)