{
- // If MAX_WORKER_THREADS is set, only bind to the proc group,
- // Not the individual HW thread.
+ // If MAX_WORKER_THREADS is set or proc group binding is requested,
+ // only bind to the proc group, not the individual HW thread.
- if (!pContext->threadInfo.MAX_WORKER_THREADS)
+ if (!bindProcGroup && !pContext->threadInfo.MAX_WORKER_THREADS)
{
affinity.Mask = KAFFINITY(1) << threadId;
}
+ else
+ {
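+ // No per-thread mask; bind to the proc group only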
+ affinity.Mask = KAFFINITY(0);
+ }
}
- SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr);
+ if (!SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr))
+ {
+ SWR_INVALID("Failed to set Thread Affinity");
+ }
#elif defined(__linux__) || defined(__gnu_linux__)
}
}
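+//////////////////////////////////////////////////////////////////////////
+/// @brief Binds the calling API thread to its reserved HW thread.
+/// @param pContext - pointer to context
+/// @param apiThreadId - API thread index; ids beyond the reserved count
+///        fall back to the proc group used for API thread 0.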
+void BindApiThread(SWR_CONTEXT *pContext, uint32_t apiThreadId)
+{
+ if (nullptr == pContext)
+ {
+ return;
+ }
+
+ if (apiThreadId >= pContext->threadPool.numReservedThreads)
+ {
+ if (pContext->threadPool.numReservedThreads)
+ {
+ const THREAD_DATA &threadData = pContext->threadPool.pApiThreadData[0];
+ // Just bind to the proc group used for API thread 0
+ bindThread(pContext, 0, threadData.procGroupId, true);
+ }
+ return;
+ }
+
+ const THREAD_DATA &threadData = pContext->threadPool.pApiThreadData[apiThreadId];
+
+ bindThread(pContext, threadData.threadId, threadData.procGroupId, threadData.forceBindProcGroup);
+}
+
template<bool IsFEThread, bool IsBEThread>
DWORD workerThreadMain(LPVOID pData)
{
RDTSC_INIT(threadId);
- uint32_t numaNode = pThreadData->numaId;
+ // Only the NUMA index offset from the base node is needed for correct masking
+ uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
uint32_t numaMask = pContext->threadPool.numaMask;
// flush denormals to 0
}
template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
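+//////////////////////////////////////////////////////////////////////////
+/// @brief Allocates and zeroes per-thread stats for each draw context.
+/// @param pContext - pointer to context
+/// @param numThreads - number of per-thread stats entries per draw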
+static void InitPerThreadStats(SWR_CONTEXT* pContext, uint32_t numThreads)
+{
+ // Initialize DRAW_CONTEXT's per-thread stats
+ for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
+ {
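+ // Allocate with 64-byte (cache line) alignment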
+ pContext->dcRing[dc].dynState.pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64);
+ memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads);
+ }
+}
+
//////////////////////////////////////////////////////////////////////////
/// @brief Creates thread pool info but doesn't launch threads.
/// @param pContext - pointer to context
/// @param pPool - pointer to thread pool object.
void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
{
- bindThread(pContext, 0);
-
CPUNumaNodes nodes;
uint32_t numThreadsPerProcGroup = 0;
CalculateProcessorTopology(nodes, numThreadsPerProcGroup);
+ // Assumption: for asymmetric topologies, multi-threaded cores will appear
+ // in the list before single-threaded cores. This appears to be true for
+ // Windows when the total number of HW threads is limited to 64.
uint32_t numHWNodes = (uint32_t)nodes.size();
uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size();
uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size();
+#if defined(_WIN32) && !defined(_WIN64)
+ if (!pContext->threadInfo.MAX_WORKER_THREADS)
+ {
+ // Limit 32-bit Windows to bindable HW threads only
+ if ((numHWCoresPerNode * numHWHyperThreads) > 32)
+ {
+ numHWCoresPerNode = 32 / numHWHyperThreads;
+ }
+ }
+#endif
+
// Calculate num HW threads. Due to asymmetric topologies, this is not
// a trivial multiplication.
uint32_t numHWThreads = 0;
- for (auto& node : nodes)
+ for (auto const& node : nodes)
{
- for (auto& core : node.cores)
+ for (auto const& core : node.cores)
{
numHWThreads += (uint32_t)core.threadIds.size();
}
uint32_t numCoresPerNode = numHWCoresPerNode;
uint32_t numHyperThreads = numHWHyperThreads;
- if (pContext->threadInfo.MAX_NUMA_NODES)
+ // Calc used threads per core, offset by BASE_THREAD
+ if (numHyperThreads > pContext->threadInfo.BASE_THREAD)
{
- numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
+ numHyperThreads -= pContext->threadInfo.BASE_THREAD;
}
-
- if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
+ else
{
- numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
+ SWR_ASSERT(
+ false,
+ "Cannot use BASE_THREAD value: %d, maxThreads: %d, reverting BASE_THREAD to 0",
+ pContext->threadInfo.BASE_THREAD,
+ numHyperThreads);
+ pContext->threadInfo.BASE_THREAD = 0;
}
if (pContext->threadInfo.MAX_THREADS_PER_CORE)
numHyperThreads = std::min(numHyperThreads, pContext->threadInfo.MAX_THREADS_PER_CORE);
}
-#if defined(_WIN32) && !defined(_WIN64)
- if (!pContext->threadInfo.MAX_WORKER_THREADS)
+ // Prune any cores that don't support the number of threads
+ if (numHyperThreads > 1)
{
- // Limit 32-bit windows to bindable HW threads only
- if ((numCoresPerNode * numHWHyperThreads) > 32)
+ for (auto& node : nodes)
{
- numCoresPerNode = 32 / numHWHyperThreads;
+ uint32_t numUsableCores = 0;
+ for (auto& core : node.cores)
+ {
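+ // Count cores exposing at least numHyperThreads HW threads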
+ numUsableCores += (core.threadIds.size() >= numHyperThreads);
+ }
+ numCoresPerNode = std::min(numCoresPerNode, numUsableCores);
}
}
-#endif
-
- // Calculate numThreads
- uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
- numThreads = std::min(numThreads, numHWThreads);
- if (pContext->threadInfo.MAX_WORKER_THREADS)
+ // Calc used cores per NUMA node
+ if (numCoresPerNode > pContext->threadInfo.BASE_CORE)
{
- uint32_t maxHWThreads = numHWNodes * numHWCoresPerNode * numHWHyperThreads;
- numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, maxHWThreads);
+ numCoresPerNode -= pContext->threadInfo.BASE_CORE;
+ }
+ else
+ {
+ SWR_ASSERT(
+ false,
+ "Cannot use BASE_CORE value: %d, maxCores: %d, reverting BASE_CORE to 0",
+ pContext->threadInfo.BASE_CORE,
+ numCoresPerNode);
+ pContext->threadInfo.BASE_CORE = 0;
}
- uint32_t numAPIReservedThreads = 1;
-
+ if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
+ {
+ numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
+ }
- if (numThreads == 1)
+ // Calc used NUMA nodes
+ if (numNodes > pContext->threadInfo.BASE_NUMA_NODE)
{
- // If only 1 worker threads, try to move it to an available
- // HW thread. If that fails, use the API thread.
- if (numCoresPerNode < numHWCoresPerNode)
- {
- numCoresPerNode++;
- }
- else if (numHyperThreads < numHWHyperThreads)
- {
- numHyperThreads++;
- }
- else if (numNodes < numHWNodes)
- {
- numNodes++;
- }
- else
- {
- pContext->threadInfo.SINGLE_THREADED = true;
- }
+ numNodes -= pContext->threadInfo.BASE_NUMA_NODE;
}
else
{
- // Save HW threads for the API if we can
- if (numThreads > numAPIReservedThreads)
- {
- numThreads -= numAPIReservedThreads;
- }
- else
- {
- numAPIReservedThreads = 0;
- }
+ SWR_ASSERT(
+ false,
+ "Cannot use BASE_NUMA_NODE value: %d, maxNodes: %d, reverting BASE_NUMA_NODE to 0",
+ pContext->threadInfo.BASE_NUMA_NODE,
+ numNodes);
+ pContext->threadInfo.BASE_NUMA_NODE = 0;
}
- if (pContext->threadInfo.SINGLE_THREADED)
+ if (pContext->threadInfo.MAX_NUMA_NODES)
{
- numThreads = 1;
+ numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
}
- // Initialize DRAW_CONTEXT's per-thread stats
- for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
- {
- pContext->dcRing[dc].dynState.pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64);
- memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads);
- }
+ // Calculate numThreads - at this point everything should be symmetric
+ uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
+ SWR_REL_ASSERT(numThreads <= numHWThreads);
+
+ uint32_t& numAPIReservedThreads = pContext->apiThreadInfo.numAPIReservedThreads;
+ uint32_t& numAPIThreadsPerCore = pContext->apiThreadInfo.numAPIThreadsPerCore;
+ uint32_t numRemovedThreads = 0;
if (pContext->threadInfo.SINGLE_THREADED)
{
+ numAPIReservedThreads = 0;
+ numThreads = 1;
pContext->NumWorkerThreads = 1;
pContext->NumFEThreads = 1;
pContext->NumBEThreads = 1;
pPool->numThreads = 0;
+ }
+ else if (pContext->threadInfo.MAX_WORKER_THREADS)
+ {
+ numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, numHWThreads);
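+ // Worker distribution uses all HW threads in order, so the BASE_* offsets don't apply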
+ pContext->threadInfo.BASE_NUMA_NODE = 0;
+ pContext->threadInfo.BASE_CORE = 0;
+ pContext->threadInfo.BASE_THREAD = 0;
+ numAPIReservedThreads = 0;
+ }
+ else
+ {
+ if (numAPIReservedThreads >= numThreads)
+ {
+ numAPIReservedThreads = 0;
+ }
+ else if (numAPIReservedThreads)
+ {
+ numAPIThreadsPerCore = std::min(numAPIThreadsPerCore, numHWHyperThreads);
+
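+ // A value of 0 reserves every HW thread on each reserved core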
+ if (0 == numAPIThreadsPerCore)
+ {
+ numAPIThreadsPerCore = numHWHyperThreads;
+ }
+
+ numRemovedThreads = numAPIReservedThreads;
+ if (numAPIThreadsPerCore == 2 && numHyperThreads == 1)
+ {
+ // Each skipped worker slot below reserves a full core (2 HW threads), so convert the thread count to cores, rounding up
+ numRemovedThreads = std::max(1U, (numRemovedThreads + numAPIThreadsPerCore - 1) / 2);
+ }
+
+ numThreads -= numRemovedThreads;
+ }
+ }
+ InitPerThreadStats(pContext, numThreads);
+
+ if (pContext->threadInfo.SINGLE_THREADED)
+ {
return;
}
+ if (numAPIReservedThreads)
+ {
+ pPool->pApiThreadData = new (std::nothrow) THREAD_DATA[numAPIReservedThreads];
+ SWR_ASSERT(pPool->pApiThreadData);
+ if (!pPool->pApiThreadData)
+ {
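+ // Allocation failed; run without reserved API threads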
+ numAPIReservedThreads = 0;
+ }
+ }
+ pPool->numReservedThreads = numAPIReservedThreads;
+
pPool->numThreads = numThreads;
pContext->NumWorkerThreads = pPool->numThreads;
- pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));
+ pPool->pThreadData = new (std::nothrow) THREAD_DATA[pPool->numThreads];
+ SWR_ASSERT(pPool->pThreadData);
pPool->numaMask = 0;
- pPool->pThreads = new THREAD_PTR[pPool->numThreads];
+
+ pPool->pThreads = new (std::nothrow) THREAD_PTR[pPool->numThreads];
+ SWR_ASSERT(pPool->pThreads);
if (pContext->threadInfo.MAX_WORKER_THREADS)
{
// numa distribution assumes workers on all nodes
bool useNuma = true;
if (numCoresPerNode * numHyperThreads == 1)
+ {
useNuma = false;
+ }
- if (useNuma) {
+ if (useNuma)
+ {
pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.)
- } else {
+ }
+ else
+ {
pPool->numaMask = 0;
}
uint32_t workerId = 0;
+ uint32_t numReservedThreads = numAPIReservedThreads;
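+ // pApiThreadData is filled from the last entry down as reserved HW threads are skipped below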
for (uint32_t n = 0; n < numNodes; ++n)
{
- auto& node = nodes[n];
+ if ((n + pContext->threadInfo.BASE_NUMA_NODE) >= nodes.size())
+ {
+ break;
+ }
+ auto& node = nodes[n + pContext->threadInfo.BASE_NUMA_NODE];
uint32_t numCores = numCoresPerNode;
for (uint32_t c = 0; c < numCores; ++c)
{
- if (c >= node.cores.size())
+ if ((c + pContext->threadInfo.BASE_CORE) >= node.cores.size())
{
break;
}
- auto& core = node.cores[c];
+ auto& core = node.cores[c + pContext->threadInfo.BASE_CORE];
for (uint32_t t = 0; t < numHyperThreads; ++t)
{
- if (t >= core.threadIds.size())
+ if ((t + pContext->threadInfo.BASE_THREAD) >= core.threadIds.size())
{
break;
}
- if (numAPIReservedThreads)
+ if (numRemovedThreads)
{
- --numAPIReservedThreads;
+ --numRemovedThreads;
+ SWR_REL_ASSERT(numReservedThreads);
+ --numReservedThreads;
+ pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU;
+ pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
+ pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t];
+ pPool->pApiThreadData[numReservedThreads].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
+ pPool->pApiThreadData[numReservedThreads].coreId = c + pContext->threadInfo.BASE_CORE;
+ pPool->pApiThreadData[numReservedThreads].htId = t + pContext->threadInfo.BASE_THREAD;
+ pPool->pApiThreadData[numReservedThreads].pContext = pContext;
+ pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
+
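+ // When more API threads are reserved per core than workers use, also reserve this core's next HW thread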
+ if (numAPIThreadsPerCore > numHyperThreads && numReservedThreads)
+ {
+ --numReservedThreads;
+ pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU;
+ pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
+ pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t + 1];
+ pPool->pApiThreadData[numReservedThreads].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
+ pPool->pApiThreadData[numReservedThreads].coreId = c + pContext->threadInfo.BASE_CORE;
+ pPool->pApiThreadData[numReservedThreads].htId = t + pContext->threadInfo.BASE_THREAD;
+ pPool->pApiThreadData[numReservedThreads].pContext = pContext;
+ pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
+ }
+
continue;
}
pPool->pThreadData[workerId].workerId = workerId;
pPool->pThreadData[workerId].procGroupId = core.procGroup;
- pPool->pThreadData[workerId].threadId = core.threadIds[t];
- pPool->pThreadData[workerId].numaId = useNuma ? n : 0;
- pPool->pThreadData[workerId].coreId = c;
- pPool->pThreadData[workerId].htId = t;
+ pPool->pThreadData[workerId].threadId = core.threadIds[t + pContext->threadInfo.BASE_THREAD];
+ pPool->pThreadData[workerId].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
+ pPool->pThreadData[workerId].coreId = c + pContext->threadInfo.BASE_CORE;
+ pPool->pThreadData[workerId].htId = t + pContext->threadInfo.BASE_THREAD;
pPool->pThreadData[workerId].pContext = pContext;
+ pPool->pThreadData[workerId].forceBindProcGroup = false;
pContext->NumBEThreads++;
pContext->NumFEThreads++;
delete(pPool->pThreads[t]);
}
- delete [] pPool->pThreads;
+ delete[] pPool->pThreads;
// Clean up data used by threads
- free(pPool->pThreadData);
+ delete[] pPool->pThreadData;
+ delete[] pPool->pApiThreadData;
}
}