swr: [rasterizer core] Support sparse numa id values on all OSes

[mesa.git] / src / gallium / drivers / swr / rasterizer / core / threads.cpp
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp

index b1a27f34c2924dafaa9e49a4e206bba14fddb596..e11291bb83e9c9f1a090f9b97607fd7776f01cad 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -56,6 +56,7 @@ struct Core
  
  struct NumaNode
  {
+    uint32_t          numaId;
      std::vector<Core> cores;
  };
  
@@ -134,8 +135,12 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
                  SWR_ASSERT(ret);
  
                  // Store data
-                if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1);
+                if (out_nodes.size() <= numaId)
+                {
+                    out_nodes.resize(numaId + 1);
+                }
                  auto& numaNode = out_nodes[numaId];
+                numaNode.numaId = numaId;
  
                  uint32_t coreId = 0;
  
@@ -175,11 +180,18 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
              if (threadId != uint32_t(-1))
              {
                  // Save information.
-                if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1);
+                if (out_nodes.size() <= numaId)
+                {
+                    out_nodes.resize(numaId + 1);
+                }
+
                  auto& numaNode = out_nodes[numaId];
-                if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1);
-                auto& core = numaNode.cores[coreId];
+                if (numaNode.cores.size() <= coreId)
+                {
+                    numaNode.cores.resize(coreId + 1);
+                }
  
+                auto& core = numaNode.cores[coreId];
                  core.procGroup = coreId;
                  core.threadIds.push_back(threadId);
  
@@ -207,9 +219,16 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
      if (threadId != uint32_t(-1))
      {
          // Save information.
-        if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1);
+        if (out_nodes.size() <= numaId)
+        {
+            out_nodes.resize(numaId + 1);
+        }
          auto& numaNode = out_nodes[numaId];
-        if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1);
+        numaNode.numaId = numaId;
+        if (numaNode.cores.size() <= coreId)
+        {
+            numaNode.cores.resize(coreId + 1);
+        }
          auto& core = numaNode.cores[coreId];
  
          core.procGroup = coreId;
@@ -217,22 +236,38 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
          out_numThreadsPerProcGroup++;
      }
  
-    for (uint32_t node = 0; node < out_nodes.size(); node++) {
-        auto& numaNode = out_nodes[node];
-        auto it = numaNode.cores.begin();
-        for ( ; it != numaNode.cores.end(); ) {
-            if (it->threadIds.size() == 0)
-                numaNode.cores.erase(it);
-            else
-                ++it;
-        }
-    }
-
  #else
  
  #error Unsupported platform
  
  #endif
+
+    // Prune empty cores and numa nodes
+    for (auto node_it = out_nodes.begin(); node_it != out_nodes.end(); )
+    {
+        // Erase empty cores (first)
+        for (auto core_it = node_it->cores.begin(); core_it != node_it->cores.end(); )
+        {
+            if (core_it->threadIds.size() == 0)
+            {
+                core_it = node_it->cores.erase(core_it);
+            }
+            else
+            {
+                ++core_it;
+            }
+        }
+
+        // Erase empty numa nodes (second)
+        if (node_it->cores.size() == 0)
+        {
+            node_it = out_nodes.erase(node_it);
+        }
+        else
+        {
+            ++node_it;
+        }
+    }
  }
  
  
@@ -313,11 +348,16 @@ bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t lastReti
      return pDC->dependent && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
  }
  
+bool CheckDependencyFE(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t lastRetiredDraw)
+{
+    return pDC->dependentFE && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
+}
+
  //////////////////////////////////////////////////////////////////////////
  /// @brief Update client stats.
-INLINE void UpdateClientStats(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
+INLINE void UpdateClientStats(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
  {
-    if ((pContext->pfnUpdateStats == nullptr) || (GetApiState(pDC).enableStats == false))
+    if ((pContext->pfnUpdateStats == nullptr) || (GetApiState(pDC).enableStatsBE == false))
      {
          return;
      }
@@ -334,12 +374,13 @@ INLINE void UpdateClientStats(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
          stats.CsInvocations  += dynState.pStats[i].CsInvocations;
      }
  
+
      pContext->pfnUpdateStats(GetPrivateState(pDC), &stats);
  }
  
-INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
+INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
  {
-    UpdateClientStats(pContext, pDC);
+    UpdateClientStats(pContext, workerId, pDC);
  
      if (pDC->retireCallback.pfnCallbackFunc)
      {
@@ -350,14 +391,14 @@ INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
  }
  
  // inlined-only version
-INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
+INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
  {
      int32_t result = InterlockedDecrement((volatile LONG*)&pDC->threadsDone);
      SWR_ASSERT(result >= 0);
  
      if (result == 0)
      {
-        ExecuteCallbacks(pContext, pDC);
+        ExecuteCallbacks(pContext, workerId, pDC);
  
          // Cleanup memory allocations
          pDC->pArena->Reset(true);
@@ -381,10 +422,10 @@ INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
  // available to other translation modules
  int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
  {
-    return CompleteDrawContextInl(pContext, pDC);
+    return CompleteDrawContextInl(pContext, 0, pDC);
  }
  
-INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint32_t& curDrawBE, uint32_t& drawEnqueued)
+INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawBE, uint32_t& drawEnqueued)
  {
      // increment our current draw id to the first incomplete draw
      drawEnqueued = GetEnqueuedDraw(pContext);
@@ -402,7 +443,7 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint32_t& curDrawBE,
          if (isWorkComplete)
          {
              curDrawBE++;
-            CompleteDrawContextInl(pContext, pDC);
+            CompleteDrawContextInl(pContext, workerId, pDC);
          }
          else
          {
@@ -442,7 +483,7 @@ bool WorkOnFifoBE(
      // Find the first incomplete draw that has pending work. If no such draw is found then
      // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
      uint32_t drawEnqueued = 0;
-    if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false)
+    if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
      {
          return false;
      }
@@ -537,7 +578,7 @@ bool WorkOnFifoBE(
                  {
                      // We can increment the current BE and safely move to next draw since we know this draw is complete.
                      curDrawBE++;
-                    CompleteDrawContextInl(pContext, pDC);
+                    CompleteDrawContextInl(pContext, workerId, pDC);
  
                      lastRetiredDraw++;
  
@@ -563,13 +604,21 @@ bool WorkOnFifoBE(
  
  //////////////////////////////////////////////////////////////////////////
  /// @brief Called when FE work is complete for this DC.
-INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
+INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
  {
-    _ReadWriteBarrier();
-
-    if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStats)
+    if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStatsFE)
      {
-        pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &pDC->dynState.statsFE);
+        SWR_STATS_FE& stats = pDC->dynState.statsFE;
+
+        AR_EVENT(FrontendStatsEvent(pDC->drawId,
+            stats.IaVertices, stats.IaPrimitives, stats.VsInvocations, stats.HsInvocations,
+            stats.DsInvocations, stats.GsInvocations, stats.GsPrimitives, stats.CInvocations, stats.CPrimitives,
+            stats.SoPrimStorageNeeded[0], stats.SoPrimStorageNeeded[1], stats.SoPrimStorageNeeded[2], stats.SoPrimStorageNeeded[3],
+            stats.SoNumPrimsWritten[0], stats.SoNumPrimsWritten[1], stats.SoNumPrimsWritten[2], stats.SoNumPrimsWritten[3]
+        ));
+               AR_EVENT(FrontendDrawEndEvent(pDC->drawId));
+
+        pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &stats);
      }
  
      if (pContext->pfnUpdateSoWriteOffset)
@@ -584,6 +633,8 @@ INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
          }
      }
  
+    // Ensure all streaming writes are globally visible before marking this FE done
+    _mm_mfence();
      pDC->doneFE = true;
  
      InterlockedDecrement((volatile LONG*)&pContext->drawsOutstandingFE);
@@ -597,9 +648,9 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE)
      {
          uint32_t dcSlot = curDrawFE % KNOB_MAX_DRAWS_IN_FLIGHT;
          DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
-        if (pDC->isCompute || pDC->doneFE || pDC->FeLock)
+        if (pDC->isCompute || pDC->doneFE)
          {
-            CompleteDrawContextInl(pContext, pDC);
+            CompleteDrawContextInl(pContext, workerId, pDC);
              curDrawFE++;
          }
          else
@@ -608,6 +659,7 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE)
          }
      }
  
+    uint32_t lastRetiredFE = curDrawFE - 1;
      uint32_t curDraw = curDrawFE;
      while (IDComparesLess(curDraw, drawEnqueued))
      {
@@ -616,13 +668,18 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE)
  
          if (!pDC->isCompute && !pDC->FeLock)
          {
+            if (CheckDependencyFE(pContext, pDC, lastRetiredFE))
+            {
+                return;
+            }
+
              uint32_t initial = InterlockedCompareExchange((volatile uint32_t*)&pDC->FeLock, 1, 0);
              if (initial == 0)
              {
                  // successfully grabbed the DC, now run the FE
                  pDC->FeWork.pfnWork(pContext, pDC, workerId, &pDC->FeWork.desc);
  
-                CompleteDrawFE(pContext, pDC);
+                CompleteDrawFE(pContext, workerId, pDC);
              }
          }
          curDraw++;
@@ -642,7 +699,7 @@ void WorkOnCompute(
      uint32_t& curDrawBE)
  {
      uint32_t drawEnqueued = 0;
-    if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false)
+    if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
      {
          return;
      }
@@ -670,10 +727,12 @@ void WorkOnCompute(
              uint32_t threadGroupId = 0;
              while (queue.getWork(threadGroupId))
              {
-                ProcessComputeBE(pDC, workerId, threadGroupId, pSpillFillBuffer);
-
+                queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer);
                  queue.finishedWork();
              }
+
+            // Ensure all streaming writes are globally visible before moving on to the next draw
+            _mm_mfence();
          }
      }
  }
@@ -750,12 +809,8 @@ DWORD workerThreadMain(LPVOID pData)
                  continue;
              }
  
-            AR_BEGIN(WorkerWaitForThreadEvent, 0);
-
              pContext->FifosNotEmpty.wait(lock);
              lock.unlock();
-
-            AR_END(WorkerWaitForThreadEvent, 0);
          }
  
          if (IsBEThread)
@@ -803,7 +858,11 @@ DWORD workerThreadInit(LPVOID pData)
  }
  template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
  
-void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
+//////////////////////////////////////////////////////////////////////////
+/// @brief Creates thread pool info but doesn't launch threads.
+/// @param pContext - pointer to context
+/// @param pPool - pointer to thread pool object.
+void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
  {
      bindThread(pContext, 0);
  
@@ -949,7 +1008,6 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
              pPool->pThreadData[workerId].htId = 0;
              pPool->pThreadData[workerId].pContext = pContext;
              pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;
-            pPool->pThreads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
  
              pContext->NumBEThreads++;
              pContext->NumFEThreads++;
@@ -990,12 +1048,11 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
                      pPool->pThreadData[workerId].workerId = workerId;
                      pPool->pThreadData[workerId].procGroupId = core.procGroup;
                      pPool->pThreadData[workerId].threadId = core.threadIds[t];
-                    pPool->pThreadData[workerId].numaId = n;
+                    pPool->pThreadData[workerId].numaId = node.numaId;
                      pPool->pThreadData[workerId].coreId = c;
                      pPool->pThreadData[workerId].htId = t;
                      pPool->pThreadData[workerId].pContext = pContext;
  
-                    pPool->pThreads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
                      pContext->NumBEThreads++;
                      pContext->NumFEThreads++;
  
@@ -1003,9 +1060,31 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
                  }
              }
          }
+        SWR_ASSERT(workerId == pContext->NumWorkerThreads);
      }
  }
  
+//////////////////////////////////////////////////////////////////////////
+/// @brief Launches worker threads in thread pool.
+/// @param pContext - pointer to context
+/// @param pPool - pointer to thread pool object.
+void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
+{
+    if (pContext->threadInfo.SINGLE_THREADED)
+    {
+        return;
+    }
+
+    for (uint32_t workerId = 0; workerId < pContext->NumWorkerThreads; ++workerId)
+    {
+        pPool->pThreads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Destroys thread pool.
+/// @param pContext - pointer to context
+/// @param pPool - pointer to thread pool object.
  void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
  {
      if (!pContext->threadInfo.SINGLE_THREADED)