swr: [rasterizer core] Support sparse numa id values on all OSes
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / threads.cpp
index bee1e138002af2d85ffdce24c4a5ed7a5479e50e..e11291bb83e9c9f1a090f9b97607fd7776f01cad 100644 (file)
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -56,6 +56,7 @@ struct Core
 
 struct NumaNode
 {
+    uint32_t          numaId;
     std::vector<Core> cores;
 };
 
@@ -68,17 +69,24 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
 
 #if defined(_WIN32)
 
+    std::vector<KAFFINITY> threadMaskPerProcGroup;
+
     static std::mutex m;
     std::lock_guard<std::mutex> l(m);
 
-    static SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS];
-    DWORD bufSize = sizeof(buffer);
+    DWORD bufSize = 0;
+
+    BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize);
+    SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER);
 
-    BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, buffer, &bufSize);
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize);
+    SWR_ASSERT(pBufferMem);
+
+    ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize);
     SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");
 
-    uint32_t count = bufSize / buffer->Size;
-    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = buffer;
+    uint32_t count = bufSize / pBufferMem->Size;
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem;
 
     for (uint32_t i = 0; i < count; ++i)
     {
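
The dynamic allocation above follows the standard two-call Win32 sizing idiom: probe with a null buffer, expect ERROR_INSUFFICIENT_BUFFER, allocate, then fetch. A minimal standalone sketch of the same idiom; QueryCoreInfo is a hypothetical helper, not driver code:

    #include <windows.h>
    #include <cstdlib>

    // Probe for the required size, allocate, then fetch the records.
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX QueryCoreInfo(DWORD& size)
    {
        size = 0;
        // First call is expected to fail and report the size needed.
        GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &size);
        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER)
        {
            return nullptr;
        }
        auto pInfo = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(size);
        if (pInfo && !GetLogicalProcessorInformationEx(RelationProcessorCore, pInfo, &size))
        {
            free(pInfo);    // second call failed
            pInfo = nullptr;
        }
        return pInfo;       // caller frees; records are variable-length, walk via ->Size
    }
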
@@ -96,20 +104,43 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
             while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
             {
                 // clear mask
-                gmask.Mask &= ~(KAFFINITY(1) << threadId);
+                KAFFINITY threadMask = KAFFINITY(1) << threadId;
+                gmask.Mask &= ~threadMask;
+
+                if (procGroup >= threadMaskPerProcGroup.size())
+                {
+                    threadMaskPerProcGroup.resize(procGroup + 1);
+                }
+
+                if (threadMaskPerProcGroup[procGroup] & threadMask)
+                {
+                    // Already seen this mask.  This means that we are in 32-bit mode and
+                    // have seen more than 32 HW threads for this procGroup.
+                    // Don't use it.
+#if defined(_WIN64)
+                    SWR_ASSERT(false, "Shouldn't get here in 64-bit mode");
+#endif
+                    continue;
+                }
+
+                threadMaskPerProcGroup[procGroup] |= threadMask;
 
                 // Find Numa Node
+                uint32_t numaId = 0;
                 PROCESSOR_NUMBER procNum = {};
                 procNum.Group = WORD(procGroup);
                 procNum.Number = UCHAR(threadId);
 
-                uint32_t numaId = 0;
                 ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
                 SWR_ASSERT(ret);
 
                 // Store data
-                if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1);
+                if (out_nodes.size() <= numaId)
+                {
+                    out_nodes.resize(numaId + 1);
+                }
                 auto& numaNode = out_nodes[numaId];
+                numaNode.numaId = numaId;
 
                 uint32_t coreId = 0;
 
@@ -118,16 +149,6 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
                     numaNode.cores.push_back(Core());
                     pCore = &numaNode.cores.back();
                     pCore->procGroup = procGroup;
-#if !defined(_WIN64)
-                    coreId = (uint32_t)numaNode.cores.size();
-                    if ((coreId * numThreads) >= 32)
-                    {
-                        // Windows doesn't return threadIds >= 32 for a processor group correctly
-                        // when running a 32-bit application.
-                        // Just save -1 as the threadId
-                        threadId = uint32_t(-1);
-                    }
-#endif
                 }
                 pCore->threadIds.push_back(threadId);
                 if (procGroup == 0)
@@ -139,6 +160,8 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
         pBuffer = PtrAdd(pBuffer, pBuffer->Size);
     }
 
+    free(pBufferMem);
+
 
 #elif defined(__linux__) || defined (__gnu_linux__)
 
@@ -157,11 +180,18 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
             if (threadId != uint32_t(-1))
             {
                 // Save information.
-                if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1);
+                if (out_nodes.size() <= numaId)
+                {
+                    out_nodes.resize(numaId + 1);
+                }
+
                 auto& numaNode = out_nodes[numaId];
+                numaNode.numaId = numaId;
-                if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1);
-                auto& core = numaNode.cores[coreId];
+                if (numaNode.cores.size() <= coreId)
+                {
+                    numaNode.cores.resize(coreId + 1);
+                }
 
+                auto& core = numaNode.cores[coreId];
                 core.procGroup = coreId;
                 core.threadIds.push_back(threadId);
 
@@ -189,9 +219,16 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
     if (threadId != uint32_t(-1))
     {
         // Save information.
-        if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1);
+        if (out_nodes.size() <= numaId)
+        {
+            out_nodes.resize(numaId + 1);
+        }
         auto& numaNode = out_nodes[numaId];
-        if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1);
+        numaNode.numaId = numaId;
+        if (numaNode.cores.size() <= coreId)
+        {
+            numaNode.cores.resize(coreId + 1);
+        }
         auto& core = numaNode.cores[coreId];
 
         core.procGroup = coreId;
@@ -199,99 +236,176 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
         out_numThreadsPerProcGroup++;
     }
 
-    for (uint32_t node = 0; node < out_nodes.size(); node++) {
-        auto& numaNode = out_nodes[node];
-        auto it = numaNode.cores.begin();
-        for ( ; it != numaNode.cores.end(); ) {
-            if (it->threadIds.size() == 0)
-                numaNode.cores.erase(it);
-            else
-                ++it;
-        }
-    }
-
 #else
 
 #error Unsupported platform
 
 #endif
+
+    // Prune empty cores and numa nodes
+    for (auto node_it = out_nodes.begin(); node_it != out_nodes.end(); )
+    {
+        // Erase empty cores (first)
+        for (auto core_it = node_it->cores.begin(); core_it != node_it->cores.end(); )
+        {
+            if (core_it->threadIds.size() == 0)
+            {
+                core_it = node_it->cores.erase(core_it);
+            }
+            else
+            {
+                ++core_it;
+            }
+        }
+
+        // Erase empty numa nodes (second)
+        if (node_it->cores.size() == 0)
+        {
+            node_it = out_nodes.erase(node_it);
+        }
+        else
+        {
+            ++node_it;
+        }
+    }
 }
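
Carrying numaId on each node lets the pruning pass compact out_nodes without losing the OS-assigned ids, which is what makes sparse id values work (e.g. a machine exposing NUMA nodes 0 and 2 but not 1). A minimal sketch of that behavior, using simplified stand-ins for the driver's types and the erase-remove form of the same loop:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct Core { std::vector<uint32_t> threadIds; };
    struct NumaNode { uint32_t numaId; std::vector<Core> cores; };

    int main()
    {
        // Ids {0, 2} leave an empty slot at index 1 after the resize-by-id pass.
        std::vector<NumaNode> nodes(3);
        nodes[0] = { 0, { Core{ { 0, 1 } } } };
        nodes[2] = { 2, { Core{ { 2, 3 } } } };

        // Erase-remove equivalent of the iterator-based prune above.
        nodes.erase(std::remove_if(nodes.begin(), nodes.end(),
                        [](const NumaNode& n) { return n.cores.empty(); }),
                    nodes.end());

        assert(nodes.size() == 2);
        assert(nodes[1].numaId == 2);   // sparse id preserved after compaction
        return 0;
    }
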
 
 
-void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=false)
+void bindThread(SWR_CONTEXT* pContext, uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=false)
 {
     // Only bind threads when MAX_WORKER_THREADS isn't set.
-    if (KNOB_MAX_WORKER_THREADS && bindProcGroup == false)
+    if (pContext->threadInfo.MAX_WORKER_THREADS && bindProcGroup == false)
     {
         return;
     }
 
 #if defined(_WIN32)
-    {
-        GROUP_AFFINITY affinity = {};
-        affinity.Group = procGroupId;
+
+    GROUP_AFFINITY affinity = {};
+    affinity.Group = procGroupId;
 
 #if !defined(_WIN64)
-        if (threadId >= 32)
-        {
-            // In a 32-bit process on Windows it is impossible to bind
-            // to logical processors 32-63 within a processor group.
-            // In this case set the mask to 0 and let the system assign
-            // the processor.  Hopefully it will make smart choices.
-            affinity.Mask = 0;
-        }
-        else
+    if (threadId >= 32)
+    {
+        // Hopefully we don't get here.  Logic in CreateThreadPool should prevent this.
+        SWR_REL_ASSERT(false, "Shouldn't get here");
+
+        // In a 32-bit process on Windows it is impossible to bind
+        // to logical processors 32-63 within a processor group.
+        // In this case set the mask to 0 and let the system assign
+        // the processor.  Hopefully it will make smart choices.
+        affinity.Mask = 0;
+    }
+    else
 #endif
+    {
+        // If MAX_WORKER_THREADS is set, only bind to the proc group,
+        // not the individual HW thread.
+        if (!pContext->threadInfo.MAX_WORKER_THREADS)
         {
-            // If KNOB_MAX_WORKER_THREADS is set, only bind to the proc group,
-            // Not the individual HW thread.
-            if (!KNOB_MAX_WORKER_THREADS)
-            {
-                affinity.Mask = KAFFINITY(1) << threadId;
-            }
+            affinity.Mask = KAFFINITY(1) << threadId;
         }
-
-        SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr);
     }
+
+    SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr);
+
 #else
+
     cpu_set_t cpuset;
     pthread_t thread = pthread_self();
     CPU_ZERO(&cpuset);
     CPU_SET(threadId, &cpuset);
 
     pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
+
 #endif
 }
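
Note that the Linux path ignores the pthread_setaffinity_np return value, so a failed bind silently falls back to whatever CPU the scheduler picks. A hedged sketch of the same call with explicit error reporting; PinSelfToCpu is a hypothetical wrapper, not a driver function:

    #define _GNU_SOURCE
    #include <pthread.h>
    #include <sched.h>
    #include <cstdint>
    #include <cstdio>

    // Pin the calling thread to one logical CPU, reporting failures.
    static bool PinSelfToCpu(uint32_t cpu)
    {
        cpu_set_t cpuset;
        CPU_ZERO(&cpuset);
        CPU_SET(cpu, &cpuset);
        int err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
        if (err != 0)
        {
            fprintf(stderr, "affinity bind to cpu %u failed: %d\n", cpu, err);
            return false;   // scheduler keeps choosing; work still proceeds
        }
        return true;
    }
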
 
 INLINE
-uint64_t GetEnqueuedDraw(SWR_CONTEXT *pContext)
+uint32_t GetEnqueuedDraw(SWR_CONTEXT *pContext)
 {
     return pContext->dcRing.GetHead();
 }
 
 INLINE
-DRAW_CONTEXT *GetDC(SWR_CONTEXT *pContext, uint64_t drawId)
+DRAW_CONTEXT *GetDC(SWR_CONTEXT *pContext, uint32_t drawId)
 {
     return &pContext->dcRing[(drawId-1) % KNOB_MAX_DRAWS_IN_FLIGHT];
 }
 
+INLINE
+bool IDComparesLess(uint32_t a, uint32_t b)
+{
+    // Use signed delta to ensure that wrap-around to 0 is correctly handled.
+    int32_t delta = int32_t(a - b);
+    return (delta < 0);
+}
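
The signed-delta trick keeps draw-id ordering correct even after the 32-bit counter wraps past zero, for ids within roughly 2^31 of each other. A quick illustration, assuming IDComparesLess as defined above:

    #include <cassert>
    #include <cstdint>

    void WrapOrderingSketch()
    {
        assert(IDComparesLess(0xFFFFFFFEu, 2u));    // delta = -4: "less", as desired
        assert(!IDComparesLess(2u, 0xFFFFFFFEu));   // delta = +4: "not less"
        // A plain 'a < b' would get both of these backwards near the wrap.
    }
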
+
 // returns true if dependency not met
 INLINE
-bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint64_t lastRetiredDraw)
+bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t lastRetiredDraw)
 {
-    return (pDC->dependency > lastRetiredDraw);
+    return pDC->dependent && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
 }
 
-INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
+bool CheckDependencyFE(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t lastRetiredDraw)
 {
-    int64_t result = InterlockedDecrement64(&pDC->threadsDone);
+    return pDC->dependentFE && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Update client stats.
+INLINE void UpdateClientStats(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
+{
+    if ((pContext->pfnUpdateStats == nullptr) || (GetApiState(pDC).enableStatsBE == false))
+    {
+        return;
+    }
+
+    DRAW_DYNAMIC_STATE& dynState = pDC->dynState;
+    SWR_STATS stats{ 0 };
+
+    // Sum up stats across all workers before sending to client.
+    for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
+    {
+        stats.DepthPassCount += dynState.pStats[i].DepthPassCount;
+
+        stats.PsInvocations  += dynState.pStats[i].PsInvocations;
+        stats.CsInvocations  += dynState.pStats[i].CsInvocations;
+    }
+
+    pContext->pfnUpdateStats(GetPrivateState(pDC), &stats);
+}
+
+INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
+{
+    UpdateClientStats(pContext, workerId, pDC);
+
+    if (pDC->retireCallback.pfnCallbackFunc)
+    {
+        pDC->retireCallback.pfnCallbackFunc(pDC->retireCallback.userData,
+            pDC->retireCallback.userData2,
+            pDC->retireCallback.userData3);
+    }
+}
+
+// inlined-only version
+INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
+{
+    int32_t result = InterlockedDecrement((volatile LONG*)&pDC->threadsDone);
     SWR_ASSERT(result >= 0);
 
     if (result == 0)
     {
+        ExecuteCallbacks(pContext, workerId, pDC);
+
         // Cleanup memory allocations
         pDC->pArena->Reset(true);
-        pDC->pTileMgr->initialize();
+        if (!pDC->isCompute)
+        {
+            pDC->pTileMgr->initialize();
+        }
         if (pDC->cleanupState)
         {
             pDC->pState->pArena->Reset(true);
@@ -305,11 +419,17 @@ INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
     return result;
 }
 
-INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE, uint64_t& drawEnqueued)
+// available to other translation modules
+int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
+{
+    return CompleteDrawContextInl(pContext, 0, pDC);
+}
+
+INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawBE, uint32_t& drawEnqueued)
 {
     // increment our current draw id to the first incomplete draw
     drawEnqueued = GetEnqueuedDraw(pContext);
-    while (curDrawBE < drawEnqueued)
+    while (IDComparesLess(curDrawBE, drawEnqueued))
     {
         DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
 
@@ -323,7 +443,7 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE,
         if (isWorkComplete)
         {
             curDrawBE++;
-            CompleteDrawContext(pContext, pDC);
+            CompleteDrawContextInl(pContext, workerId, pDC);
         }
         else
         {
@@ -332,7 +452,7 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE,
     }
 
     // If there are no more incomplete draws then return false.
-    return (curDrawBE >= drawEnqueued) ? false : true;
+    return IDComparesLess(curDrawBE, drawEnqueued);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -349,23 +469,26 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE,
 ///                      still have work pending in a previous draw. Additionally, the lockedTiles is
 ///                      a heuristic that can steer a worker back to the same macrotile that it had been
 ///                      working on in a previous draw.
-void WorkOnFifoBE(
+/// @returns        true if worker thread should shutdown
+bool WorkOnFifoBE(
     SWR_CONTEXT *pContext,
     uint32_t workerId,
-    uint64_t &curDrawBE,
+    uint32_t &curDrawBE,
     TileSet& lockedTiles,
     uint32_t numaNode,
     uint32_t numaMask)
 {
+    bool bShutdown = false;
+
     // Find the first incomplete draw that has pending work. If no such draw is found then
     // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
-    uint64_t drawEnqueued = 0;
-    if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false)
+    uint32_t drawEnqueued = 0;
+    if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
     {
-        return;
+        return false;
     }
 
-    uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;
+    uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;
 
     // Reset our history for locked tiles. We'll have to re-learn which tiles are locked.
     lockedTiles.clear();
@@ -375,29 +498,31 @@ void WorkOnFifoBE(
     //   2. If we're trying to work on draws after curDrawBE, we are restricted to 
     //      working on those macrotiles that are known to be complete in the prior draw to
     //      maintain order. The locked tiles provide the history to ensure this.
-    for (uint64_t i = curDrawBE; i < drawEnqueued; ++i)
+    for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
     {
         DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];
 
-        if (pDC->isCompute) return; // We don't look at compute work.
+        if (pDC->isCompute) return false; // We don't look at compute work.
 
         // First wait for FE to be finished with this draw. This keeps the threading model simple
         // but if there are lots of bubbles between draws then serializing FE and BE may
         // need to be revisited.
-        if (!pDC->doneFE) return;
+        if (!pDC->doneFE) return false;
         
         // If this draw is dependent on a previous draw then we need to bail.
         if (CheckDependency(pContext, pDC, lastRetiredDraw))
         {
-            return;
+            return false;
         }
 
         // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
-        std::vector<uint32_t> &macroTiles = pDC->pTileMgr->getDirtyTiles();
+        auto &macroTiles = pDC->pTileMgr->getDirtyTiles();
 
-        for (uint32_t tileID : macroTiles)
+        for (auto tile : macroTiles)
         {
-            // Only work on tiles for for this numa node
+            uint32_t tileID = tile->mId;
+
+            // Only work on tiles for this numa node
             uint32_t x, y;
             pDC->pTileMgr->getTileIndices(tileID, x, y);
             if (((x ^ y) & numaMask) != numaNode)
@@ -405,9 +530,7 @@ void WorkOnFifoBE(
                 continue;
             }
 
-            MacroTileQueue &tile = pDC->pTileMgr->getMacroTileQueue(tileID);
-            
-            if (!tile.getNumQueued())
+            if (!tile->getNumQueued())
             {
                 continue;
             }
@@ -418,28 +541,32 @@ void WorkOnFifoBE(
                 continue;
             }
 
-            if (tile.tryLock())
+            if (tile->tryLock())
             {
                 BE_WORK *pWork;
 
-                RDTSC_START(WorkerFoundWork);
+                AR_BEGIN(WorkerFoundWork, pDC->drawId);
 
-                uint32_t numWorkItems = tile.getNumQueued();
+                uint32_t numWorkItems = tile->getNumQueued();
                 SWR_ASSERT(numWorkItems);
 
-                pWork = tile.peek();
+                pWork = tile->peek();
                 SWR_ASSERT(pWork);
                 if (pWork->type == DRAW)
                 {
-                    pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID);
+                    pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, workerId, tileID);
+                }
+                else if (pWork->type == SHUTDOWN)
+                {
+                    bShutdown = true;
                 }
 
-                while ((pWork = tile.peek()) != nullptr)
+                while ((pWork = tile->peek()) != nullptr)
                 {
                     pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
-                    tile.dequeue();
+                    tile->dequeue();
                 }
-                RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId);
+                AR_END(WorkerFoundWork, numWorkItems);
 
                 _ReadWriteBarrier();
 
@@ -447,17 +574,22 @@ void WorkOnFifoBE(
 
                 // Optimization: If the draw is complete and we're the last one to have worked on it then
                 // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
-                if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete())
+                if ((curDrawBE == i) && (bShutdown || pDC->pTileMgr->isWorkComplete()))
                 {
                     // We can increment the current BE and safely move to the next draw since we know this draw is complete.
                     curDrawBE++;
-                    CompleteDrawContext(pContext, pDC);
+                    CompleteDrawContextInl(pContext, workerId, pDC);
 
                     lastRetiredDraw++;
 
                     lockedTiles.clear();
                     break;
                 }
+
+                if (bShutdown)
+                {
+                    break;
+                }
             }
             else
             {
@@ -466,19 +598,59 @@ void WorkOnFifoBE(
             }
         }
     }
+
+    return bShutdown;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Called when FE work is complete for this DC.
+INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
+{
+    if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStatsFE)
+    {
+        SWR_STATS_FE& stats = pDC->dynState.statsFE;
+
+        AR_EVENT(FrontendStatsEvent(pDC->drawId,
+            stats.IaVertices, stats.IaPrimitives, stats.VsInvocations, stats.HsInvocations,
+            stats.DsInvocations, stats.GsInvocations, stats.GsPrimitives, stats.CInvocations, stats.CPrimitives,
+            stats.SoPrimStorageNeeded[0], stats.SoPrimStorageNeeded[1], stats.SoPrimStorageNeeded[2], stats.SoPrimStorageNeeded[3],
+            stats.SoNumPrimsWritten[0], stats.SoNumPrimsWritten[1], stats.SoNumPrimsWritten[2], stats.SoNumPrimsWritten[3]
+        ));
+        AR_EVENT(FrontendDrawEndEvent(pDC->drawId));
+
+        pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &stats);
+    }
+
+    if (pContext->pfnUpdateSoWriteOffset)
+    {
+        for (uint32_t i = 0; i < MAX_SO_BUFFERS; ++i)
+        {
+            if ((pDC->dynState.SoWriteOffsetDirty[i]) &&
+                (pDC->pState->state.soBuffer[i].soWriteEnable))
+            {
+                pContext->pfnUpdateSoWriteOffset(GetPrivateState(pDC), i, pDC->dynState.SoWriteOffset[i]);
+            }
+        }
+    }
+
+    // Ensure all streaming writes are globally visible before marking this FE done
+    _mm_mfence();
+    pDC->doneFE = true;
+
+    InterlockedDecrement((volatile LONG*)&pContext->drawsOutstandingFE);
 }
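
The _mm_mfence() ahead of the doneFE flag is a release-style publication: every FE write must be globally visible before any BE thread can observe the flag. A minimal C++11 sketch of the equivalent pattern with std::atomic, using simplified stand-ins rather than the driver's types:

    #include <atomic>

    struct Draw { int payload = 0; std::atomic<bool> doneFE{ false }; };

    void Produce(Draw& d)
    {
        d.payload = 42;                                     // FE results first...
        d.doneFE.store(true, std::memory_order_release);    // ...then publish
    }

    bool TryConsume(const Draw& d)
    {
        // Acquire pairs with the release: if the flag is seen, payload is too.
        return d.doneFE.load(std::memory_order_acquire) && d.payload == 42;
    }
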
 
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE)
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE)
 {
     // Try to grab the next DC from the ring
-    uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
-    while (curDrawFE < drawEnqueued)
+    uint32_t drawEnqueued = GetEnqueuedDraw(pContext);
+    while (IDComparesLess(curDrawFE, drawEnqueued))
     {
         uint32_t dcSlot = curDrawFE % KNOB_MAX_DRAWS_IN_FLIGHT;
         DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
-        if (pDC->isCompute || pDC->doneFE || pDC->FeLock)
+        if (pDC->isCompute || pDC->doneFE)
         {
-            CompleteDrawContext(pContext, pDC);
+            CompleteDrawContextInl(pContext, workerId, pDC);
             curDrawFE++;
         }
         else
@@ -487,22 +659,27 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE)
         }
     }
 
-    uint64_t curDraw = curDrawFE;
-    while (curDraw < drawEnqueued)
+    uint32_t lastRetiredFE = curDrawFE - 1;
+    uint32_t curDraw = curDrawFE;
+    while (IDComparesLess(curDraw, drawEnqueued))
     {
         uint32_t dcSlot = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT;
         DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
 
         if (!pDC->isCompute && !pDC->FeLock)
         {
+            if (CheckDependencyFE(pContext, pDC, lastRetiredFE))
+            {
+                return;
+            }
+
             uint32_t initial = InterlockedCompareExchange((volatile uint32_t*)&pDC->FeLock, 1, 0);
             if (initial == 0)
             {
                 // successfully grabbed the DC, now run the FE
                 pDC->FeWork.pfnWork(pContext, pDC, workerId, &pDC->FeWork.desc);
 
-                _ReadWriteBarrier();
-                pDC->doneFE = true;
+                CompleteDrawFE(pContext, workerId, pDC);
             }
         }
         curDraw++;
@@ -519,17 +696,17 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE)
 void WorkOnCompute(
     SWR_CONTEXT *pContext,
     uint32_t workerId,
-    uint64_t& curDrawBE)
+    uint32_t& curDrawBE)
 {
-    uint64_t drawEnqueued = 0;
-    if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false)
+    uint32_t drawEnqueued = 0;
+    if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
     {
         return;
     }
 
-    uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;
+    uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;
 
-    for (uint64_t i = curDrawBE; curDrawBE < drawEnqueued; ++i)
+    for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
     {
         DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];
         if (pDC->isCompute == false) return;
@@ -546,13 +723,16 @@ void WorkOnCompute(
         // Is there any work remaining?
         if (queue.getNumQueued() > 0)
         {
+            void* pSpillFillBuffer = nullptr;
             uint32_t threadGroupId = 0;
             while (queue.getWork(threadGroupId))
             {
-                ProcessComputeBE(pDC, workerId, threadGroupId);
-
+                queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer);
                 queue.finishedWork();
             }
+
+            // Ensure all streaming writes are globally visible before moving onto the next draw
+            _mm_mfence();
         }
     }
 }
@@ -565,7 +745,7 @@ DWORD workerThreadMain(LPVOID pData)
     uint32_t threadId = pThreadData->threadId;
     uint32_t workerId = pThreadData->workerId;
 
-    bindThread(threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup); 
+    bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup); 
 
     RDTSC_INIT(threadId);
 
@@ -598,13 +778,20 @@ DWORD workerThreadMain(LPVOID pData)
     //    the worker can safely increment its oldestDraw counter and move on to the next draw.
     std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);
 
-    auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };
+    auto threadHasWork = [&](uint32_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };
+
+    uint32_t curDrawBE = 0;
+    uint32_t curDrawFE = 0;
 
-    uint64_t curDrawBE = 0;
-    uint64_t curDrawFE = 0;
+    bool bShutdown = false;
 
-    while (pContext->threadPool.inThreadShutdown == false)
+    while (true)
     {
+        if (bShutdown && !threadHasWork(curDrawBE))
+        {
+            break;
+        }
+
         uint32_t loop = 0;
         while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
         {
@@ -622,30 +809,15 @@ DWORD workerThreadMain(LPVOID pData)
                 continue;
             }
 
-            if (pContext->threadPool.inThreadShutdown)
-            {
-                lock.unlock();
-                break;
-            }
-
-            RDTSC_START(WorkerWaitForThreadEvent);
-
             pContext->FifosNotEmpty.wait(lock);
             lock.unlock();
-
-            RDTSC_STOP(WorkerWaitForThreadEvent, 0, 0);
-
-            if (pContext->threadPool.inThreadShutdown)
-            {
-                break;
-            }
         }
 
         if (IsBEThread)
         {
-            RDTSC_START(WorkerWorkOnFifoBE);
-            WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
-            RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);
+            AR_BEGIN(WorkerWorkOnFifoBE, 0);
+            bShutdown |= WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
+            AR_END(WorkerWorkOnFifoBE, 0);
 
             WorkOnCompute(pContext, workerId, curDrawBE);
         }
@@ -686,9 +858,13 @@ DWORD workerThreadInit(LPVOID pData)
 }
 template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
 
-void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
+//////////////////////////////////////////////////////////////////////////
+/// @brief Creates thread pool info but doesn't launch threads.
+/// @param pContext - pointer to context
+/// @param pPool - pointer to thread pool object.
+void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
 {
-    bindThread(0);
+    bindThread(pContext, 0);
 
     CPUNumaNodes nodes;
     uint32_t numThreadsPerProcGroup = 0;
@@ -698,54 +874,55 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
     uint32_t numHWCoresPerNode  = (uint32_t)nodes[0].cores.size();
     uint32_t numHWHyperThreads  = (uint32_t)nodes[0].cores[0].threadIds.size();
 
-    uint32_t numNodes           = numHWNodes;
-    uint32_t numCoresPerNode    = numHWCoresPerNode;
-    uint32_t numHyperThreads    = numHWHyperThreads;
-
-    if (KNOB_MAX_WORKER_THREADS)
+    // Calculate num HW threads.  Due to asymmetric topologies, this is not
+    // a trivial multiplication.
+    uint32_t numHWThreads = 0;
+    for (auto& node : nodes)
     {
-        SET_KNOB(HYPERTHREADED_FE, false);
+        for (auto& core : node.cores)
+        {
+            numHWThreads += (uint32_t)core.threadIds.size();
+        }
     }
 
-    if (KNOB_HYPERTHREADED_FE)
-    {
-        SET_KNOB(MAX_THREADS_PER_CORE, 0);
-    }
+    uint32_t numNodes           = numHWNodes;
+    uint32_t numCoresPerNode    = numHWCoresPerNode;
+    uint32_t numHyperThreads    = numHWHyperThreads;
 
-    if (KNOB_MAX_NUMA_NODES)
+    if (pContext->threadInfo.MAX_NUMA_NODES)
     {
-        numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES);
+        numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
     }
 
-    if (KNOB_MAX_CORES_PER_NUMA_NODE)
+    if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
     {
-        numCoresPerNode = std::min(numCoresPerNode, KNOB_MAX_CORES_PER_NUMA_NODE);
+        numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
     }
 
-    if (KNOB_MAX_THREADS_PER_CORE)
+    if (pContext->threadInfo.MAX_THREADS_PER_CORE)
     {
-        numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE);
+        numHyperThreads = std::min(numHyperThreads, pContext->threadInfo.MAX_THREADS_PER_CORE);
     }
 
-    if (numHyperThreads < 2)
+#if defined(_WIN32) && !defined(_WIN64)
+    if (!pContext->threadInfo.MAX_WORKER_THREADS)
     {
-        SET_KNOB(HYPERTHREADED_FE, false);
+        // Limit 32-bit Windows to bindable HW threads only
+        if ((numCoresPerNode * numHWHyperThreads) > 32)
+        {
+            numCoresPerNode = 32 / numHWHyperThreads;
+        }
     }
+#endif
 
     // Calculate numThreads
     uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
+    numThreads = std::min(numThreads, numHWThreads);
 
-    if (KNOB_MAX_WORKER_THREADS)
+    if (pContext->threadInfo.MAX_WORKER_THREADS)
     {
         uint32_t maxHWThreads = numHWNodes * numHWCoresPerNode * numHWHyperThreads;
-        numThreads = std::min(KNOB_MAX_WORKER_THREADS, maxHWThreads);
-    }
-
-    if (numThreads > KNOB_MAX_NUM_THREADS)
-    {
-        printf("WARNING: system thread count %u exceeds max %u, "
-            "performance will be degraded\n",
-            numThreads, KNOB_MAX_NUM_THREADS);
+        numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, maxHWThreads);
     }
 
     uint32_t numAPIReservedThreads = 1;
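
Because topologies can be asymmetric, sizing from nodes[0] alone can overshoot the real thread count, which is why numThreads is clamped against the separately counted numHWThreads. A worked example under assumed values:

    #include <algorithm>
    #include <cstdint>

    // Hypothetical machine: node 0 has 4 cores, node 1 has 2, all 2-way SMT.
    uint32_t ComputeWorkerCount()
    {
        uint32_t numThreads   = 2u * 4u * 2u;       // nodes * node0 cores * SMT = 16
        uint32_t numHWThreads = (4u + 2u) * 2u;     // counted core by core = 12
        return std::min(numThreads, numHWThreads);  // clamp avoids oversubscription
    }
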
@@ -769,9 +946,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
         }
         else
         {
-            pPool->numThreads = 0;
-            SET_KNOB(SINGLE_THREADED, true);
-            return;
+            pContext->threadInfo.SINGLE_THREADED = true;
         }
     }
     else
@@ -787,14 +962,37 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
         }
     }
 
+    if (pContext->threadInfo.SINGLE_THREADED)
+    {
+        numThreads = 1;
+    }
+
+    // Initialize DRAW_CONTEXT's per-thread stats
+    for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
+    {
+        pContext->dcRing[dc].dynState.pStats = new SWR_STATS[numThreads];
+        memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads);
+    }
+
+    if (pContext->threadInfo.SINGLE_THREADED)
+    {
+        pContext->NumWorkerThreads = 1;
+        pContext->NumFEThreads = 1;
+        pContext->NumBEThreads = 1;
+        pPool->numThreads = 0;
+
+        return;
+    }
+
     pPool->numThreads = numThreads;
     pContext->NumWorkerThreads = pPool->numThreads;
 
-    pPool->inThreadShutdown = false;
     pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));
     pPool->numaMask = 0;
 
-    if (KNOB_MAX_WORKER_THREADS)
+    pPool->pThreads = new THREAD_PTR[pPool->numThreads];
+
+    if (pContext->threadInfo.MAX_WORKER_THREADS)
     {
         bool bForceBindProcGroup = (numThreads > numThreadsPerProcGroup);
         uint32_t numProcGroups = (numThreads + numThreadsPerProcGroup - 1) / numThreadsPerProcGroup;
@@ -810,7 +1008,6 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
             pPool->pThreadData[workerId].htId = 0;
             pPool->pThreadData[workerId].pContext = pContext;
             pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;
-            pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
 
             pContext->NumBEThreads++;
             pContext->NumFEThreads++;
@@ -824,76 +1021,88 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
         for (uint32_t n = 0; n < numNodes; ++n)
         {
             auto& node = nodes[n];
-            if (node.cores.size() == 0)
-            {
-               continue;
-            }
-
             uint32_t numCores = numCoresPerNode;
             for (uint32_t c = 0; c < numCores; ++c)
             {
+                if (c >= node.cores.size())
+                {
+                    break;
+                }
+
                 auto& core = node.cores[c];
                 for (uint32_t t = 0; t < numHyperThreads; ++t)
                 {
+                    if (t >= core.threadIds.size())
+                    {
+                        break;
+                    }
+
                     if (numAPIReservedThreads)
                     {
                         --numAPIReservedThreads;
                         continue;
                     }
 
+                    SWR_ASSERT(workerId < numThreads);
+
                     pPool->pThreadData[workerId].workerId = workerId;
                     pPool->pThreadData[workerId].procGroupId = core.procGroup;
                     pPool->pThreadData[workerId].threadId = core.threadIds[t];
-                    pPool->pThreadData[workerId].numaId = n;
+                    pPool->pThreadData[workerId].numaId = node.numaId;
                     pPool->pThreadData[workerId].coreId = c;
                     pPool->pThreadData[workerId].htId = t;
                     pPool->pThreadData[workerId].pContext = pContext;
 
-                    if (KNOB_HYPERTHREADED_FE)
-                    {
-                        if (t == 0)
-                        {
-                            pContext->NumBEThreads++;
-                            pPool->threads[workerId] = new std::thread(workerThreadInit<false, true>, &pPool->pThreadData[workerId]);
-                        }
-                        else
-                        {
-                            pContext->NumFEThreads++;
-                            pPool->threads[workerId] = new std::thread(workerThreadInit<true, false>, &pPool->pThreadData[workerId]);
-                        }
-                    }
-                    else
-                    {
-                        pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
-                        pContext->NumBEThreads++;
-                        pContext->NumFEThreads++;
-                    }
+                    pContext->NumBEThreads++;
+                    pContext->NumFEThreads++;
 
                     ++workerId;
                 }
             }
         }
+        SWR_ASSERT(workerId == pContext->NumWorkerThreads);
     }
 }
 
+//////////////////////////////////////////////////////////////////////////
+/// @brief Launches worker threads in thread pool.
+/// @param pContext - pointer to context
+/// @param pPool - pointer to thread pool object.
+void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
+{
+    if (pContext->threadInfo.SINGLE_THREADED)
+    {
+        return;
+    }
+
+    for (uint32_t workerId = 0; workerId < pContext->NumWorkerThreads; ++workerId)
+    {
+        pPool->pThreads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Destroys thread pool.
+/// @param pContext - pointer to context
+/// @param pPool - pointer to thread pool object.
 void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
 {
-    if (!KNOB_SINGLE_THREADED)
+    if (!pContext->threadInfo.SINGLE_THREADED)
     {
-        // Inform threads to finish up
-        std::unique_lock<std::mutex> lock(pContext->WaitLock);
-        pPool->inThreadShutdown = true;
-        _mm_mfence();
-        pContext->FifosNotEmpty.notify_all();
-        lock.unlock();
+        // Wait for all threads to finish
+        SwrWaitForIdle(pContext);
 
         // Wait for threads to finish and destroy them
         for (uint32_t t = 0; t < pPool->numThreads; ++t)
         {
-            pPool->threads[t]->join();
-            delete(pPool->threads[t]);
+            // Detach from thread.  Cannot join() due to possibility (in Windows) of code
+            // in some DllMain (DLL_THREAD_DETACH case) blocking the thread until after this returns.
+            pPool->pThreads[t]->detach();
+            delete(pPool->pThreads[t]);
         }
 
+        delete [] pPool->pThreads;
+
         // Clean up data used by threads
         free(pPool->pThreadData);
     }