swr: [rasterizer core] remove KNOB_MAX_NUM_THREADS

author Tim Rowley <timothy.o.rowley@intel.com>

Mon, 22 Aug 2016 16:49:48 +0000 (11:49 -0500)

committer Tim Rowley <timothy.o.rowley@intel.com>

Mon, 29 Aug 2016 17:41:58 +0000 (12:41 -0500)
author Tim Rowley <timothy.o.rowley@intel.com>
Mon, 22 Aug 2016 16:49:48 +0000 (11:49 -0500)
committer Tim Rowley <timothy.o.rowley@intel.com>
Mon, 29 Aug 2016 17:41:58 +0000 (12:41 -0500)
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp

index 599d1f4ed9d7d831ccc898357965720915e79a84..7108a83d0d36170dd408c134599a3d42019c53df 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -77,6 +77,15 @@ HANDLE SwrCreateContext(
      pContext->pMacroTileManagerArray = (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
      pContext->pDispatchQueueArray = (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
  
+    for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
+    {
+        pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
+        new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
+        new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();
+
+        pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
+    }
+
      pContext->threadInfo.MAX_WORKER_THREADS        = KNOB_MAX_WORKER_THREADS;
      pContext->threadInfo.MAX_NUMA_NODES            = KNOB_MAX_NUMA_NODES;
      pContext->threadInfo.MAX_CORES_PER_NUMA_NODE   = KNOB_MAX_CORES_PER_NUMA_NODE;
@@ -88,24 +97,12 @@ HANDLE SwrCreateContext(
          pContext->threadInfo = *pCreateInfo->pThreadInfo;
      }
  
-    for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
-    {
-        pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
-        new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
-        new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();
-
-        pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
-    }
-
-    if (!pContext->threadInfo.SINGLE_THREADED)
-    {
-        memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
-        memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
-        new (&pContext->WaitLock) std::mutex();
-        new (&pContext->FifosNotEmpty) std::condition_variable();
+    memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
+    memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
+    new (&pContext->WaitLock) std::mutex();
+    new (&pContext->FifosNotEmpty) std::condition_variable();
  
-        CreateThreadPool(pContext, &pContext->threadPool);
-    }
+    CreateThreadPool(pContext, &pContext->threadPool);
  
      // Calling createThreadPool() above can set SINGLE_THREADED
      if (pContext->threadInfo.SINGLE_THREADED)
@@ -115,6 +112,9 @@ HANDLE SwrCreateContext(
          pContext->NumBEThreads = 1;
      }
  
+    pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads];
+    pContext->pStats = new SWR_STATS[pContext->NumWorkerThreads];
+
      // Allocate scratch space for workers.
      ///@note We could lazily allocate this but its rather small amount of memory.
      for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
@@ -122,12 +122,12 @@ HANDLE SwrCreateContext(
  #if defined(_WIN32)
          uint32_t numaNode = pContext->threadPool.pThreadData ?
              pContext->threadPool.pThreadData[i].numaId : 0;
-        pContext->pScratch[i] = (uint8_t*)VirtualAllocExNuma(
+        pContext->ppScratch[i] = (uint8_t*)VirtualAllocExNuma(
              GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE),
              MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE,
              numaNode);
  #else
-        pContext->pScratch[i] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
+        pContext->ppScratch[i] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
  #endif
      }
  
@@ -166,6 +166,7 @@ void SwrDestroyContext(HANDLE hContext)
      // free the fifos
      for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i)
      {
+        delete [] pContext->dcRing[i].dynState.pStats;
          delete pContext->dcRing[i].pArena;
          delete pContext->dsRing[i].pArena;
          pContext->pMacroTileManagerArray[i].~MacroTileMgr();
@@ -179,12 +180,15 @@ void SwrDestroyContext(HANDLE hContext)
      for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
      {
  #if defined(_WIN32)
-        VirtualFree(pContext->pScratch[i], 0, MEM_RELEASE);
+        VirtualFree(pContext->ppScratch[i], 0, MEM_RELEASE);
  #else
-        AlignedFree(pContext->pScratch[i]);
+        AlignedFree(pContext->ppScratch[i]);
  #endif
      }
  
+    delete [] pContext->ppScratch;
+    delete [] pContext->pStats;
+
      delete(pContext->pHotTileMgr);
  
      pContext->~SWR_CONTEXT();
@@ -352,7 +356,7 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
          pCurDrawContext->threadsDone = 0;
          pCurDrawContext->retireCallback.pfnCallbackFunc = nullptr;
  
-        memset(&pCurDrawContext->dynState, 0, sizeof(pCurDrawContext->dynState));
+        pCurDrawContext->dynState.Reset(pContext->threadPool.numThreads);
  
          // Assign unique drawId for this DC
          pCurDrawContext->drawId = pContext->dcRing.GetHead();
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp

index 7dd6c0db3de2d0d16b95c5b55206607565312d99..0e92ccf2c888a343d9a06bf3c456644752e99c30 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -68,7 +68,7 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup
      csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
      csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
      csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
-    csContext.pTGSM = pContext->pScratch[workerId];
+    csContext.pTGSM = pContext->ppScratch[workerId];
      csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
  
      state.pfnCsFunc(GetPrivateState(pDC), &csContext);
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h

index 0848264f8fadf9d35d4071b32d30eb96fa5f0aab..fe78cd6dc93bbf54ee10ec5533c40ca9a8f242ed 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -368,12 +368,19 @@ struct DRAW_STATE
  
  struct DRAW_DYNAMIC_STATE
  {
+    void Reset(uint32_t numThreads)
+    {
+        SWR_STATS* pSavePtr = pStats;
+        memset(this, 0, sizeof(*this));
+        pStats = pSavePtr;
+        memset(pStats, 0, sizeof(SWR_STATS) * (numThreads ? numThreads : 1));
+    }
      ///@todo Currently assumes only a single FE can do stream output for a draw.
      uint32_t SoWriteOffset[4];
      bool     SoWriteOffsetDirty[4];
  
      SWR_STATS_FE statsFE;   // Only one FE thread per DC.
-    SWR_STATS    stats[KNOB_MAX_NUM_THREADS];
+    SWR_STATS*   pStats;
  };
  
  // Draw Context
@@ -486,10 +493,10 @@ struct SWR_CONTEXT
      PFN_UPDATE_STATS_FE         pfnUpdateStatsFE;
  
      // Global Stats
-    SWR_STATS stats[KNOB_MAX_NUM_THREADS];
+    SWR_STATS* pStats;
  
      // Scratch space for workers.
-    uint8_t* pScratch[KNOB_MAX_NUM_THREADS];
+    uint8_t** ppScratch;
  
      volatile int32_t  drawsOutstandingFE;
  
@@ -501,5 +508,5 @@ struct SWR_CONTEXT
      TileSet singleThreadLockedTiles;
  };
  
-#define UPDATE_STAT(name, count) if (GetApiState(pDC).enableStats) { pDC->dynState.stats[workerId].name += count; }
+#define UPDATE_STAT(name, count) if (GetApiState(pDC).enableStats) { pDC->dynState.pStats[workerId].name += count; }
  #define UPDATE_STAT_FE(name, count) if (GetApiState(pDC).enableStats) { pDC->dynState.statsFE.name += count; }
diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h

index 740613118a6546c3cab25714634539922ed17973..c01ad67f7c40ab0b21564730c758e6b44d576f31 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/knobs.h
+++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h
@@ -92,8 +92,6 @@
  ///////////////////////////////////////////////////////////////////////////////
  // Configuration knobs
  ///////////////////////////////////////////////////////////////////////////////
-#define KNOB_MAX_NUM_THREADS                256 // Supports up to dual-HSW-Xeon.
-
  // Maximum supported number of active vertex buffer streams
  #define KNOB_NUM_STREAMS                    32
  
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp

index 9665f09e2c852feac23daffda0e0d3d3353c44e6..ed03d70a1f931b18812575a66a13629b101dca3c 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -73,14 +73,19 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
      static std::mutex m;
      std::lock_guard<std::mutex> l(m);
  
-    static SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS];
-    DWORD bufSize = sizeof(buffer);
+    DWORD bufSize = 0;
  
-    BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, buffer, &bufSize);
+    BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize);
+    SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER);
+
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize);
+    SWR_ASSERT(pBufferMem);
+
+    ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize);
      SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");
  
-    uint32_t count = bufSize / buffer->Size;
-    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = buffer;
+    uint32_t count = bufSize / pBufferMem->Size;
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem;
  
      for (uint32_t i = 0; i < count; ++i)
      {
@@ -150,6 +155,8 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
          pBuffer = PtrAdd(pBuffer, pBuffer->Size);
      }
  
+    free(pBufferMem);
+
  
  #elif defined(__linux__) || defined (__gnu_linux__)
  
@@ -321,10 +328,10 @@ INLINE void UpdateClientStats(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
      // Sum up stats across all workers before sending to client.
      for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
      {
-        stats.DepthPassCount += dynState.stats[i].DepthPassCount;
+        stats.DepthPassCount += dynState.pStats[i].DepthPassCount;
  
-        stats.PsInvocations  += dynState.stats[i].PsInvocations;
-        stats.CsInvocations  += dynState.stats[i].CsInvocations;
+        stats.PsInvocations  += dynState.pStats[i].PsInvocations;
+        stats.CsInvocations  += dynState.pStats[i].CsInvocations;
      }
  
      pContext->pfnUpdateStats(GetPrivateState(pDC), &stats);
@@ -849,13 +856,6 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
          numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, maxHWThreads);
      }
  
-    if (numThreads > KNOB_MAX_NUM_THREADS)
-    {
-        printf("WARNING: system thread count %u exceeds max %u, "
-            "performance will be degraded\n",
-            numThreads, KNOB_MAX_NUM_THREADS);
-    }
-
      uint32_t numAPIReservedThreads = 1;
  
  
@@ -878,8 +878,8 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
          else
          {
              pPool->numThreads = 0;
-            SET_KNOB(SINGLE_THREADED, true);
-            return;
+            numThreads = 1;
+            pContext->threadInfo.SINGLE_THREADED = true;
          }
      }
      else
@@ -895,6 +895,19 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
          }
      }
  
+    // Initialize DRAW_CONTEXT's per-thread stats
+    for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
+    {
+        pContext->dcRing[dc].dynState.pStats = new SWR_STATS[numThreads];
+        memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads);
+    }
+
+    if (pContext->threadInfo.SINGLE_THREADED)
+    {
+        return;
+    }
+
+
      pPool->numThreads = numThreads;
      pContext->NumWorkerThreads = pPool->numThreads;
  
@@ -902,6 +915,8 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
      pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));
      pPool->numaMask = 0;
  
+    pPool->pThreads = new THREAD_PTR[pPool->numThreads];
+
      if (pContext->threadInfo.MAX_WORKER_THREADS)
      {
          bool bForceBindProcGroup = (numThreads > numThreadsPerProcGroup);
@@ -918,7 +933,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
              pPool->pThreadData[workerId].htId = 0;
              pPool->pThreadData[workerId].pContext = pContext;
              pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;
-            pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
+            pPool->pThreads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
  
              pContext->NumBEThreads++;
              pContext->NumFEThreads++;
@@ -964,7 +979,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
                      pPool->pThreadData[workerId].htId = t;
                      pPool->pThreadData[workerId].pContext = pContext;
  
-                    pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
+                    pPool->pThreads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
                      pContext->NumBEThreads++;
                      pContext->NumFEThreads++;
  
@@ -989,10 +1004,12 @@ void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
          // Wait for threads to finish and destroy them
          for (uint32_t t = 0; t < pPool->numThreads; ++t)
          {
-            pPool->threads[t]->join();
-            delete(pPool->threads[t]);
+            pPool->pThreads[t]->join();
+            delete(pPool->pThreads[t]);
          }
  
+        delete [] pPool->pThreads;
+
          // Clean up data used by threads
          free(pPool->pThreadData);
      }
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h

index 157f46aff703501b9955e61084455238d90f31de..05231c5a38f2567cb4ad95484a3bb4f6ac183af5 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -51,7 +51,7 @@ struct THREAD_DATA
  
  struct THREAD_POOL
  {
-    THREAD_PTR threads[KNOB_MAX_NUM_THREADS];
+    THREAD_PTR* pThreads;
      uint32_t numThreads;
      uint32_t numaMask;
      volatile bool inThreadShutdown;
author	Tim Rowley <timothy.o.rowley@intel.com>
	Mon, 22 Aug 2016 16:49:48 +0000 (11:49 -0500)
committer	Tim Rowley <timothy.o.rowley@intel.com>
	Mon, 29 Aug 2016 17:41:58 +0000 (12:41 -0500)
src/gallium/drivers/swr/rasterizer/core/api.cpp		patch | blob | history
src/gallium/drivers/swr/rasterizer/core/backend.cpp		patch | blob | history
src/gallium/drivers/swr/rasterizer/core/context.h		patch | blob | history
src/gallium/drivers/swr/rasterizer/core/knobs.h		patch | blob | history
src/gallium/drivers/swr/rasterizer/core/threads.cpp		patch | blob | history
src/gallium/drivers/swr/rasterizer/core/threads.h		patch | blob | history