swr: [rasterizer core] Add experimental support for hyper-threaded front-end

author Tim Rowley <timothy.o.rowley@intel.com>

Wed, 30 Mar 2016 20:59:40 +0000 (14:59 -0600)

committer Tim Rowley <timothy.o.rowley@intel.com>

Tue, 12 Apr 2016 16:52:05 +0000 (11:52 -0500)
author Tim Rowley <timothy.o.rowley@intel.com>
Wed, 30 Mar 2016 20:59:40 +0000 (14:59 -0600)
committer Tim Rowley <timothy.o.rowley@intel.com>
Tue, 12 Apr 2016 16:52:05 +0000 (11:52 -0500)
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp

index 9e13ee142a5f7332707191fabfc5f759e4d73a93..665b6c0453f2a9df50be45f3bc60f5d5e233df81 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -87,7 +87,10 @@ HANDLE SwrCreateContext(
      // Calling createThreadPool() above can set SINGLE_THREADED
      if (KNOB_SINGLE_THREADED)
      {
+        SET_KNOB(HYPERTHREADED_FE, false);
          pContext->NumWorkerThreads = 1;
+        pContext->NumFEThreads = 1;
+        pContext->NumBEThreads = 1;
      }
  
      // Allocate scratch space for workers.
@@ -177,8 +180,7 @@ void QueueWork(SWR_CONTEXT *pContext)
      // multiply threadDone by 2.  When the threadDone counter has reached 0 then all workers
      // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
      // then moved on if all work is done.)
-    pContext->pCurDrawContext->threadsDone =
-        pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2;
+    pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads;
  
      _ReadWriteBarrier();
      {
@@ -196,7 +198,7 @@ void QueueWork(SWR_CONTEXT *pContext)
          {
              static TileSet lockedTiles;
              uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
-            WorkOnFifoFE(pContext, 0, curDraw[0], 0);
+            WorkOnFifoFE(pContext, 0, curDraw[0]);
              WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0);
          }
          else
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h

index 27abe437718486c8291dcbb90916be7bdadd13b2..2c28286b5ad5eb28a2f88963f61eb4bf27f8316c 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -382,32 +382,28 @@ struct DRAW_STATE
  //    This draw context maintains all of the state needed for the draw operation.
  struct DRAW_CONTEXT
  {
-    SWR_CONTEXT *pContext;
+    SWR_CONTEXT*    pContext;
+    uint64_t        drawId;
+    MacroTileMgr*   pTileMgr;
+    DispatchQueue*  pDispatch;      // Queue for thread groups. (isCompute)
+    uint64_t        dependency;
+    DRAW_STATE*     pState;
+    CachingArena*   pArena;
  
-    uint64_t drawId;
+    bool            isCompute;      // Is this DC a compute context?
+    bool            cleanupState;   // True if this is the last draw using an entry in the state ring.
+    volatile bool   doneFE;         // Is FE work done for this draw?
  
-    bool isCompute;    // Is this DC a compute context?
+    volatile OSALIGNLINE(uint32_t)   FeLock;
+    volatile int64_t    threadsDone;
  
-    FE_WORK FeWork;
-    volatile OSALIGNLINE(uint32_t) FeLock;
-    volatile OSALIGNLINE(bool) doneFE;    // Is FE work done for this draw?
-    volatile OSALIGNLINE(int64_t) threadsDone;
+    OSALIGNLINE(FE_WORK) FeWork;
+    uint8_t*        pSpillFill[KNOB_MAX_NUM_THREADS];  // Scratch space used for spill fills.
  
-    uint64_t dependency;
-
-    MacroTileMgr* pTileMgr;
-
-    // The following fields are valid if isCompute is true.
-    DispatchQueue* pDispatch;               // Queue for thread groups. (isCompute)
-
-    DRAW_STATE* pState;
-    CachingArena* pArena;
-
-    uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS];  // Scratch space used for spill fills.
-
-    bool  cleanupState; // True if this is the last draw using an entry in the state ring.
  };
  
+static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
+
  INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
  {
      SWR_ASSERT(pDC != nullptr);
@@ -459,6 +455,8 @@ struct SWR_CONTEXT
      uint32_t curStateId;               // Current index to the next available entry in the DS ring.
  
      uint32_t NumWorkerThreads;
+    uint32_t NumFEThreads;
+    uint32_t NumBEThreads;
  
      THREAD_POOL threadPool; // Thread pool associated with this context
  
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp

index 056003e467c5073397b4181c32933e41e91c1b06..bee1e138002af2d85ffdce24c4a5ed7a5479e50e 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -305,10 +305,10 @@ INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
      return result;
  }
  
-INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
+INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE, uint64_t& drawEnqueued)
  {
      // increment our current draw id to the first incomplete draw
-    uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
+    drawEnqueued = GetEnqueuedDraw(pContext);
      while (curDrawBE < drawEnqueued)
      {
          DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
@@ -316,8 +316,9 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
          // If its not compute and FE is not done then break out of loop.
          if (!pDC->doneFE && !pDC->isCompute) break;
  
-        bool isWorkComplete = (pDC->isCompute) ?
-            pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete();
+        bool isWorkComplete = pDC->isCompute ?
+            pDC->pDispatch->isWorkComplete() :
+            pDC->pTileMgr->isWorkComplete();
  
          if (isWorkComplete)
          {
@@ -358,7 +359,8 @@ void WorkOnFifoBE(
  {
      // Find the first incomplete draw that has pending work. If no such draw is found then
      // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
-    if (FindFirstIncompleteDraw(pContext, curDrawBE) == false)
+    uint64_t drawEnqueued = 0;
+    if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false)
      {
          return;
      }
@@ -373,7 +375,7 @@ void WorkOnFifoBE(
      //   2. If we're trying to work on draws after curDrawBE, we are restricted to 
      //      working on those macrotiles that are known to be complete in the prior draw to
      //      maintain order. The locked tiles provides the history to ensures this.
-    for (uint64_t i = curDrawBE; i < GetEnqueuedDraw(pContext); ++i)
+    for (uint64_t i = curDrawBE; i < drawEnqueued; ++i)
      {
          DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];
  
@@ -466,7 +468,7 @@ void WorkOnFifoBE(
      }
  }
  
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode)
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE)
  {
      // Try to grab the next DC from the ring
      uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
@@ -519,38 +521,43 @@ void WorkOnCompute(
      uint32_t workerId,
      uint64_t& curDrawBE)
  {
-    if (FindFirstIncompleteDraw(pContext, curDrawBE) == false)
+    uint64_t drawEnqueued = 0;
+    if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false)
      {
          return;
      }
  
      uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;
  
-    DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
-    if (pDC->isCompute == false) return;
-
-    // check dependencies
-    if (CheckDependency(pContext, pDC, lastRetiredDraw))
+    for (uint64_t i = curDrawBE; i < drawEnqueued; ++i) // bound by i: curDrawBE does not advance inside this loop
      {
-        return;
-    }
+        DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];
+        if (pDC->isCompute == false) return;
+
+        // check dependencies
+        if (CheckDependency(pContext, pDC, lastRetiredDraw))
+        {
+            return;
+        }
  
-    SWR_ASSERT(pDC->pDispatch != nullptr);
-    DispatchQueue& queue = *pDC->pDispatch;
+        SWR_ASSERT(pDC->pDispatch != nullptr);
+        DispatchQueue& queue = *pDC->pDispatch;
  
-    // Is there any work remaining?
-    if (queue.getNumQueued() > 0)
-    {
-        uint32_t threadGroupId = 0;
-        while (queue.getWork(threadGroupId))
+        // Is there any work remaining?
+        if (queue.getNumQueued() > 0)
          {
-            ProcessComputeBE(pDC, workerId, threadGroupId);
+            uint32_t threadGroupId = 0;
+            while (queue.getWork(threadGroupId))
+            {
+                ProcessComputeBE(pDC, workerId, threadGroupId);
  
-            queue.finishedWork();
+                queue.finishedWork();
+            }
          }
      }
  }
  
+template<bool IsFEThread, bool IsBEThread>
  DWORD workerThreadMain(LPVOID pData)
  {
      THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
@@ -634,25 +641,38 @@ DWORD workerThreadMain(LPVOID pData)
              }
          }
  
-        RDTSC_START(WorkerWorkOnFifoBE);
-        WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
-        RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);
+        if (IsBEThread)
+        {
+            RDTSC_START(WorkerWorkOnFifoBE);
+            WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
+            RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);
  
-        WorkOnCompute(pContext, workerId, curDrawBE);
+            WorkOnCompute(pContext, workerId, curDrawBE);
+        }
+
+        if (IsFEThread)
+        {
+            WorkOnFifoFE(pContext, workerId, curDrawFE);
  
-        WorkOnFifoFE(pContext, workerId, curDrawFE, numaNode);
+            if (!IsBEThread)
+            {
+                curDrawBE = curDrawFE;
+            }
+        }
      }
  
      return 0;
  }
+template<> DWORD workerThreadMain<false, false>(LPVOID) = delete;
  
+template <bool IsFEThread, bool IsBEThread>
  DWORD workerThreadInit(LPVOID pData)
  {
  #if defined(_WIN32)
      __try
  #endif // _WIN32
      {
-        return workerThreadMain(pData);
+        return workerThreadMain<IsFEThread, IsBEThread>(pData);
      }
  
  #if defined(_WIN32)
@@ -664,6 +684,7 @@ DWORD workerThreadInit(LPVOID pData)
  
      return 1;
  }
+template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
  
  void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
  {
@@ -681,6 +702,16 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
      uint32_t numCoresPerNode    = numHWCoresPerNode;
      uint32_t numHyperThreads    = numHWHyperThreads;
  
+    if (KNOB_MAX_WORKER_THREADS)
+    {
+        SET_KNOB(HYPERTHREADED_FE, false);
+    }
+
+    if (KNOB_HYPERTHREADED_FE)
+    {
+        SET_KNOB(MAX_THREADS_PER_CORE, 0);
+    }
+
      if (KNOB_MAX_NUMA_NODES)
      {
          numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES);
@@ -696,6 +727,11 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
          numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE);
      }
  
+    if (numHyperThreads < 2)
+    {
+        SET_KNOB(HYPERTHREADED_FE, false);
+    }
+
      // Calculate numThreads
      uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
  
@@ -770,9 +806,14 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
              pPool->pThreadData[workerId].procGroupId = workerId % numProcGroups;
              pPool->pThreadData[workerId].threadId = 0;
              pPool->pThreadData[workerId].numaId = 0;
+            pPool->pThreadData[workerId].coreId = 0;
+            pPool->pThreadData[workerId].htId = 0;
              pPool->pThreadData[workerId].pContext = pContext;
              pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;
-            pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]);
+            pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
+
+            pContext->NumBEThreads++;
+            pContext->NumFEThreads++;
          }
      }
      else
@@ -804,8 +845,29 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
                      pPool->pThreadData[workerId].procGroupId = core.procGroup;
                      pPool->pThreadData[workerId].threadId = core.threadIds[t];
                      pPool->pThreadData[workerId].numaId = n;
+                    pPool->pThreadData[workerId].coreId = c;
+                    pPool->pThreadData[workerId].htId = t;
                      pPool->pThreadData[workerId].pContext = pContext;
-                    pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]);
+
+                    if (KNOB_HYPERTHREADED_FE)
+                    {
+                        if (t == 0)
+                        {
+                            pContext->NumBEThreads++;
+                            pPool->threads[workerId] = new std::thread(workerThreadInit<false, true>, &pPool->pThreadData[workerId]);
+                        }
+                        else
+                        {
+                            pContext->NumFEThreads++;
+                            pPool->threads[workerId] = new std::thread(workerThreadInit<true, false>, &pPool->pThreadData[workerId]);
+                        }
+                    }
+                    else
+                    {
+                        pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
+                        pContext->NumBEThreads++;
+                        pContext->NumFEThreads++;
+                    }
  
                      ++workerId;
                  }
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h

index 821d7dcb16e857ea427161f8f200ea02a6ab0906..3aba6323a95947c6666a94803becb4ad0ea681f2 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -41,6 +41,8 @@ struct THREAD_DATA
      uint32_t procGroupId;   // Will always be 0 for non-Windows OS
      uint32_t threadId;      // within the procGroup for Windows
      uint32_t numaId;        // NUMA node id
+    uint32_t coreId;        // Core id
+    uint32_t htId;          // Hyperthread id
      uint32_t workerId;
      SWR_CONTEXT *pContext;
      bool forceBindProcGroup; // Only useful when KNOB_MAX_WORKER_THREADS is set.
@@ -62,7 +64,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
  void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
  
  // Expose FE and BE worker functions to the API thread if single threaded
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode);
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE);
  void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask);
  void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE);
  int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
 \ No newline at end of file
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py

index 0f3ded6854469de1f8f762b295f30ae3e6836950..3832b91d93eedbd7937752b34255b0776dbb222c 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
@@ -30,6 +30,18 @@ KNOBS = [
          'category'  : 'debug',
      }],
  
+    ['HYPERTHREADED_FE', {
+        'type'      : 'bool',
+        'default'   : 'false',
+        'desc'      : ['EXPERIMENTAL!!',
+                       'If enabled will attempt to use secondary threads per core to perform',
+                       'front-end (VS/GS) work.',
+                       '',
+                       'Note: Setting this will cause KNOB_MAX_THREADS_PER_CORE to be ignored.'],
+        'category'  : 'perf',
+        'advanced'  : 'true',
+    }],
+
      ['DUMP_SHADER_IR', {
          'type'      : 'bool',
          'default'   : 'false',
@@ -166,6 +178,7 @@ KNOBS = [
                         '',
                         'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
          'category'  : 'perf',
+        'advanced'  : 'true',
      }],
  
      ['TOSS_FETCH', {
@@ -175,6 +188,7 @@ KNOBS = [
                         '',
                         'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
          'category'  : 'perf',
+        'advanced'  : 'true',
      }],
  
      ['TOSS_IA', {
@@ -184,6 +198,7 @@ KNOBS = [
                         '',
                         'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
          'category'  : 'perf',
+        'advanced'  : 'true',
      }],
  
      ['TOSS_VS', {
@@ -193,6 +208,7 @@ KNOBS = [
                         '',
                         'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
          'category'  : 'perf',
+        'advanced'  : 'true',
      }],
  
      ['TOSS_SETUP_TRIS', {
@@ -202,6 +218,7 @@ KNOBS = [
                         '',
                         'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
          'category'  : 'perf',
+        'advanced'  : 'true',
      }],
  
      ['TOSS_BIN_TRIS', {
@@ -211,6 +228,7 @@ KNOBS = [
                         '',
                         'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
          'category'  : 'perf',
+        'advanced'  : 'true',
      }],
  
      ['TOSS_RS', {
@@ -220,4 +238,5 @@ KNOBS = [
                         '',
                         'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
          'category'  : 'perf',
+        'advanced'  : 'true',
      }],]
author	Tim Rowley <timothy.o.rowley@intel.com>
	Wed, 30 Mar 2016 20:59:40 +0000 (14:59 -0600)
committer	Tim Rowley <timothy.o.rowley@intel.com>
	Tue, 12 Apr 2016 16:52:05 +0000 (11:52 -0500)
src/gallium/drivers/swr/rasterizer/core/api.cpp		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/core/context.h		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/core/threads.cpp		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/core/threads.h		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py		patch \| blob \| history