swr: [rasterizer core] Put DRAW_CONTEXT on a diet

author Tim Rowley <timothy.o.rowley@intel.com>

Wed, 30 Mar 2016 21:54:48 +0000 (15:54 -0600)

committer Tim Rowley <timothy.o.rowley@intel.com>

Tue, 12 Apr 2016 16:52:05 +0000 (11:52 -0500)
author Tim Rowley <timothy.o.rowley@intel.com>
Wed, 30 Mar 2016 21:54:48 +0000 (15:54 -0600)
committer Tim Rowley <timothy.o.rowley@intel.com>
Tue, 12 Apr 2016 16:52:05 +0000 (11:52 -0500)
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp

index 665b6c0453f2a9df50be45f3bc60f5d5e233df81..d0738a7e2e09894757117f98b2138106dbbccc9f 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -29,6 +29,7 @@
  #include <cfloat>
  #include <cmath>
  #include <cstdio>
+#include <new>
  
  #include "core/api.h"
  #include "core/backend.h"
@@ -65,11 +66,14 @@ HANDLE SwrCreateContext(
      pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
      pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
  
+    pContext->pMacroTileManagerArray = (MacroTileMgr*)_aligned_malloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
+    pContext->pDispatchQueueArray = (DispatchQueue*)_aligned_malloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
+
      for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
      {
          pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
-        pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena));
-        pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
+        new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
+        new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();
  
          pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
      }
@@ -143,10 +147,13 @@ void SwrDestroyContext(HANDLE hContext)
      {
          delete pContext->dcRing[i].pArena;
          delete pContext->dsRing[i].pArena;
-        delete(pContext->dcRing[i].pTileMgr);
-        delete(pContext->dcRing[i].pDispatch);
+        pContext->pMacroTileManagerArray[i].~MacroTileMgr();
+        pContext->pDispatchQueueArray[i].~DispatchQueue();
      }
  
+    _aligned_free(pContext->pDispatchQueueArray);
+    _aligned_free(pContext->pMacroTileManagerArray);
+
      // Free scratch space.
      for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
      {
@@ -176,6 +183,15 @@ void WakeAllThreads(SWR_CONTEXT *pContext)
  template<bool IsDraw>
  void QueueWork(SWR_CONTEXT *pContext)
  {
+    DRAW_CONTEXT* pDC = pContext->pCurDrawContext;
+    uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
+
+    if (IsDraw)
+    {
+        pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex];
+        pDC->pTileMgr->initialize();
+    }
+
      // Each worker thread looks at a DC for both FE and BE work at different times and so we
      // multiply threadDone by 2.  When the threadDone counter has reached 0 then all workers
      // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
@@ -299,8 +315,6 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
          pCurDrawContext->FeLock = 0;
          pCurDrawContext->threadsDone = 0;
  
-        pCurDrawContext->pTileMgr->initialize();
-
          // Assign unique drawId for this DC
          pCurDrawContext->drawId = pContext->dcRing.GetHead();
  
@@ -1368,9 +1382,6 @@ void SwrDispatch(
  
      pDC->isCompute = true;      // This is a compute context.
  
-    // Ensure spill fill pointers are initialized to nullptr.
-    memset(pDC->pSpillFill, 0, sizeof(pDC->pSpillFill));
-
      COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
  
      pTaskData->threadGroupCountX = threadGroupCountX;
@@ -1378,6 +1389,8 @@ void SwrDispatch(
      pTaskData->threadGroupCountZ = threadGroupCountZ;
  
      uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
+    uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
+    pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex];
      pDC->pDispatch->initialize(totalThreadGroups, pTaskData);
  
      QueueDispatch(pContext);
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp

index 842ea326e68229490c409c1a7984a5328428b8d3..b2d3d9ef4f4d9e1fe4ea216c0f6039d7138861cb 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -70,7 +70,7 @@ static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
  /// @param pDC - pointer to draw context (dispatch).
  /// @param workerId - The unique worker ID that is assigned to this thread.
  /// @param threadGroupId - the linear index for the thread group within the dispatch.
-void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId)
+void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
  {
      RDTSC_START(BEDispatch);
  
@@ -80,10 +80,10 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup
      SWR_ASSERT(pTaskData != nullptr);
  
      // Ensure spill fill memory has been allocated.
-    if (pDC->pSpillFill[workerId] == nullptr)
+    if (pSpillFillBuffer == nullptr)
      {
          ///@todo Add state which indicates the spill fill size.
-        pDC->pSpillFill[workerId] = (uint8_t*)pDC->pArena->AllocAlignedSync(4 * sizeof(MEGABYTE), sizeof(float) * 8);
+        pSpillFillBuffer = pDC->pArena->AllocAlignedSync(4 * sizeof(MEGABYTE), sizeof(float) * 8);
      }
  
      const API_STATE& state = GetApiState(pDC);
@@ -94,7 +94,7 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup
      csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
      csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
      csContext.pTGSM = pContext->pScratch[workerId];
-    csContext.pSpillFillBuffer = pDC->pSpillFill[workerId];
+    csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
  
      state.pfnCsFunc(GetPrivateState(pDC), &csContext);
  
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h

index 2fa18953cadf4d027b8f1563db6c6265fb23762f..d0626b997af9c9274277d60e3ab5c1211a537ec1 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/backend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.h
@@ -32,7 +32,7 @@
  #include "core/context.h"
  #include "core/multisample.h"
  
-void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId);
+void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer);
  void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
  void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
  void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h

index 2c28286b5ad5eb28a2f88963f61eb4bf27f8316c..660c86e1194fca1f0f1516d6d838d474da7ad746 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -384,8 +384,11 @@ struct DRAW_CONTEXT
  {
      SWR_CONTEXT*    pContext;
      uint64_t        drawId;
-    MacroTileMgr*   pTileMgr;
-    DispatchQueue*  pDispatch;      // Queue for thread groups. (isCompute)
+    union
+    {
+        MacroTileMgr*   pTileMgr;
+        DispatchQueue*  pDispatch;      // Queue for thread groups. (isCompute)
+    };
      uint64_t        dependency;
      DRAW_STATE*     pState;
      CachingArena*   pArena;
@@ -394,12 +397,10 @@ struct DRAW_CONTEXT
      bool            cleanupState;   // True if this is the last draw using an entry in the state ring.
      volatile bool   doneFE;         // Is FE work done for this draw?
  
+    FE_WORK         FeWork;
+
      volatile OSALIGNLINE(uint32_t)   FeLock;
      volatile int64_t    threadsDone;
-
-    OSALIGNLINE(FE_WORK) FeWork;
-    uint8_t*        pSpillFill[KNOB_MAX_NUM_THREADS];  // Scratch space used for spill fills.
-
  };
  
  static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
@@ -445,6 +446,9 @@ struct SWR_CONTEXT
      DRAW_CONTEXT *pCurDrawContext;    // This points to DC entry in ring for an unsubmitted draw.
      DRAW_CONTEXT *pPrevDrawContext;   // This points to DC entry for the previous context submitted that we can copy state from.
  
+    MacroTileMgr* pMacroTileManagerArray;
+    DispatchQueue* pDispatchQueueArray;
+
      // Draw State Ring
      //  When draw are very large (lots of primitives) then the API thread will break these up.
      //  These split draws all have identical state. So instead of storing the state directly
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp

index bee1e138002af2d85ffdce24c4a5ed7a5479e50e..4b7a207f366d32d1b3963f35cbb5bdf942204d24 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -291,7 +291,10 @@ INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
      {
          // Cleanup memory allocations
          pDC->pArena->Reset(true);
-        pDC->pTileMgr->initialize();
+        if (!pDC->isCompute)
+        {
+            pDC->pTileMgr->initialize();
+        }
          if (pDC->cleanupState)
          {
              pDC->pState->pArena->Reset(true);
@@ -546,10 +549,11 @@ void WorkOnCompute(
          // Is there any work remaining?
          if (queue.getNumQueued() > 0)
          {
+            void* pSpillFillBuffer = nullptr;
              uint32_t threadGroupId = 0;
              while (queue.getWork(threadGroupId))
              {
-                ProcessComputeBE(pDC, workerId, threadGroupId);
+                ProcessComputeBE(pDC, workerId, threadGroupId, pSpillFillBuffer);
  
                  queue.finishedWork();
              }
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp

index 794577270cf1ecc581a63c8c2cdcf5ab1fdc40ca..c053e27f9a768c0190ee73f26f5ab7f6d97978e5 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -35,27 +35,6 @@
  
  #define TILE_ID(x,y) ((x << 16 | y))
  
-// override new/delete for alignment
-void *MacroTileMgr::operator new(size_t size)
-{
-    return _aligned_malloc(size, 64);
-}
-
-void MacroTileMgr::operator delete(void *p)
-{
-    _aligned_free(p);
-}
-
-void* DispatchQueue::operator new(size_t size)
-{
-    return _aligned_malloc(size, 64);
-}
-
-void DispatchQueue::operator delete(void *p)
-{
-    _aligned_free(p);
-}
-
  MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena)
  {
  }
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h

index 34992aaea75b8213cf2de37d3a4f0b31c208dc66..82a15e16a33d7a8ec57a230bf0788a5ca690929a 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -140,9 +140,6 @@ public:
          x = (tileID >> 16) & 0xffff;
      }
  
-    void *operator new(size_t size);
-    void operator delete (void *p);
-
  private:
      CachingArena& mArena;
      std::unordered_map<uint32_t, MacroTileQueue> mTiles;
@@ -229,9 +226,6 @@ public:
          return mpTaskData;
      }
  
-    void *operator new(size_t size);
-    void operator delete (void *p);
-
      void* mpTaskData{ nullptr };        // The API thread will set this up and the callback task function will interpet this.
  
      OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 };
author	Tim Rowley <timothy.o.rowley@intel.com>
	Wed, 30 Mar 2016 21:54:48 +0000 (15:54 -0600)
committer	Tim Rowley <timothy.o.rowley@intel.com>
	Tue, 12 Apr 2016 16:52:05 +0000 (11:52 -0500)
src/gallium/drivers/swr/rasterizer/core/api.cpp		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/core/backend.cpp		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/core/backend.h		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/core/context.h		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/core/threads.cpp		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/core/tilemgr.h		patch \| blob \| history