swr: [rasterizer core] Put DRAW_CONTEXT on a diet
authorTim Rowley <timothy.o.rowley@intel.com>
Wed, 30 Mar 2016 21:54:48 +0000 (15:54 -0600)
committerTim Rowley <timothy.o.rowley@intel.com>
Tue, 12 Apr 2016 16:52:05 +0000 (11:52 -0500)
No need for 256 pointers per DC.

Acked-by: Brian Paul <brianp@vmware.com>
src/gallium/drivers/swr/rasterizer/core/api.cpp
src/gallium/drivers/swr/rasterizer/core/backend.cpp
src/gallium/drivers/swr/rasterizer/core/backend.h
src/gallium/drivers/swr/rasterizer/core/context.h
src/gallium/drivers/swr/rasterizer/core/threads.cpp
src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
src/gallium/drivers/swr/rasterizer/core/tilemgr.h

index 665b6c0453f2a9df50be45f3bc60f5d5e233df81..d0738a7e2e09894757117f98b2138106dbbccc9f 100644 (file)
@@ -29,6 +29,7 @@
 #include <cfloat>
 #include <cmath>
 #include <cstdio>
+#include <new>
 
 #include "core/api.h"
 #include "core/backend.h"
@@ -65,11 +66,14 @@ HANDLE SwrCreateContext(
     pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
     pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
 
+    pContext->pMacroTileManagerArray = (MacroTileMgr*)_aligned_malloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
+    pContext->pDispatchQueueArray = (DispatchQueue*)_aligned_malloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
+
     for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
     {
         pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
-        pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena));
-        pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
+        new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
+        new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();
 
         pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
     }
@@ -143,10 +147,13 @@ void SwrDestroyContext(HANDLE hContext)
     {
         delete pContext->dcRing[i].pArena;
         delete pContext->dsRing[i].pArena;
-        delete(pContext->dcRing[i].pTileMgr);
-        delete(pContext->dcRing[i].pDispatch);
+        pContext->pMacroTileManagerArray[i].~MacroTileMgr();
+        pContext->pDispatchQueueArray[i].~DispatchQueue();
     }
 
+    _aligned_free(pContext->pDispatchQueueArray);
+    _aligned_free(pContext->pMacroTileManagerArray);
+
     // Free scratch space.
     for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
     {
@@ -176,6 +183,15 @@ void WakeAllThreads(SWR_CONTEXT *pContext)
 template<bool IsDraw>
 void QueueWork(SWR_CONTEXT *pContext)
 {
+    DRAW_CONTEXT* pDC = pContext->pCurDrawContext;
+    uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
+
+    if (IsDraw)
+    {
+        pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex];
+        pDC->pTileMgr->initialize();
+    }
+
     // Each worker thread looks at a DC for both FE and BE work at different times and so we
     // multiply threadDone by 2.  When the threadDone counter has reached 0 then all workers
     // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
@@ -299,8 +315,6 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
         pCurDrawContext->FeLock = 0;
         pCurDrawContext->threadsDone = 0;
 
-        pCurDrawContext->pTileMgr->initialize();
-
         // Assign unique drawId for this DC
         pCurDrawContext->drawId = pContext->dcRing.GetHead();
 
@@ -1368,9 +1382,6 @@ void SwrDispatch(
 
     pDC->isCompute = true;      // This is a compute context.
 
-    // Ensure spill fill pointers are initialized to nullptr.
-    memset(pDC->pSpillFill, 0, sizeof(pDC->pSpillFill));
-
     COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
 
     pTaskData->threadGroupCountX = threadGroupCountX;
@@ -1378,6 +1389,8 @@ void SwrDispatch(
     pTaskData->threadGroupCountZ = threadGroupCountZ;
 
     uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
+    uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
+    pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex];
     pDC->pDispatch->initialize(totalThreadGroups, pTaskData);
 
     QueueDispatch(pContext);
index 842ea326e68229490c409c1a7984a5328428b8d3..b2d3d9ef4f4d9e1fe4ea216c0f6039d7138861cb 100644 (file)
@@ -70,7 +70,7 @@ static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
 /// @param pDC - pointer to draw context (dispatch).
 /// @param workerId - The unique worker ID that is assigned to this thread.
 /// @param threadGroupId - the linear index for the thread group within the dispatch.
-void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId)
+void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
 {
     RDTSC_START(BEDispatch);
 
@@ -80,10 +80,10 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup
     SWR_ASSERT(pTaskData != nullptr);
 
     // Ensure spill fill memory has been allocated.
-    if (pDC->pSpillFill[workerId] == nullptr)
+    if (pSpillFillBuffer == nullptr)
     {
         ///@todo Add state which indicates the spill fill size.
-        pDC->pSpillFill[workerId] = (uint8_t*)pDC->pArena->AllocAlignedSync(4 * sizeof(MEGABYTE), sizeof(float) * 8);
+        pSpillFillBuffer = pDC->pArena->AllocAlignedSync(4 * sizeof(MEGABYTE), sizeof(float) * 8);
     }
 
     const API_STATE& state = GetApiState(pDC);
@@ -94,7 +94,7 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup
     csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
     csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
     csContext.pTGSM = pContext->pScratch[workerId];
-    csContext.pSpillFillBuffer = pDC->pSpillFill[workerId];
+    csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
 
     state.pfnCsFunc(GetPrivateState(pDC), &csContext);
 
index 2fa18953cadf4d027b8f1563db6c6265fb23762f..d0626b997af9c9274277d60e3ab5c1211a537ec1 100644 (file)
@@ -32,7 +32,7 @@
 #include "core/context.h"
 #include "core/multisample.h"
 
-void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId);
+void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer);
 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
 void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
index 2c28286b5ad5eb28a2f88963f61eb4bf27f8316c..660c86e1194fca1f0f1516d6d838d474da7ad746 100644 (file)
@@ -384,8 +384,11 @@ struct DRAW_CONTEXT
 {
     SWR_CONTEXT*    pContext;
     uint64_t        drawId;
-    MacroTileMgr*   pTileMgr;
-    DispatchQueue*  pDispatch;      // Queue for thread groups. (isCompute)
+    union
+    {
+        MacroTileMgr*   pTileMgr;
+        DispatchQueue*  pDispatch;      // Queue for thread groups. (isCompute)
+    };
     uint64_t        dependency;
     DRAW_STATE*     pState;
     CachingArena*   pArena;
@@ -394,12 +397,10 @@ struct DRAW_CONTEXT
     bool            cleanupState;   // True if this is the last draw using an entry in the state ring.
     volatile bool   doneFE;         // Is FE work done for this draw?
 
+    FE_WORK         FeWork;
+
     volatile OSALIGNLINE(uint32_t)   FeLock;
     volatile int64_t    threadsDone;
-
-    OSALIGNLINE(FE_WORK) FeWork;
-    uint8_t*        pSpillFill[KNOB_MAX_NUM_THREADS];  // Scratch space used for spill fills.
-
 };
 
 static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
@@ -445,6 +446,9 @@ struct SWR_CONTEXT
     DRAW_CONTEXT *pCurDrawContext;    // This points to DC entry in ring for an unsubmitted draw.
     DRAW_CONTEXT *pPrevDrawContext;   // This points to DC entry for the previous context submitted that we can copy state from.
 
+    MacroTileMgr* pMacroTileManagerArray;
+    DispatchQueue* pDispatchQueueArray;
+
     // Draw State Ring
     //  When draw are very large (lots of primitives) then the API thread will break these up.
     //  These split draws all have identical state. So instead of storing the state directly
index bee1e138002af2d85ffdce24c4a5ed7a5479e50e..4b7a207f366d32d1b3963f35cbb5bdf942204d24 100644 (file)
@@ -291,7 +291,10 @@ INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
     {
         // Cleanup memory allocations
         pDC->pArena->Reset(true);
-        pDC->pTileMgr->initialize();
+        if (!pDC->isCompute)
+        {
+            pDC->pTileMgr->initialize();
+        }
         if (pDC->cleanupState)
         {
             pDC->pState->pArena->Reset(true);
@@ -546,10 +549,11 @@ void WorkOnCompute(
         // Is there any work remaining?
         if (queue.getNumQueued() > 0)
         {
+            void* pSpillFillBuffer = nullptr;
             uint32_t threadGroupId = 0;
             while (queue.getWork(threadGroupId))
             {
-                ProcessComputeBE(pDC, workerId, threadGroupId);
+                ProcessComputeBE(pDC, workerId, threadGroupId, pSpillFillBuffer);
 
                 queue.finishedWork();
             }
index 794577270cf1ecc581a63c8c2cdcf5ab1fdc40ca..c053e27f9a768c0190ee73f26f5ab7f6d97978e5 100644 (file)
 
 #define TILE_ID(x,y) ((x << 16 | y))
 
-// override new/delete for alignment
-void *MacroTileMgr::operator new(size_t size)
-{
-    return _aligned_malloc(size, 64);
-}
-
-void MacroTileMgr::operator delete(void *p)
-{
-    _aligned_free(p);
-}
-
-void* DispatchQueue::operator new(size_t size)
-{
-    return _aligned_malloc(size, 64);
-}
-
-void DispatchQueue::operator delete(void *p)
-{
-    _aligned_free(p);
-}
-
 MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena)
 {
 }
index 34992aaea75b8213cf2de37d3a4f0b31c208dc66..82a15e16a33d7a8ec57a230bf0788a5ca690929a 100644 (file)
@@ -140,9 +140,6 @@ public:
         x = (tileID >> 16) & 0xffff;
     }
 
-    void *operator new(size_t size);
-    void operator delete (void *p);
-
 private:
     CachingArena& mArena;
     std::unordered_map<uint32_t, MacroTileQueue> mTiles;
@@ -229,9 +226,6 @@ public:
         return mpTaskData;
     }
 
-    void *operator new(size_t size);
-    void operator delete (void *p);
-
     void* mpTaskData{ nullptr };        // The API thread will set this up and the callback task function will interpet this.
 
     OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 };