No need for 256 pointers per DC.
Acked-by: Brian Paul <>
#include <cfloat>
#include <cmath>
#include <cstdio>
+#include <new>
#include "core/api.h"
#include "core/backend.h"
+ pContext->pMacroTileManagerArray = (MacroTileMgr*)_aligned_malloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
+ pContext->pDispatchQueueArray = (DispatchQueue*)_aligned_malloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
- pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena));
- pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
+ new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
+ new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();
pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
delete pContext->dcRing[i].pArena;
delete pContext->dsRing[i].pArena;
- delete(pContext->dcRing[i].pTileMgr);
- delete(pContext->dcRing[i].pDispatch);
+ pContext->pMacroTileManagerArray[i].~MacroTileMgr();
+ pContext->pDispatchQueueArray[i].~DispatchQueue();
+ _aligned_free(pContext->pDispatchQueueArray);
+ _aligned_free(pContext->pMacroTileManagerArray);
// Free scratch space.
for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
template<bool IsDraw>
void QueueWork(SWR_CONTEXT *pContext)
+ DRAW_CONTEXT* pDC = pContext->pCurDrawContext;
+ uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
+ if (IsDraw)
+ {
+ pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex];
+ pDC->pTileMgr->initialize();
+ }
// Each worker thread looks at a DC for both FE and BE work at different times and so we
// multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
// have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
pCurDrawContext->FeLock = 0;
pCurDrawContext->threadsDone = 0;
- pCurDrawContext->pTileMgr->initialize();
// Assign unique drawId for this DC
pCurDrawContext->drawId = pContext->dcRing.GetHead();
pDC->isCompute = true; // This is a compute context.
- // Ensure spill fill pointers are initialized to nullptr.
- memset(pDC->pSpillFill, 0, sizeof(pDC->pSpillFill));
COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
pTaskData->threadGroupCountX = threadGroupCountX;
pTaskData->threadGroupCountZ = threadGroupCountZ;
uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
+ uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
+ pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex];
pDC->pDispatch->initialize(totalThreadGroups, pTaskData);
/// @param pDC - pointer to draw context (dispatch).
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param threadGroupId - the linear index for the thread group within the dispatch.
-void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId)
+void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
SWR_ASSERT(pTaskData != nullptr);
// Ensure spill fill memory has been allocated.
- if (pDC->pSpillFill[workerId] == nullptr)
+ if (pSpillFillBuffer == nullptr)
///@todo Add state which indicates the spill fill size.
- pDC->pSpillFill[workerId] = (uint8_t*)pDC->pArena->AllocAlignedSync(4 * sizeof(MEGABYTE), sizeof(float) * 8);
+ pSpillFillBuffer = pDC->pArena->AllocAlignedSync(4 * sizeof(MEGABYTE), sizeof(float) * 8);
const API_STATE& state = GetApiState(pDC);
csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
csContext.pTGSM = pContext->pScratch[workerId];
- csContext.pSpillFillBuffer = pDC->pSpillFill[workerId];
+ csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
state.pfnCsFunc(GetPrivateState(pDC), &csContext);
#include "core/context.h"
#include "core/multisample.h"
-void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId);
+void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer);
void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
SWR_CONTEXT* pContext;
uint64_t drawId;
- MacroTileMgr* pTileMgr;
- DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
+ union
+ {
+ MacroTileMgr* pTileMgr;
+ DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
+ };
uint64_t dependency;
CachingArena* pArena;
bool cleanupState; // True if this is the last draw using an entry in the state ring.
volatile bool doneFE; // Is FE work done for this draw?
+ FE_WORK FeWork;
volatile OSALIGNLINE(uint32_t) FeLock;
volatile int64_t threadsDone;
- uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS]; // Scratch space used for spill fills.
static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw.
DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from.
+ MacroTileMgr* pMacroTileManagerArray;
+ DispatchQueue* pDispatchQueueArray;
// Draw State Ring
// When draws are very large (lots of primitives) then the API thread will break these up.
// These split draws all have identical state. So instead of storing the state directly
// Cleanup memory allocations
- pDC->pTileMgr->initialize();
+ if (!pDC->isCompute)
+ {
+ pDC->pTileMgr->initialize();
+ }
if (pDC->cleanupState)
// Is there any work remaining?
if (queue.getNumQueued() > 0)
+ void* pSpillFillBuffer = nullptr;
uint32_t threadGroupId = 0;
while (queue.getWork(threadGroupId))
- ProcessComputeBE(pDC, workerId, threadGroupId);
+ ProcessComputeBE(pDC, workerId, threadGroupId, pSpillFillBuffer);
#define TILE_ID(x,y) ((x << 16 | y))
-// override new/delete for alignment
-void *MacroTileMgr::operator new(size_t size)
- return _aligned_malloc(size, 64);
-void MacroTileMgr::operator delete(void *p)
- _aligned_free(p);
-void* DispatchQueue::operator new(size_t size)
- return _aligned_malloc(size, 64);
-void DispatchQueue::operator delete(void *p)
- _aligned_free(p);
MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena)
x = (tileID >> 16) & 0xffff;
- void *operator new(size_t size);
- void operator delete (void *p);
CachingArena& mArena;
std::unordered_map<uint32_t, MacroTileQueue> mTiles;
return mpTaskData;
- void *operator new(size_t size);
- void operator delete (void *p);
void* mpTaskData{ nullptr }; // The API thread will set this up and the callback task function will interpret this.
OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 };