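No need for 256 pointers per DC.

This change removes the per-worker pSpillFill pointer array from
DRAW_CONTEXT; ProcessComputeBE now receives the spill-fill buffer pointer
by reference from the worker's dispatch loop. The MacroTileMgr and
DispatchQueue objects move into context-owned arrays indexed by
drawId % KNOB_MAX_DRAWS_IN_FLIGHT, and pTileMgr/pDispatch share storage
in a union inside DRAW_CONTEXT.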
Acked-by: Brian Paul <brianp@vmware.com>
#include <cfloat>
#include <cmath>
#include <cstdio>
+#include <new>
#include "core/api.h"
#include "core/backend.h"
pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
+ pContext->pMacroTileManagerArray = (MacroTileMgr*)_aligned_malloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
+ pContext->pDispatchQueueArray = (DispatchQueue*)_aligned_malloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
+
for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
{
pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
- pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena));
- pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
+ new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
+ new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();
pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
}
{
delete pContext->dcRing[i].pArena;
delete pContext->dsRing[i].pArena;
- delete(pContext->dcRing[i].pTileMgr);
- delete(pContext->dcRing[i].pDispatch);
+ pContext->pMacroTileManagerArray[i].~MacroTileMgr();
+ pContext->pDispatchQueueArray[i].~DispatchQueue();
}
+ _aligned_free(pContext->pDispatchQueueArray);
+ _aligned_free(pContext->pMacroTileManagerArray);
+
// Free scratch space.
for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
{
template<bool IsDraw>
void QueueWork(SWR_CONTEXT *pContext)
{
+ DRAW_CONTEXT* pDC = pContext->pCurDrawContext;
+ uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
+
+ if (IsDraw)
+ {
+ pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex];
+ pDC->pTileMgr->initialize();
+ }
+
// Each worker thread looks at a DC for both FE and BE work at different times and so we
// multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
// have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
pCurDrawContext->FeLock = 0;
pCurDrawContext->threadsDone = 0;
- pCurDrawContext->pTileMgr->initialize();
-
// Assign unique drawId for this DC
pCurDrawContext->drawId = pContext->dcRing.GetHead();
pDC->isCompute = true; // This is a compute context.
- // Ensure spill fill pointers are initialized to nullptr.
- memset(pDC->pSpillFill, 0, sizeof(pDC->pSpillFill));
-
COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
pTaskData->threadGroupCountX = threadGroupCountX;
pTaskData->threadGroupCountZ = threadGroupCountZ;
uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
+ uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
+ pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex];
pDC->pDispatch->initialize(totalThreadGroups, pTaskData);
QueueDispatch(pContext);
/// @param pDC - pointer to draw context (dispatch).
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param threadGroupId - the linear index for the thread group within the dispatch.
-void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId)
+void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
{
RDTSC_START(BEDispatch);
SWR_ASSERT(pTaskData != nullptr);
// Ensure spill fill memory has been allocated.
- if (pDC->pSpillFill[workerId] == nullptr)
+ if (pSpillFillBuffer == nullptr)
{
///@todo Add state which indicates the spill fill size.
- pDC->pSpillFill[workerId] = (uint8_t*)pDC->pArena->AllocAlignedSync(4 * sizeof(MEGABYTE), sizeof(float) * 8);
+ pSpillFillBuffer = pDC->pArena->AllocAlignedSync(4 * sizeof(MEGABYTE), sizeof(float) * 8);
}
const API_STATE& state = GetApiState(pDC);
csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
csContext.pTGSM = pContext->pScratch[workerId];
- csContext.pSpillFillBuffer = pDC->pSpillFill[workerId];
+ csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
state.pfnCsFunc(GetPrivateState(pDC), &csContext);
#include "core/context.h"
#include "core/multisample.h"
-void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId);
+void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer);
void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
{
SWR_CONTEXT* pContext;
uint64_t drawId;
- MacroTileMgr* pTileMgr;
- DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
+ union
+ {
+ MacroTileMgr* pTileMgr;
+ DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
+ };
uint64_t dependency;
DRAW_STATE* pState;
CachingArena* pArena;
bool cleanupState; // True if this is the last draw using an entry in the state ring.
volatile bool doneFE; // Is FE work done for this draw?
+ FE_WORK FeWork;
+
volatile OSALIGNLINE(uint32_t) FeLock;
volatile int64_t threadsDone;
-
- OSALIGNLINE(FE_WORK) FeWork;
- uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS]; // Scratch space used for spill fills.
-
};
static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw.
DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from.
+ MacroTileMgr* pMacroTileManagerArray;
+ DispatchQueue* pDispatchQueueArray;
+
// Draw State Ring
// When draws are very large (lots of primitives), the API thread will break them up.
// These split draws all have identical state. So instead of storing the state directly
{
// Cleanup memory allocations
pDC->pArena->Reset(true);
- pDC->pTileMgr->initialize();
+ if (!pDC->isCompute)
+ {
+ pDC->pTileMgr->initialize();
+ }
if (pDC->cleanupState)
{
pDC->pState->pArena->Reset(true);
// Is there any work remaining?
if (queue.getNumQueued() > 0)
{
+ void* pSpillFillBuffer = nullptr;
uint32_t threadGroupId = 0;
while (queue.getWork(threadGroupId))
{
- ProcessComputeBE(pDC, workerId, threadGroupId);
+ ProcessComputeBE(pDC, workerId, threadGroupId, pSpillFillBuffer);
queue.finishedWork();
}
#define TILE_ID(x,y) ((x << 16 | y))
-// override new/delete for alignment
-void *MacroTileMgr::operator new(size_t size)
-{
- return _aligned_malloc(size, 64);
-}
-
-void MacroTileMgr::operator delete(void *p)
-{
- _aligned_free(p);
-}
-
-void* DispatchQueue::operator new(size_t size)
-{
- return _aligned_malloc(size, 64);
-}
-
-void DispatchQueue::operator delete(void *p)
-{
- _aligned_free(p);
-}
-
MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena)
{
}
x = (tileID >> 16) & 0xffff;
}
- void *operator new(size_t size);
- void operator delete (void *p);
-
private:
CachingArena& mArena;
std::unordered_map<uint32_t, MacroTileQueue> mTiles;
return mpTaskData;
}
- void *operator new(size_t size);
- void operator delete (void *p);
-
void* mpTaskData{ nullptr }; // The API thread will set this up and the callback task function will interpret this.
OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 };