SWR_CONTEXT *pContext,
uint32_t workerId,
uint64_t &curDrawBE,
- TileSet& lockedTiles)
+ TileSet& lockedTiles,
+ uint32_t numaNode,
+ uint32_t numaMask)
{
// Find the first incomplete draw that has pending work. If no such draw is found then
// return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
for (uint32_t tileID : macroTiles)
{
+ // Only work on tiles assigned to this NUMA node; (x ^ y) & numaMask spreads macrotiles across the nodes.
+ uint32_t x, y;
+ pDC->pTileMgr->getTileIndices(tileID, x, y);
+ if (((x ^ y) & numaMask) != numaNode)
+ {
+ continue;
+ }
+
MacroTileQueue &tile = pDC->pTileMgr->getMacroTileQueue(tileID);
+ if (!tile.getNumQueued())
+ {
+ continue;
+ }
+
// can only work on this draw if it's not in use by other threads
- if (lockedTiles.find(tileID) == lockedTiles.end())
+ if (lockedTiles.find(tileID) != lockedTiles.end())
{
- if (tile.getNumQueued())
+ continue;
+ }
+
+ if (tile.tryLock())
+ {
+ BE_WORK *pWork;
+
+ RDTSC_START(WorkerFoundWork);
+
+ uint32_t numWorkItems = tile.getNumQueued();
+ SWR_ASSERT(numWorkItems);
+
+ pWork = tile.peek();
+ SWR_ASSERT(pWork);
+ if (pWork->type == DRAW)
{
- if (tile.tryLock())
- {
- BE_WORK *pWork;
-
- RDTSC_START(WorkerFoundWork);
-
- uint32_t numWorkItems = tile.getNumQueued();
-
- if (numWorkItems != 0)
- {
- pWork = tile.peek();
- SWR_ASSERT(pWork);
- if (pWork->type == DRAW)
- {
- pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID);
- }
- }
-
- while ((pWork = tile.peek()) != nullptr)
- {
- pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
- tile.dequeue();
- }
- RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId);
-
- _ReadWriteBarrier();
-
- pDC->pTileMgr->markTileComplete(tileID);
-
- // Optimization: If the draw is complete and we're the last one to have worked on it then
- // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
- if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete())
- {
- // We can increment the current BE and safely move to next draw since we know this draw is complete.
- curDrawBE++;
- CompleteDrawContext(pContext, pDC);
-
- lastRetiredDraw++;
-
- lockedTiles.clear();
- break;
- }
- }
- else
- {
- // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
- lockedTiles.insert(tileID);
- }
+ pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID);
+ }
+
+ while ((pWork = tile.peek()) != nullptr)
+ {
+ pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
+ tile.dequeue();
+ }
+ RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId);
+
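+ // Compiler barrier: keep the tile work above from being reordered past the completion mark below.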
+ _ReadWriteBarrier();
+
+ pDC->pTileMgr->markTileComplete(tileID);
+
+ // Optimization: if this draw is complete and we were the last to work on it, we can clear the locked-tile set,
+ // since every draw before the next one is guaranteed to be complete.
+ if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete())
+ {
+ // We can increment the current BE and safely move to next draw since we know this draw is complete.
+ curDrawBE++;
+ CompleteDrawContext(pContext, pDC);
+
+ lastRetiredDraw++;
+
+ lockedTiles.clear();
+ break;
}
}
+ else
+ {
+ // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
+ lockedTiles.insert(tileID);
+ }
}
}
}
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, int numaNode)
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode)
{
// Try to grab the next DC from the ring
uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
RDTSC_INIT(threadId);
- int numaNode = (int)pThreadData->numaId;
+ uint32_t numaNode = pThreadData->numaId;
+ uint32_t numaMask = pContext->threadPool.numaMask;
// flush denormals to 0
_mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
}
RDTSC_START(WorkerWorkOnFifoBE);
- WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles);
+ WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);
WorkOnCompute(pContext, workerId, curDrawBE);
pPool->inThreadShutdown = false;
pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));
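+ // Default: no NUMA-based partitioning of macrotiles.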
+ pPool->numaMask = 0;
if (KNOB_MAX_WORKER_THREADS)
{
}
else
{
+ pPool->numaMask = numNodes - 1; // Valid only when numNodes is a power of two (1, 2, 4, ...)
+
uint32_t workerId = 0;
for (uint32_t n = 0; n < numNodes; ++n)
{
{
THREAD_PTR threads[KNOB_MAX_NUM_THREADS];
uint32_t numThreads;
+ uint32_t numaMask;
volatile bool inThreadShutdown;
THREAD_DATA *pThreadData;
};
void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
// Expose FE and BE worker functions to the API thread if single threaded
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, int numaNode);
-void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles);
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode);
+void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask);
void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE);
int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
\ No newline at end of file
if (create)
{
uint32_t size = numSamples * mHotTileSize[attachment];
- hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
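+ // Allocate the hot tile on the same NUMA node that WorkOnFifoBE assigns this (x, y) macrotile to.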
+ uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
+ hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode);
hotTile.state = HOTTILE_INVALID;
hotTile.numSamples = numSamples;
hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
SWR_ASSERT((hotTile.state == HOTTILE_INVALID) ||
(hotTile.state == HOTTILE_RESOLVED) ||
(hotTile.state == HOTTILE_CLEAR));
- _aligned_free(hotTile.pBuffer);
+ FreeHotTileMem(hotTile.pBuffer);
uint32_t size = numSamples * mHotTileSize[attachment];
- hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
+ uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
+ hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode);
hotTile.state = HOTTILE_INVALID;
hotTile.numSamples = numSamples;
}
{
for (int a = 0; a < SWR_NUM_ATTACHMENTS; ++a)
{
- if (mHotTiles[x][y].Attachment[a].pBuffer != NULL)
- {
- _aligned_free(mHotTiles[x][y].Attachment[a].pBuffer);
- mHotTiles[x][y].Attachment[a].pBuffer = NULL;
- }
+ FreeHotTileMem(mHotTiles[x][y].Attachment[a].pBuffer);
}
}
}
private:
HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y];
uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS];
+
+ void* AllocHotTileMem(size_t size, uint32_t align, uint32_t numaNode)
+ {
+ void* p = nullptr;
+#if defined(_WIN32)
+ HANDLE hProcess = GetCurrentProcess();
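+ // numaNode is a preference, not a guarantee; the block is page-aligned, which satisfies 'align'.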
+ p = VirtualAllocExNuma(hProcess, nullptr, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE, numaNode);
+#else
+ p = _aligned_malloc(size, align);
+#endif
+
+ return p;
+ }
+
+ void FreeHotTileMem(void* pBuffer)
+ {
+ if (pBuffer)
+ {
+#if defined(_WIN32)
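+ // With MEM_RELEASE the size must be 0; this frees the entire allocation made by VirtualAllocExNuma.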
+ VirtualFree(pBuffer, 0, MEM_RELEASE);
+#else
+ _aligned_free(pBuffer);
+#endif
+ }
+ }
};