From 93c1a2dedfa8b786e969a9ae44765bf6841218ef Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Thu, 24 Mar 2016 00:01:23 -0600 Subject: [PATCH] swr: [rasterizer core] NUMA optimizations... - Affinitize hot-tile memory to specific NUMA nodes. - Only do BE work for macrotiles associated with the numa node --- .../drivers/swr/rasterizer/core/api.cpp | 2 +- .../drivers/swr/rasterizer/core/threads.cpp | 124 ++++++++++-------- .../drivers/swr/rasterizer/core/threads.h | 5 +- .../drivers/swr/rasterizer/core/tilemgr.cpp | 8 +- .../drivers/swr/rasterizer/core/tilemgr.h | 31 ++++- 5 files changed, 105 insertions(+), 65 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index 7ca182242e5..f0f7956b590 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -184,7 +184,7 @@ void QueueWork(SWR_CONTEXT *pContext) static TileSet lockedTiles; uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId }; WorkOnFifoFE(pContext, 0, curDraw[0], 0); - WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles); + WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0); } else { diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index 845e28ea497..07bc94a1a54 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -349,7 +349,9 @@ void WorkOnFifoBE( SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, - TileSet& lockedTiles) + TileSet& lockedTiles, + uint32_t numaNode, + uint32_t numaMask) { // Find the first incomplete draw that has pending work. If no such draw is found then // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE. 
@@ -390,68 +392,78 @@ void WorkOnFifoBE( for (uint32_t tileID : macroTiles) { + // Only work on tiles for this numa node + uint32_t x, y; + pDC->pTileMgr->getTileIndices(tileID, x, y); + if (((x ^ y) & numaMask) != numaNode) + { + continue; + } + MacroTileQueue &tile = pDC->pTileMgr->getMacroTileQueue(tileID); + if (!tile.getNumQueued()) + { + continue; + } + // can only work on this draw if it's not in use by other threads - if (lockedTiles.find(tileID) == lockedTiles.end()) + if (lockedTiles.find(tileID) != lockedTiles.end()) { - if (tile.getNumQueued()) + continue; + } + + if (tile.tryLock()) + { + BE_WORK *pWork; + + RDTSC_START(WorkerFoundWork); + + uint32_t numWorkItems = tile.getNumQueued(); + SWR_ASSERT(numWorkItems); + + pWork = tile.peek(); + SWR_ASSERT(pWork); + if (pWork->type == DRAW) { - if (tile.tryLock()) - { - BE_WORK *pWork; - - RDTSC_START(WorkerFoundWork); - - uint32_t numWorkItems = tile.getNumQueued(); - - if (numWorkItems != 0) - { - pWork = tile.peek(); - SWR_ASSERT(pWork); - if (pWork->type == DRAW) - { - pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID); - } - } - - while ((pWork = tile.peek()) != nullptr) - { - pWork->pfnWork(pDC, workerId, tileID, &pWork->desc); - tile.dequeue(); - } - RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId); - - _ReadWriteBarrier(); - - pDC->pTileMgr->markTileComplete(tileID); - - // Optimization: If the draw is complete and we're the last one to have worked on it then - // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete. - if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete()) - { - // We can increment the current BE and safely move to next draw since we know this draw is complete. - curDrawBE++; - CompleteDrawContext(pContext, pDC); - - lastRetiredDraw++; - - lockedTiles.clear(); - break; - } - } - else - { - // This tile is already locked. So let's add it to our locked tiles set. 
This way we don't try locking this one again. - lockedTiles.insert(tileID); - } + pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID); + } + + while ((pWork = tile.peek()) != nullptr) + { + pWork->pfnWork(pDC, workerId, tileID, &pWork->desc); + tile.dequeue(); + } + RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId); + + _ReadWriteBarrier(); + + pDC->pTileMgr->markTileComplete(tileID); + + // Optimization: If the draw is complete and we're the last one to have worked on it then + // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete. + if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete()) + { + // We can increment the current BE and safely move to next draw since we know this draw is complete. + curDrawBE++; + CompleteDrawContext(pContext, pDC); + + lastRetiredDraw++; + + lockedTiles.clear(); + break; } } + else + { + // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again. 
+ lockedTiles.insert(tileID); + } } } } -void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, int numaNode) +void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode) { // Try to grab the next DC from the ring uint64_t drawEnqueued = GetEnqueuedDraw(pContext); @@ -547,7 +559,8 @@ DWORD workerThreadMain(LPVOID pData) RDTSC_INIT(threadId); - int numaNode = (int)pThreadData->numaId; + uint32_t numaNode = pThreadData->numaId; + uint32_t numaMask = pContext->threadPool.numaMask; // flush denormals to 0 _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); @@ -619,7 +632,7 @@ DWORD workerThreadMain(LPVOID pData) } RDTSC_START(WorkerWorkOnFifoBE); - WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles); + WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask); RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0); WorkOnCompute(pContext, workerId, curDrawBE); @@ -740,6 +753,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) pPool->inThreadShutdown = false; pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA)); + pPool->numaMask = 0; if (KNOB_MAX_WORKER_THREADS) { @@ -760,6 +774,8 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) } else { + pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.) 
+ uint32_t workerId = 0; for (uint32_t n = 0; n < numNodes; ++n) { diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h index 6cc8c96a00f..821d7dcb16e 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.h +++ b/src/gallium/drivers/swr/rasterizer/core/threads.h @@ -51,6 +51,7 @@ struct THREAD_POOL { THREAD_PTR threads[KNOB_MAX_NUM_THREADS]; uint32_t numThreads; + uint32_t numaMask; volatile bool inThreadShutdown; THREAD_DATA *pThreadData; }; @@ -61,7 +62,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); // Expose FE and BE worker functions to the API thread if single threaded -void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, int numaNode); -void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles); +void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode); +void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask); void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE); int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC); \ No newline at end of file diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp index 89c779e04d9..794577270cf 100644 --- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp @@ -119,7 +119,8 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32 if (create) { uint32_t size = numSamples * mHotTileSize[attachment]; - hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4); + uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask); + hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 
KNOB_SIMD_WIDTH * 4, numaNode); hotTile.state = HOTTILE_INVALID; hotTile.numSamples = numSamples; hotTile.renderTargetArrayIndex = renderTargetArrayIndex; @@ -139,10 +140,11 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32 SWR_ASSERT((hotTile.state == HOTTILE_INVALID) || (hotTile.state == HOTTILE_RESOLVED) || (hotTile.state == HOTTILE_CLEAR)); - _aligned_free(hotTile.pBuffer); + FreeHotTileMem(hotTile.pBuffer); uint32_t size = numSamples * mHotTileSize[attachment]; - hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4); + uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask); + hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode); hotTile.state = HOTTILE_INVALID; hotTile.numSamples = numSamples; } diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h index cf9d2fea32a..aa561badc1c 100644 --- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h +++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h @@ -291,11 +291,7 @@ public: { for (int a = 0; a < SWR_NUM_ATTACHMENTS; ++a) { - if (mHotTiles[x][y].Attachment[a].pBuffer != NULL) - { - _aligned_free(mHotTiles[x][y].Attachment[a].pBuffer); - mHotTiles[x][y].Attachment[a].pBuffer = NULL; - } + FreeHotTileMem(mHotTiles[x][y].Attachment[a].pBuffer); } } } @@ -315,5 +311,30 @@ public: private: HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y]; uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS]; + + void* AllocHotTileMem(size_t size, uint32_t align, uint32_t numaNode) + { + void* p = nullptr; +#if defined(_WIN32) + HANDLE hProcess = GetCurrentProcess(); + p = VirtualAllocExNuma(hProcess, nullptr, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE, numaNode); +#else + p = _aligned_malloc(size, align); +#endif + + return p; + } + + void FreeHotTileMem(void* pBuffer) + { + if (pBuffer) + { +#if defined(_WIN32) + VirtualFree(pBuffer, 0, MEM_RELEASE); +#else + 
_aligned_free(pBuffer); +#endif + } + } }; -- 2.30.2