From c25244f2f7f61ebb368a9651aef4de93bd8306ac Mon Sep 17 00:00:00 2001
From: Tim Rowley
Date: Thu, 24 Mar 2016 16:20:02 -0600
Subject: [PATCH] swr: [rasterizer core] Affinitize thread scratch space to numa node of worker

Acked-by: Brian Paul
---
 src/gallium/drivers/swr/rasterizer/core/api.cpp | 16 ++++++++++++++--
 src/gallium/drivers/swr/rasterizer/core/arena.h |  2 +-
 .../drivers/swr/rasterizer/core/backend.cpp     |  2 +-
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index f0f7956b590..442cdd420f4 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -93,8 +93,16 @@ HANDLE SwrCreateContext(
     ///@note We could lazily allocate this but its rather small amount of memory.
     for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
     {
-        ///@todo Use numa API for allocations using numa information from thread data (if exists).
-        pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4);
+#if defined(_WIN32)
+        uint32_t numaNode = pContext->threadPool.pThreadData ?
+            pContext->threadPool.pThreadData[i].numaId : 0;
+        pContext->pScratch[i] = (uint8_t*)VirtualAllocExNuma(
+            GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE),
+            MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE,
+            numaNode);
+#else
+        pContext->pScratch[i] = (uint8_t*)_aligned_malloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
+#endif
     }
 
     // State setup AFTER context is fully initialized
@@ -138,7 +146,11 @@ void SwrDestroyContext(HANDLE hContext)
     // Free scratch space.
     for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
     {
+#if defined(_WIN32)
+        VirtualFree(pContext->pScratch[i], 0, MEM_RELEASE);
+#else
         _aligned_free(pContext->pScratch[i]);
+#endif
     }
 
     delete(pContext->pHotTileMgr);
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h
index 67d81a44347..0241f5b900d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/arena.h
+++ b/src/gallium/drivers/swr/rasterizer/core/arena.h
@@ -209,7 +209,7 @@ struct CachingAllocatorT : DefaultAllocator
 };
 typedef CachingAllocatorT<> CachingAllocator;
 
-template
+template
 class TArena
 {
 public:
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index 7fb83edf169..ad0a5a07032 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -83,7 +83,7 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup
     if (pDC->pSpillFill[workerId] == nullptr)
     {
         ///@todo Add state which indicates the spill fill size.
-        pDC->pSpillFill[workerId] = (uint8_t*)pDC->pArena->AllocAlignedSync(4096 * 1024, sizeof(float) * 8);
+        pDC->pSpillFill[workerId] = (uint8_t*)pDC->pArena->AllocAlignedSync(4 * sizeof(MEGABYTE), sizeof(float) * 8);
     }
 
     const API_STATE& state = GetApiState(pDC);
-- 
2.30.2