From dabd0499a6ba4032f2cf24103a7ca044061a3b98 Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Wed, 19 Apr 2017 17:03:32 -0500 Subject: [PATCH] swr/rast: enable per-warp scratch space for CS Reviewed-by: Bruce Cherniak --- src/gallium/drivers/swr/rasterizer/core/api.cpp | 6 +++++- src/gallium/drivers/swr/rasterizer/core/api.h | 7 ++++++- src/gallium/drivers/swr/rasterizer/core/backend.cpp | 10 +++++++++- src/gallium/drivers/swr/rasterizer/core/backend.h | 2 +- src/gallium/drivers/swr/rasterizer/core/context.h | 2 ++ src/gallium/drivers/swr/rasterizer/core/state.h | 5 +++++ src/gallium/drivers/swr/rasterizer/core/threads.cpp | 3 ++- src/gallium/drivers/swr/rasterizer/core/tilemgr.h | 6 +++--- 8 files changed, 33 insertions(+), 8 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index 5c3225d58b2..dc8f517386c 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -592,12 +592,16 @@ void SwrSetCsFunc( HANDLE hContext, PFN_CS_FUNC pfnCsFunc, uint32_t totalThreadsInGroup, - uint32_t totalSpillFillSize) + uint32_t totalSpillFillSize, + uint32_t scratchSpaceSizePerInstance, + uint32_t numInstances) { API_STATE* pState = GetDrawState(GetContext(hContext)); pState->pfnCsFunc = pfnCsFunc; pState->totalThreadsInGroup = totalThreadsInGroup; pState->totalSpillFillSize = totalSpillFillSize; + pState->scratchSpaceSize = scratchSpaceSizePerInstance; + pState->scratchSpaceNumInstances = numInstances; } void SwrSetTsState( diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h index b9b994ad379..166598a48d0 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.h +++ b/src/gallium/drivers/swr/rasterizer/core/api.h @@ -366,11 +366,16 @@ void SWR_API SwrSetGsFunc( /// @param pfnCsFunc - Pointer to compute shader function /// @param totalThreadsInGroup - product of thread group dimensions. /// @param totalSpillFillSize - size in bytes needed for spill/fill. +/// @param scratchSpaceSizePerInstance - size of the scratch space needed per simd instance +/// @param numInstances - number of simd instances that are run per execution of the shader void SWR_API SwrSetCsFunc( HANDLE hContext, PFN_CS_FUNC pfnCsFunc, uint32_t totalThreadsInGroup, - uint32_t totalSpillFillSize); + uint32_t totalSpillFillSize, + uint32_t scratchSpaceSizePerInstance, + uint32_t numInstances + ); ////////////////////////////////////////////////////////////////////////// /// @brief Set tessellation state. diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp index 39f4802db4c..16698ef08f9 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp @@ -45,7 +45,7 @@ static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS]; /// @param pDC - pointer to draw context (dispatch). /// @param workerId - The unique worker ID that is assigned to this thread. /// @param threadGroupId - the linear index for the thread group within the dispatch. -void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer) +void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace) { SWR_CONTEXT *pContext = pDC->pContext; @@ -60,6 +60,12 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup { pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES); } + + size_t scratchSpaceSize = pDC->pState->state.scratchSpaceSize * pDC->pState->state.scratchSpaceNumInstances; + if (scratchSpaceSize && pScratchSpace == nullptr) + { + pScratchSpace = pDC->pArena->AllocAlignedSync(scratchSpaceSize, KNOB_SIMD_BYTES); + } const API_STATE& state = GetApiState(pDC); @@ -70,6 +76,8 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup csContext.dispatchDims[2] = pTaskData->threadGroupCountZ; csContext.pTGSM = pContext->ppScratch[workerId]; csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer; + csContext.pScratchSpace = (uint8_t*)pScratchSpace; + csContext.scratchSpacePerSimd = pDC->pState->state.scratchSpaceSize; state.pfnCsFunc(GetPrivateState(pDC), &csContext); diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h index ade9afccd95..822daa32133 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend.h @@ -34,7 +34,7 @@ #include "depthstencil.h" #include "rdtsc_core.h" -void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer); +void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace); void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index 94085e59998..7781feaf101 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -245,6 +245,8 @@ OSALIGNLINE(struct) API_STATE PFN_CS_FUNC pfnCsFunc; uint32_t totalThreadsInGroup; uint32_t totalSpillFillSize; + uint32_t scratchSpaceSize; + uint32_t scratchSpaceNumInstances; // FE - Frontend State SWR_FRONTEND_STATE frontendState; diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index 3d0b4ff951f..bf735e03f24 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -378,6 +378,11 @@ struct SWR_CS_CONTEXT uint8_t* pTGSM; // Thread Group Shared Memory pointer. uint8_t* pSpillFillBuffer; // Spill/fill buffer for barrier support + + uint8_t* pScratchSpace; // Pointer to scratch space buffer used by the shader, shader is responsible + // for subdividing scratch space per instance/simd + + uint32_t scratchSpacePerSimd; // Scratch space per work item x SIMD_WIDTH }; // enums diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index 295014d1a7e..e03632b443b 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -726,10 +726,11 @@ void WorkOnCompute( if (queue.getNumQueued() > 0) { void* pSpillFillBuffer = nullptr; + void* pScratchSpace = nullptr; uint32_t threadGroupId = 0; while (queue.getWork(threadGroupId)) { - queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer); + queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace); queue.finishedWork(); } diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h index bfff339a55f..8f1cd21543d 100644 --- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h +++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h @@ -151,7 +151,7 @@ private: OSALIGNLINE(volatile LONG) mWorkItemsConsumed { 0 }; }; -typedef void(*PFN_DISPATCH)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer); +typedef void(*PFN_DISPATCH)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace); ////////////////////////////////////////////////////////////////////////// /// DispatchQueue - work queue for dispatch @@ -231,10 +231,10 @@ public: ////////////////////////////////////////////////////////////////////////// /// @brief Dispatches a unit of work - void dispatch(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer) + void dispatch(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace) { SWR_ASSERT(mPfnDispatch != nullptr); - mPfnDispatch(pDC, workerId, threadGroupId, pSpillFillBuffer); + mPfnDispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace); } void* mpTaskData{ nullptr }; // The API thread will set this up and the callback task function will interpet this. -- 2.30.2