From 207026d29e20223676ea587bb5bdba00b406b354 Mon Sep 17 00:00:00 2001 From: Jan Zielinski Date: Thu, 1 Aug 2019 15:14:03 +0200 Subject: [PATCH 1/1] swr/rasterizer: modernize thread TLB Reviewed-by: Alok Hota --- .../drivers/swr/rasterizer/core/api.cpp | 35 +++++++----- src/gallium/drivers/swr/rasterizer/core/api.h | 27 ++++++--- .../swr/rasterizer/core/backend_impl.h | 1 - .../drivers/swr/rasterizer/core/context.h | 5 ++ .../drivers/swr/rasterizer/core/frontend.cpp | 2 +- .../drivers/swr/rasterizer/core/knobs.h | 3 + .../swr/rasterizer/core/rasterizer.cpp | 10 +++- .../drivers/swr/rasterizer/core/state.h | 5 ++ .../drivers/swr/rasterizer/core/threads.cpp | 4 +- .../swr/rasterizer/jitter/JitManager.cpp | 3 +- .../drivers/swr/rasterizer/jitter/builder.cpp | 1 + .../drivers/swr/rasterizer/jitter/builder.h | 1 + .../swr/rasterizer/jitter/builder_misc.h | 11 ++++ .../swr/rasterizer/jitter/fetch_jit.cpp | 57 +++++++++++++++++++ 14 files changed, 135 insertions(+), 30 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index f1b0dc03352..a6f86b36f98 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -71,6 +71,21 @@ HANDLE SwrCreateContext(SWR_CREATECONTEXT_INFO* pCreateInfo) pContext->privateStateSize = pCreateInfo->privateStateSize; + // initialize callback functions + pContext->pfnLoadTile = pCreateInfo->pfnLoadTile; + pContext->pfnStoreTile = pCreateInfo->pfnStoreTile; + pContext->pfnTranslateGfxptrForRead = pCreateInfo->pfnTranslateGfxptrForRead; + pContext->pfnTranslateGfxptrForWrite = pCreateInfo->pfnTranslateGfxptrForWrite; + pContext->pfnMakeGfxPtr = pCreateInfo->pfnMakeGfxPtr; + pContext->pfnCreateMemoryContext = pCreateInfo->pfnCreateMemoryContext; + pContext->pfnDestroyMemoryContext = pCreateInfo->pfnDestroyMemoryContext; + pContext->pfnUpdateSoWriteOffset = pCreateInfo->pfnUpdateSoWriteOffset; + pContext->pfnUpdateStats = 
pCreateInfo->pfnUpdateStats; + pContext->pfnUpdateStatsFE = pCreateInfo->pfnUpdateStatsFE; + + + pContext->hExternalMemory = pCreateInfo->hExternalMemory; + pContext->MAX_DRAWS_IN_FLIGHT = KNOB_MAX_DRAWS_IN_FLIGHT; if (pCreateInfo->MAX_DRAWS_IN_FLIGHT != 0) { @@ -169,13 +184,13 @@ HANDLE SwrCreateContext(SWR_CREATECONTEXT_INFO* pCreateInfo) pContext->threadPool.pThreadData ? pContext->threadPool.pThreadData[i].numaId : 0; pContext->ppScratch[i] = (uint8_t*)VirtualAllocExNuma(GetCurrentProcess(), nullptr, - 32 * sizeof(KILOBYTE), + KNOB_WORKER_SCRATCH_SPACE_SIZE, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE, numaNode); #else pContext->ppScratch[i] = - (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4); + (uint8_t*)AlignedMalloc(KNOB_WORKER_SCRATCH_SPACE_SIZE, KNOB_SIMD_WIDTH * 4); #endif #if defined(KNOB_ENABLE_AR) @@ -200,17 +215,6 @@ HANDLE SwrCreateContext(SWR_CREATECONTEXT_INFO* pCreateInfo) // initialize hot tile manager pContext->pHotTileMgr = new HotTileMgr(); - // initialize callback functions - pContext->pfnLoadTile = pCreateInfo->pfnLoadTile; - pContext->pfnStoreTile = pCreateInfo->pfnStoreTile; - pContext->pfnTranslateGfxptrForRead = pCreateInfo->pfnTranslateGfxptrForRead; - pContext->pfnTranslateGfxptrForWrite = pCreateInfo->pfnTranslateGfxptrForWrite; - pContext->pfnMakeGfxPtr = pCreateInfo->pfnMakeGfxPtr; - pContext->pfnUpdateSoWriteOffset = pCreateInfo->pfnUpdateSoWriteOffset; - pContext->pfnUpdateStats = pCreateInfo->pfnUpdateStats; - pContext->pfnUpdateStatsFE = pCreateInfo->pfnUpdateStatsFE; - - // pass pointer to bucket manager back to caller #ifdef KNOB_ENABLE_RDTSC pCreateInfo->pBucketMgr = pContext->pBucketMgr; @@ -1531,7 +1535,9 @@ void SWR_API SwrDiscardRect(HANDLE hContext, uint32_t attachmentMask, const SWR_ void SwrDispatch(HANDLE hContext, uint32_t threadGroupCountX, uint32_t threadGroupCountY, - uint32_t threadGroupCountZ) + uint32_t threadGroupCountZ + +) { if (KNOB_TOSS_DRAW) { @@ -1551,6 +1557,7 @@ void SwrDispatch(HANDLE 
hContext, pTaskData->threadGroupCountX = threadGroupCountX; pTaskData->threadGroupCountY = threadGroupCountY; pTaskData->threadGroupCountZ = threadGroupCountZ; + pTaskData->enableThreadDispatch = false; uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ; diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h index 8058defb388..93ea0d42535 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.h +++ b/src/gallium/drivers/swr/rasterizer/core/api.h @@ -147,14 +147,20 @@ typedef void(SWR_API* PFN_CLEAR_TILE)(HANDLE hPrivateContex typedef void*(SWR_API* PFN_TRANSLATE_GFXPTR_FOR_READ)(HANDLE hPrivateContext, gfxptr_t xpAddr, - bool* pbNullTileAccessed); + bool* pbNullTileAccessed, + HANDLE hPrivateWorkerData); typedef void*(SWR_API* PFN_TRANSLATE_GFXPTR_FOR_WRITE)(HANDLE hPrivateContext, gfxptr_t xpAddr, - bool* pbNullTileAccessed); + bool* pbNullTileAccessed, + HANDLE hPrivateWorkerData); typedef gfxptr_t(SWR_API* PFN_MAKE_GFXPTR)(HANDLE hPrivateContext, void* sysAddr); +typedef HANDLE(SWR_API* PFN_CREATE_MEMORY_CONTEXT)(HANDLE hExternalMemory); + +typedef void(SWR_API* PFN_DESTROY_MEMORY_CONTEXT)(HANDLE hExternalMemory, HANDLE hMemoryContext); + ////////////////////////////////////////////////////////////////////////// /// @brief Callback to allow driver to update their copy of streamout write offset. /// This is call is made for any draw operation that has streamout enabled @@ -219,10 +225,11 @@ struct SWR_API_THREADING_INFO // Independent of KNOB_MAX_THREADS_PER_CORE. 
}; -struct SWR_WORKER_DATA -{ - HANDLE hArContext; // handle to the archrast context -}; +////////////////////////////////////////////////////////////////////////// +/// SWR_CONTEXT +/// Forward Declaration (see context.h for full definition) +///////////////////////////////////////////////////////////////////////// +class SWR_CONTEXT; ////////////////////////////////////////////////////////////////////////// /// SWR_WORKER_PRIVATE_STATE @@ -233,7 +240,7 @@ struct SWR_WORKER_DATA ///////////////////////////////////////////////////////////////////////// struct SWR_WORKER_PRIVATE_STATE { - typedef void(SWR_API* PFN_WORKER_DATA)(HANDLE hWorkerPrivateData, uint32_t iWorkerNum); + typedef void(SWR_API* PFN_WORKER_DATA)(SWR_CONTEXT* pContext, HANDLE hWorkerPrivateData, uint32_t iWorkerNum); size_t perWorkerPrivateStateSize; ///< Amount of data to allocate per-worker PFN_WORKER_DATA pfnInitWorkerData; ///< Init function for worker data. If null @@ -260,6 +267,8 @@ struct SWR_CREATECONTEXT_INFO PFN_TRANSLATE_GFXPTR_FOR_READ pfnTranslateGfxptrForRead; PFN_TRANSLATE_GFXPTR_FOR_WRITE pfnTranslateGfxptrForWrite; PFN_MAKE_GFXPTR pfnMakeGfxPtr; + PFN_CREATE_MEMORY_CONTEXT pfnCreateMemoryContext; + PFN_DESTROY_MEMORY_CONTEXT pfnDestroyMemoryContext; PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset; PFN_UPDATE_STATS pfnUpdateStats; PFN_UPDATE_STATS_FE pfnUpdateStatsFE; @@ -275,6 +284,9 @@ struct SWR_CREATECONTEXT_INFO // ArchRast event manager. HANDLE hArEventManager; + // handle to external memory for worker datas to create memory contexts + HANDLE hExternalMemory; + // Input (optional): Threading info that overrides any set KNOB values. SWR_THREADING_INFO* pThreadInfo; @@ -588,7 +600,6 @@ SWR_FUNC(void, uint32_t threadGroupCountY, uint32_t threadGroupCountZ); - /// @note this enum needs to be kept in sync with HOTTILE_STATE! 
enum SWR_TILE_STATE { diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h index 9e74e2cee8e..1bd2e743781 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h @@ -1141,7 +1141,6 @@ void BackendPixelRate(DRAW_CONTEXT* pDC, // execute pixel shader RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId); state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext); - UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes))); RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0); // update stats diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index 8891cc881a3..13cb7c8b856 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -535,6 +535,8 @@ struct SWR_CONTEXT PFN_TRANSLATE_GFXPTR_FOR_READ pfnTranslateGfxptrForRead; PFN_TRANSLATE_GFXPTR_FOR_WRITE pfnTranslateGfxptrForWrite; PFN_MAKE_GFXPTR pfnMakeGfxPtr; + PFN_CREATE_MEMORY_CONTEXT pfnCreateMemoryContext; + PFN_DESTROY_MEMORY_CONTEXT pfnDestroyMemoryContext; PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset; PFN_UPDATE_STATS pfnUpdateStats; PFN_UPDATE_STATS_FE pfnUpdateStatsFE; @@ -558,6 +560,9 @@ struct SWR_CONTEXT // ArchRast thread contexts. 
HANDLE* pArContext; + // handle to external memory for worker datas to create memory contexts + HANDLE hExternalMemory; + BucketManager *pBucketMgr; }; diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index a27b33d2051..1aa98f49fd7 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -578,7 +578,7 @@ static void StreamOut( { bool nullTileAccessed = false; void* pWriteOffset = pDC->pContext->pfnTranslateGfxptrForWrite( - GetPrivateState(pDC), soContext.pBuffer[i]->pWriteOffset, &nullTileAccessed); + GetPrivateState(pDC), soContext.pBuffer[i]->pWriteOffset, &nullTileAccessed, pWorkerData); *((uint32_t*)pWriteOffset) = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t); } diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h index 92fbf8840e1..8dab50dab01 100644 --- a/src/gallium/drivers/swr/rasterizer/core/knobs.h +++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h @@ -84,6 +84,9 @@ #define KNOB_GUARDBAND_WIDTH 32768.0f #define KNOB_GUARDBAND_HEIGHT 32768.0f +// Scratch space requirements per worker. 
Currently only used for TGSM sizing for some stages +#define KNOB_WORKER_SCRATCH_SPACE_SIZE (32 * 1024) + /////////////////////////////// // Macro tile configuration /////////////////////////////// diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp index 44c486c80bf..4f1d8ccff22 100644 --- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp @@ -271,7 +271,7 @@ void RasterizeSimplePoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTi {48, 49, 52, 53, 56, 57, 60, 61}, {50, 51, 54, 55, 58, 59, 62, 63}}; - OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc; + OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc = {}; // pull point information from triangle buffer // @todo use structs for readability @@ -287,8 +287,12 @@ void RasterizeSimplePoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTi // mask indices by the maximum valid index for x/y of coveragemap. uint32_t tX = workDesc.triFlags.coverageMask & 0x7; uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7; - // todo: multisample points? 
- triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX]; + for (uint32_t i = 0; i < _countof(triDesc.coverageMask); ++i) + { + triDesc.coverageMask[i] = 1ULL << coverageMap[tY][tX]; + } + triDesc.anyCoveredSamples = triDesc.coverageMask[0]; + triDesc.innerCoverageMask = triDesc.coverageMask[0]; // no persp divide needed for points triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs; diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index 5202e6146a1..25d4fed9578 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -213,6 +213,11 @@ struct SIMDVERTEX_T typename SIMD_T::Vec4 attrib[SWR_VTX_NUM_SLOTS]; }; +struct SWR_WORKER_DATA +{ + HANDLE hArContext; // handle to the archrast context +}; + ////////////////////////////////////////////////////////////////////////// /// SWR_SHADER_STATS /// @brief Structure passed to shader for stats collection. diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index a0ddd96c61f..987469340d2 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -1216,7 +1216,7 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) pPool->pThreadData[i].pWorkerPrivateData = pWorkerData; if (pContext->workerPrivateState.pfnInitWorkerData) { - pContext->workerPrivateState.pfnInitWorkerData(pWorkerData, i); + pContext->workerPrivateState.pfnInitWorkerData(pContext, pWorkerData, i); } pWorkerData = PtrAdd(pWorkerData, perWorkerSize); } @@ -1396,7 +1396,7 @@ void DestroyThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) if (pContext->workerPrivateState.pfnFinishWorkerData) { pContext->workerPrivateState.pfnFinishWorkerData( - pPool->pThreadData[t].pWorkerPrivateData, t); + pContext, pPool->pThreadData[t].pWorkerPrivateData, t); } } diff --git 
a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp index 0f78bd661a5..74edd4febbc 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp @@ -649,7 +649,8 @@ JitCache::JitCache() int ExecUnhookedProcess(const std::string& CmdLine, std::string* pStdOut, std::string* pStdErr) { - return ExecCmd(CmdLine, "", pStdOut, pStdErr); + + return ExecCmd(CmdLine, nullptr, pStdOut, pStdErr); } /// Calculate actual directory where module will be cached. diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp index 30481b43208..e7ba0040d9d 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp @@ -110,6 +110,7 @@ namespace SwrJit mSimdVectorTy = ArrayType::get(mSimdFP32Ty, 4); mSimdVectorIntTy = ArrayType::get(mSimdInt32Ty, 4); mSimdVectorTRTy = ArrayType::get(mSimdFP32Ty, 5); + mSimdVectorTRIntTy = ArrayType::get(mSimdInt32Ty, 5); } /// @brief Mark this alloca as temporary to avoid hoisting later on diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h index 6e1d94b9e68..9f2c199464d 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h @@ -108,6 +108,7 @@ namespace SwrJit Type* mSimdVectorTy; Type* mSimdVectorTRTy; Type* mSimdVectorIntTy; + Type* mSimdVectorTRIntTy; // Built in types: simd16 diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h index 3987a5f3476..616c73b254a 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h @@ -50,6 +50,17 @@ Constant* C(const std::initializer_list& constList) return 
ConstantVector::get(vConsts); } +template <typename Ty> +Constant* C(const std::vector<Ty>& constList) +{ + std::vector<Constant*> vConsts; + for (auto i : constList) + { + vConsts.push_back(C((Ty)i)); + } + return ConstantVector::get(vConsts); +} + template <typename Ty> Constant* CA(LLVMContext& ctx, ArrayRef<Ty> constList) { diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index fe5b48e584b..72704e94e4c 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -1103,6 +1103,63 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE& fetchState, } } + +typedef void* (*PFN_TRANSLATEGFXADDRESS_FUNC)(void* pdc, gfxptr_t va, bool* out_pbNullTileAccessed, void* pWorkerData); + +template <typename T> +void GetSimdValidIndicesGfx(gfxptr_t indices, + gfxptr_t lastIndex, + uint32_t vWidth, + PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate, + void* pdc, + uint32_t* outIndices, + void* pWorkerData) +{ + SWR_ASSERT(outIndices != nullptr); + + gfxptr_t indexPtr = indices; + for (int64_t lane = 0; lane < vWidth; lane++) + { + uint32_t index = 0; + + if (indexPtr < lastIndex) + { + // translate indexPtr and load from it + T* addr = (T*)pfnTranslate(pdc, indexPtr, nullptr, pWorkerData); + SWR_ASSERT(addr != nullptr); + index = *addr; + } + + // index to 32 bits and insert into the correct simd lane + outIndices[lane] = index; + + indexPtr += sizeof(T); + } +} + +void GetSimdValid8bitIndicesGfx(gfxptr_t indices, + gfxptr_t lastIndex, + uint32_t vWidth, + PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate, + void* pdc, + uint32_t* outIndices, + void* pWorkerData) +{ + GetSimdValidIndicesGfx<uint8_t>(indices, lastIndex, vWidth, pfnTranslate, pdc, outIndices, pWorkerData); +} + +void GetSimdValid16bitIndicesGfx(gfxptr_t indices, + gfxptr_t lastIndex, + uint32_t vWidth, + PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate, + void* pdc, + uint32_t* outIndices, + void* pWorkerData) +{ + GetSimdValidIndicesGfx<uint16_t>(indices,
lastIndex, vWidth, pfnTranslate, pdc, outIndices, pWorkerData); +} + + template <typename T> Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex) { -- 2.30.2