From ec9d4c4b372df773e4453c228b938e7c6c526c4c Mon Sep 17 00:00:00 2001
From: Tim Rowley
Date: Mon, 21 Mar 2016 11:15:32 -0600
Subject: [PATCH] swr: [rasterizer core] Globally cache allocated arena blocks for fast re-allocation.

---
 .../drivers/swr/rasterizer/core/api.cpp      |   6 +-
 .../drivers/swr/rasterizer/core/arena.h      |  58 +++++----
 .../drivers/swr/rasterizer/core/context.h    | 120 +++++++++++++++++-
 .../drivers/swr/rasterizer/core/fifo.hpp     |   6 +-
 .../drivers/swr/rasterizer/core/frontend.cpp |  10 +-
 .../drivers/swr/rasterizer/core/threads.cpp  |   2 +-
 .../drivers/swr/rasterizer/core/tilemgr.cpp  |   2 +-
 .../drivers/swr/rasterizer/core/tilemgr.h    |  10 +-
 8 files changed, 168 insertions(+), 46 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 453d0295b54..6ebb3f87f7a 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -66,11 +66,11 @@ HANDLE SwrCreateContext(
 
     for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
     {
-        pContext->dcRing[dc].pArena = new Arena();
+        pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
         pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena));
         pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
 
-        pContext->dsRing[dc].pArena = new Arena();
+        pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
     }
 
     if (!KNOB_SINGLE_THREADED)
@@ -252,7 +252,7 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
         uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
         pCurDrawContext->pState = &pContext->dsRing[dsIndex];
 
-        Arena& stateArena = *(pCurDrawContext->pState->pArena);
+        auto& stateArena = *(pCurDrawContext->pState->pArena);
 
         // Copy previous state to current state.
         if (pContext->pPrevDrawContext)
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h
index 4cdb728e1ef..71fb258f4d4 100644
--- a/src/gallium/drivers/swr/rasterizer/core/arena.h
+++ b/src/gallium/drivers/swr/rasterizer/core/arena.h
@@ -51,6 +51,16 @@ public:
     }
 };
 
+static const size_t ARENA_BLOCK_ALIGN = KNOB_SIMD_WIDTH * 4;
+
+struct ArenaBlock
+{
+    void*       pMem = nullptr;
+    size_t      blockSize = 0;
+    ArenaBlock* pNext = nullptr;
+};
+static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN, "Increase BLOCK_ALIGN size");
+
 template<typename MutexT = std::mutex, typename T = DefaultAllocator>
 class TArena
 {
@@ -67,12 +77,12 @@ public:
         if (m_pCurBlock)
         {
             ArenaBlock* pCurBlock = m_pCurBlock;
-            pCurBlock->offset = AlignUp(pCurBlock->offset, align);
+            m_offset = AlignUp(m_offset, align);
 
-            if ((pCurBlock->offset + size) <= pCurBlock->blockSize)
+            if ((m_offset + size) <= pCurBlock->blockSize)
             {
-                void* pMem = PtrAdd(pCurBlock->pMem, pCurBlock->offset);
-                pCurBlock->offset += size;
+                void* pMem = PtrAdd(pCurBlock->pMem, m_offset);
+                m_offset += size;
                 m_size += size;
                 return pMem;
             }
         }
@@ -85,21 +95,21 @@ public:
         size_t blockSize = std::max(m_size + ArenaBlockSize, std::max(size, ArenaBlockSize));
 
         // Add in one BLOCK_ALIGN unit to store ArenaBlock in.
-        blockSize = AlignUp(blockSize + BLOCK_ALIGN, BLOCK_ALIGN);
+        blockSize = AlignUp(blockSize + ARENA_BLOCK_ALIGN, ARENA_BLOCK_ALIGN);
 
-        void *pMem = m_allocator.AllocateAligned(blockSize, BLOCK_ALIGN);    // Arena blocks are always simd byte aligned.
+        void *pMem = m_allocator.AllocateAligned(blockSize, ARENA_BLOCK_ALIGN);    // Arena blocks are always simd byte aligned.
         SWR_ASSERT(pMem != nullptr);
 
         ArenaBlock* pNewBlock = new (pMem) ArenaBlock();
 
         if (pNewBlock != nullptr)
         {
+            m_offset = 0;
             pNewBlock->pNext = m_pCurBlock;
 
             m_pCurBlock = pNewBlock;
-            m_pCurBlock->pMem = PtrAdd(pMem, BLOCK_ALIGN);
-            m_pCurBlock->blockSize = blockSize - BLOCK_ALIGN;
-
+            m_pCurBlock->pMem = PtrAdd(pMem, ARENA_BLOCK_ALIGN);
+            m_pCurBlock->blockSize = blockSize - ARENA_BLOCK_ALIGN;
         }
 
         return AllocAligned(size, align);
@@ -134,10 +144,10 @@ public:
 
     void Reset(bool removeAll = false)
     {
+        m_offset = 0;
+
         if (m_pCurBlock)
        {
-            m_pCurBlock->offset = 0;
-
             ArenaBlock *pUsedBlocks = m_pCurBlock->pNext;
             m_pCurBlock->pNext = nullptr;
             while (pUsedBlocks)
@@ -162,28 +172,20 @@ public:
 
 private:
 
-    static const size_t BLOCK_ALIGN = KNOB_SIMD_WIDTH * 4;
+    ArenaBlock*      m_pCurBlock = nullptr;
+    size_t           m_offset    = 0;
+    size_t           m_size      = 0;
+
+    /// @note Mutex is only used by sync allocation functions.
+    MutexT           m_mutex;
 
     DefaultAllocator m_defAllocator;
    T&               m_allocator;
-
-    struct ArenaBlock
-    {
-        void*       pMem = nullptr;
-        size_t      blockSize = 0;
-        size_t      offset = 0;
-        ArenaBlock* pNext = nullptr;
-    };
-    static_assert(sizeof(ArenaBlock) <= BLOCK_ALIGN, "Increase BLOCK_ALIGN size");
-
-    ArenaBlock*      m_pCurBlock = nullptr;
-    size_t           m_size = 0;
-
-    /// @note Mutex is only used by sync allocation functions.
-    MutexT           m_mutex;
 };
 
-typedef TArena<> Arena;
+template<typename T = DefaultAllocator>
+using Arena = TArena<std::mutex, T>;
+using StdArena = Arena<DefaultAllocator>;
 
 struct NullMutex
 {
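Note: two details of the arena.h changes are worth spelling out. Allocation is a plain bump of the (now block-independent) m_offset within the current block, and each raw allocation stores its own ArenaBlock header in its first ARENA_BLOCK_ALIGN bytes, so the user-visible region stays SIMD-aligned and the caching allocator below can recover the header straight from the pointer it is handed. The following is a minimal standalone sketch of that scheme, not the SWR sources; BLOCK_ALIGN, Block, and SimpleArena are illustrative names, aligned_alloc (C++17) stands in for the pluggable allocator, and alignments are assumed to be powers of two no larger than BLOCK_ALIGN.

#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <new>

static const size_t BLOCK_ALIGN = 32;            // stand-in for KNOB_SIMD_WIDTH * 4

static size_t AlignUpPow2(size_t value, size_t align)
{
    return (value + align - 1) & ~(align - 1);
}

struct Block
{
    uint8_t* pMem      = nullptr;                // user memory, one BLOCK_ALIGN past the header
    size_t   blockSize = 0;
    Block*   pNext     = nullptr;
};

struct SimpleArena
{
    void* AllocAligned(size_t size, size_t align)
    {
        m_offset = AlignUpPow2(m_offset, align);
        if (m_pCur && (m_offset + size) <= m_pCur->blockSize)
        {
            void* p = m_pCur->pMem + m_offset;   // fast path: just bump the offset
            m_offset += size;
            return p;
        }

        // Slow path: grow. One BLOCK_ALIGN unit at the front of the raw
        // allocation holds the Block header itself, as arena.h does.
        size_t blockSize = AlignUpPow2(size + BLOCK_ALIGN, BLOCK_ALIGN);
        void* pRaw = aligned_alloc(BLOCK_ALIGN, blockSize);
        if (pRaw == nullptr)
        {
            return nullptr;
        }
        Block* pNew = new (pRaw) Block();
        pNew->pMem      = static_cast<uint8_t*>(pRaw) + BLOCK_ALIGN;
        pNew->blockSize = blockSize - BLOCK_ALIGN;
        pNew->pNext     = m_pCur;
        m_pCur   = pNew;
        m_offset = 0;
        return AllocAligned(size, align);        // retry against the new block
    }

    Block* m_pCur   = nullptr;
    size_t m_offset = 0;
};

Hoisting ArenaBlock and ARENA_BLOCK_ALIGN out of TArena is what lets context.h's CachingAllocator traffic in the same header type the arena embeds in every block.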
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index ed972fa5478..6240b2e08d3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -360,6 +360,120 @@ struct BACKEND_FUNCS
     PFN_OUTPUT_MERGER pfnOutputMerger;
 };
 
+// Caching Allocator for Arena
+struct CachingAllocator : DefaultAllocator
+{
+    void* AllocateAligned(size_t size, size_t align)
+    {
+        SWR_ASSERT(size >= sizeof(ArenaBlock));
+
+        {
+            // search cached blocks
+            std::lock_guard<std::mutex> l(m_mutex);
+            ArenaBlock* pPrevBlock = &m_cachedBlocks;
+            ArenaBlock* pBlock = m_cachedBlocks.pNext;
+            ArenaBlock* pPotentialBlock = nullptr;
+            ArenaBlock* pPotentialPrev = nullptr;
+
+            while (pBlock)
+            {
+                if (pBlock->blockSize >= (size - ARENA_BLOCK_ALIGN))
+                {
+                    if (pBlock == AlignUp(pBlock, align))
+                    {
+                        if (pBlock->blockSize == size)
+                        {
+                            // Won't find a better match
+                            break;
+                        }
+
+                        // We could use this as it is larger than we wanted, but
+                        // continue to search for a better match
+                        pPotentialBlock = pBlock;
+                        pPotentialPrev = pPrevBlock;
+                    }
+                }
+                else
+                {
+                    // Blocks are sorted by size (biggest first)
+                    // So, if we get here, there are no blocks
+                    // large enough, fall through to allocation.
+                    pBlock = nullptr;
+                    break;
+                }
+
+                pPrevBlock = pBlock;
+                pBlock = pBlock->pNext;
+            }
+
+            if (!pBlock)
+            {
+                // Couldn't find an exact match, use next biggest size
+                pBlock = pPotentialBlock;
+                pPrevBlock = pPotentialPrev;
+            }
+
+            if (pBlock)
+            {
+                SWR_ASSERT(pPrevBlock && pPrevBlock->pNext == pBlock);
+                pPrevBlock->pNext = pBlock->pNext;
+                pBlock->pNext = nullptr;
+
+                return pBlock;
+            }
+        }
+
+        return this->DefaultAllocator::AllocateAligned(size, align);
+    }
+
+    void Free(void* pMem)
+    {
+        if (pMem)
+        {
+            ArenaBlock* pNewBlock = reinterpret_cast<ArenaBlock*>(pMem);
+            SWR_ASSERT(pNewBlock->blockSize >= 0 && pNewBlock->pMem != nullptr);
+
+            std::unique_lock<std::mutex> l(m_mutex);
+            ArenaBlock* pPrevBlock = &m_cachedBlocks;
+            ArenaBlock* pBlock = m_cachedBlocks.pNext;
+
+            while (pBlock)
+            {
+                if (pNewBlock->blockSize >= pBlock->blockSize)
+                {
+                    // Insert here
+                    break;
+                }
+                pPrevBlock = pBlock;
+                pBlock = pBlock->pNext;
+            }
+
+            // Insert into list
+            SWR_ASSERT(pPrevBlock);
+            pPrevBlock->pNext = pNewBlock;
+            pNewBlock->pNext = pBlock;
+        }
+    }
+
+    ~CachingAllocator()
+    {
+        // Free all cached blocks
+        ArenaBlock* pBlock = m_cachedBlocks.pNext;
+        while (pBlock)
+        {
+            ArenaBlock* pNext = pBlock->pNext;
+            this->DefaultAllocator::Free(pBlock);
+            pBlock = pNext;
+        }
+    }
+
+    ArenaBlock m_cachedBlocks;
+    std::mutex m_mutex;
+
+};
+
+using CachingArena = Arena<CachingAllocator>;
+
 // Draw State
 struct DRAW_STATE
 {
@@ -371,7 +485,7 @@ struct DRAW_STATE
     BACKEND_FUNCS backendFuncs;
     PFN_PROCESS_PRIMS pfnProcessPrims;
 
-    Arena* pArena;     // This should only be used by API thread.
+    CachingArena* pArena;     // This should only be used by API thread.
 };
 
 // Draw Context
@@ -398,7 +512,7 @@ struct DRAW_CONTEXT
     DispatchQueue* pDispatch;   // Queue for thread groups. (isCompute)
 
     DRAW_STATE* pState;
-    Arena* pArena;
+    CachingArena* pArena;
 
     uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS];  // Scratch space used for spill fills.
 };
@@ -476,6 +590,8 @@ struct SWR_CONTEXT
 
     // Scratch space for workers.
     uint8_t* pScratch[KNOB_MAX_NUM_THREADS];
+
+    CachingAllocator cachingArenaAllocator;
 };
 
 void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId);
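Note: the cache inside CachingAllocator is a singly linked list kept sorted biggest-first and headed by a dummy node (m_cachedBlocks), so unlinking never has to special-case the head. Because of the ordering, the search can stop at the first block that is too small, and the last acceptable candidate it saw is the tightest fit. The standalone sketch below isolates just those two invariants; Node, InsertSorted, and TakeBestFit are illustrative names, and the alignment test from the real code is omitted.

#include <cstddef>

struct Node
{
    size_t size  = 0;
    Node*  pNext = nullptr;
};

// Insert keeping the list sorted biggest-first, as CachingAllocator::Free does.
void InsertSorted(Node& head, Node* pNew)
{
    Node* pPrev = &head;
    Node* pCur  = head.pNext;
    while (pCur && pNew->size < pCur->size)
    {
        pPrev = pCur;
        pCur  = pCur->pNext;
    }
    pPrev->pNext = pNew;
    pNew->pNext  = pCur;
}

// Find a block of at least `size`, preferring an exact fit. Since the list is
// biggest-first, the last usable candidate seen is the tightest fit, and the
// walk can stop at the first block that is too small.
Node* TakeBestFit(Node& head, size_t size)
{
    Node* pPrev     = &head;
    Node* pCur      = head.pNext;
    Node* pBest     = nullptr;
    Node* pBestPrev = nullptr;
    while (pCur)
    {
        if (pCur->size < size)
        {
            break;              // everything after this is smaller still
        }
        pBest     = pCur;       // usable; keep looking for a tighter fit
        pBestPrev = pPrev;
        if (pCur->size == size)
        {
            break;              // exact match, won't find better
        }
        pPrev = pCur;
        pCur  = pCur->pNext;
    }
    if (pBest)
    {
        pBestPrev->pNext = pBest->pNext;   // unlink from the cache
        pBest->pNext = nullptr;
    }
    return pBest;
}

The sorted insert makes Free linear in the number of cached blocks, but the cache only ever holds blocks retired by the KNOB_MAX_DRAWS_IN_FLIGHT arenas, so the list should stay short in practice.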
diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
index 7e556012e6b..ccf0b70544f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
+++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
@@ -49,7 +49,8 @@ struct QUEUE
     static const uint32_t mBlockSizeShift = 6;
     static const uint32_t mBlockSize = 1 << mBlockSizeShift;
 
-    void clear(Arena& arena)
+    template <typename ArenaT>
+    void clear(ArenaT& arena)
     {
         mHead = 0;
         mTail = 0;
@@ -102,7 +103,8 @@ struct QUEUE
         mNumEntries --;
     }
 
-    bool enqueue_try_nosync(Arena& arena, const T* entry)
+    template <typename ArenaT>
+    bool enqueue_try_nosync(ArenaT& arena, const T* entry)
     {
         memcpy(&mCurBlock[mTail], entry, sizeof(T));

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index e780ffbf175..36721e00beb 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -881,7 +881,7 @@ static void GeometryShaderStage(
 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
     void **ppStreamCutBuffer)
 {
-    Arena* pArena = pDC->pArena;
+    auto pArena = pDC->pArena;
     SWR_ASSERT(pArena != nullptr);
     SWR_ASSERT(state.gsState.gsEnable);
     // allocate arena space to hold GS output verts
@@ -1813,7 +1813,7 @@ void BinTriangles(
             work.pfnWork = gRasterizerTable[rastState.scissorEnable][SWR_MULTISAMPLE_1X];
         }
 
-        Arena* pArena = pDC->pArena;
+        auto pArena = pDC->pArena;
         SWR_ASSERT(pArena != nullptr);
 
         // store active attribs
@@ -1985,7 +1985,7 @@ void BinPoints(
             work.pfnWork = RasterizeSimplePoint;
 
-            Arena* pArena = pDC->pArena;
+            auto pArena = pDC->pArena;
             SWR_ASSERT(pArena != nullptr);
 
             // store attributes
@@ -2119,7 +2119,7 @@ void BinPoints(
             work.pfnWork = RasterizeTriPoint;
 
-            Arena* pArena = pDC->pArena;
+            auto pArena = pDC->pArena;
             SWR_ASSERT(pArena != nullptr);
 
             // store active attribs
@@ -2336,7 +2336,7 @@ void BinLines(
         work.pfnWork = RasterizeLine;
 
-        Arena* pArena = pDC->pArena;
+        auto pArena = pDC->pArena;
         SWR_ASSERT(pArena != nullptr);
 
         // store active attribs

diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index ff25e82f0fe..ce8646fb28d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -290,7 +290,7 @@ INLINE void CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
         _ReadWriteBarrier();
 
         // Cleanup memory allocations
-        pDC->pArena->Reset();
+        pDC->pArena->Reset(true);
         pDC->pTileMgr->initialize();
 
         pContext->dcRing.Dequeue();  // Remove from tail

diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
index f3c24dacb48..89c779e04d9 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -56,7 +56,7 @@ void DispatchQueue::operator delete(void *p)
     _aligned_free(p);
 }
 
-MacroTileMgr::MacroTileMgr(Arena& arena) : mArena(arena)
+MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena)
 {
 }
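Note: the fifo.hpp hunks above and the tilemgr.h hunks below make the same move: QUEUE's arena-consuming methods take the arena as a template parameter instead of naming one concrete Arena class, so those headers no longer care which allocator backs the arena, and call sites (the `auto pArena` changes in frontend.cpp) no longer spell the type either. A minimal sketch of the pattern; WorkQueueSketch is an illustrative name, and the dequeue side and block chaining of the real QUEUE are omitted.

#include <cstddef>

// Any type exposing AllocAligned(size, align) satisfies the implicit "arena"
// concept, so this header never needs to see CachingArena's definition.
template <typename T>
struct WorkQueueSketch
{
    T*     mpBlock = nullptr;
    size_t mTail   = 0;
    size_t mCap    = 0;

    template <typename ArenaT>
    bool enqueue_try_nosync(ArenaT& arena, const T* entry)
    {
        if (mTail == mCap)
        {
            // Carve a fresh fixed-size block out of the caller's arena; the
            // arena owns the memory, so its Reset() reclaims every block at once.
            mpBlock = static_cast<T*>(arena.AllocAligned(64 * sizeof(T), alignof(T)));
            if (mpBlock == nullptr)
            {
                return false;
            }
            mTail = 0;
            mCap  = 64;
        }
        mpBlock[mTail++] = *entry;
        return true;
    }
};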
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
index f3e1373b00f..cf9d2fea32a 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -59,7 +59,8 @@ struct MacroTileQueue
 
     //////////////////////////////////////////////////////////////////////////
     /// @brief Clear fifo and unlock it.
-    void clear(Arena& arena)
+    template <typename ArenaT>
+    void clear(ArenaT& arena)
     {
         mFifo.clear(arena);
     }
@@ -71,7 +72,8 @@ struct MacroTileQueue
         return mFifo.peek();
     }
 
-    bool enqueue_try_nosync(Arena& arena, const BE_WORK* entry)
+    template <typename ArenaT>
+    bool enqueue_try_nosync(ArenaT& arena, const BE_WORK* entry)
     {
         return mFifo.enqueue_try_nosync(arena, entry);
     }
@@ -104,7 +106,7 @@ private:
 class MacroTileMgr
 {
 public:
-    MacroTileMgr(Arena& arena);
+    MacroTileMgr(CachingArena& arena);
     ~MacroTileMgr()
     {
         for (auto &tile : mTiles)
@@ -142,7 +144,7 @@ public:
     void operator delete (void *p);
 
 private:
-    Arena& mArena;
+    CachingArena& mArena;
     std::unordered_map<uint32_t, MacroTileQueue> mTiles;
 
     // Any tile that has work queued to it is a dirty tile.
-- 
2.30.2
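For reference, a hypothetical end-to-end illustration of how the pieces combine at runtime. SWR's real call sites are SwrCreateContext and CompleteDrawContext in the hunks above; this assumes Reset(true) releases every block through the arena's allocator, which is what routes them into CachingAllocator's cache rather than back to the OS. It compiles against the patched headers, modulo includes.

// One shared cache per SWR_CONTEXT.
CachingAllocator cache;

void DrawOnce()
{
    CachingArena arena(cache);
    void* pWork = arena.AllocAligned(64 * 1024, ARENA_BLOCK_ALIGN);
    // ... bin and rasterize using pWork ...
    (void)pWork;
    arena.Reset(true);   // blocks are parked on `cache`, not freed to the OS
}

// Calling DrawOnce() repeatedly pays for fresh blocks only on the first call;
// later draws are served from the cached list, which is the fast re-allocation
// the subject line refers to. ~CachingAllocator finally returns everything to
// the OS when the SWR context is destroyed.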