From: George Kyriazis Date: Wed, 2 May 2018 00:33:38 +0000 (-0500) Subject: swr/rast: Thread locked tiles improvement X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=4e52cb51b56eaae7153394ed712f49ce0ba63bcc;p=mesa.git swr/rast: Thread locked tiles improvement - Change tilemgr TILE_ID encoding to use Morton-order (Z-order). - Change locked tiles set to bitset. Makes clear, set, get much faster. Reviewed-by: Bruce Cherniak --- diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index 3458793fd8d..47f3633d54b 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -42,6 +42,7 @@ #include "core/tilemgr.h" #include "core/clip.h" #include "core/utils.h" +#include "core/tileset.h" #include "common/os.h" @@ -139,6 +140,11 @@ HANDLE SwrCreateContext( BindApiThread(pContext, 0); } + if (pContext->threadInfo.SINGLE_THREADED) + { + pContext->pSingleThreadLockedTiles = new TileSet(); + } + pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads]; pContext->pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64); @@ -245,7 +251,7 @@ void QueueWork(SWR_CONTEXT *pContext) { uint32_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId }; WorkOnFifoFE(pContext, 0, curDraw[0]); - WorkOnFifoBE(pContext, 0, curDraw[1], pContext->singleThreadLockedTiles, 0, 0); + WorkOnFifoBE(pContext, 0, curDraw[1], *pContext->pSingleThreadLockedTiles, 0, 0); } else { @@ -427,7 +433,8 @@ void SwrDestroyContext(HANDLE hContext) delete[] pContext->ppScratch; AlignedFree(pContext->pStats); - delete(pContext->pHotTileMgr); + delete pContext->pHotTileMgr; + delete pContext->pSingleThreadLockedTiles; pContext->~SWR_CONTEXT(); AlignedFree(GetContext(hContext)); diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index af8f4b8db4f..2cd61e4abbb 100644 
--- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -516,7 +516,7 @@ struct SWR_CONTEXT uint32_t lastFrameChecked; uint64_t lastDrawChecked; - TileSet singleThreadLockedTiles; + TileSet* pSingleThreadLockedTiles; // ArchRast thread contexts. HANDLE* pArContext; diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index 9e16246c3f4..f77ae22a80a 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -49,6 +49,7 @@ #include "rasterizer.h" #include "rdtsc_core.h" #include "tilemgr.h" +#include "tileset.h" @@ -587,7 +588,7 @@ bool WorkOnFifoBE( } // can only work on this draw if it's not in use by other threads - if (lockedTiles.find(tileID) != lockedTiles.end()) + if (lockedTiles.get(tileID)) { continue; } @@ -645,7 +646,7 @@ bool WorkOnFifoBE( else { // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again. 
- lockedTiles.insert(tileID); + lockedTiles.set(tileID); } } } diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h index cb918ddb60d..0489a3cc6cf 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.h +++ b/src/gallium/drivers/swr/rasterizer/core/threads.h @@ -62,7 +62,7 @@ struct THREAD_POOL THREAD_DATA *pApiThreadData; }; -typedef std::unordered_set TileSet; +struct TileSet; void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool); diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp index 28fa7877114..1bdef4bd7dd 100644 --- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp @@ -33,8 +33,6 @@ #include "core/multisample.h" #include "rdtsc_core.h" -#define TILE_ID(x,y) ((x << 16 | y)) - MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena) { } @@ -50,26 +48,35 @@ void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork) return; } - uint32_t id = TILE_ID(x, y); + uint32_t id = getTileId(x, y); + + if (id >= mTiles.size()) + { + mTiles.resize((16 + id) * 2); + } - MacroTileQueue &tile = mTiles[id]; - tile.mWorkItemsFE++; - tile.mId = id; + MacroTileQueue *pTile = mTiles[id]; + if (!pTile) + { + pTile = mTiles[id] = new MacroTileQueue(); + } + pTile->mWorkItemsFE++; + pTile->mId = id; - if (tile.mWorkItemsFE == 1) + if (pTile->mWorkItemsFE == 1) { - tile.clear(mArena); - mDirtyTiles.push_back(&tile); + pTile->clear(mArena); + mDirtyTiles.push_back(pTile); } mWorkItemsProduced++; - tile.enqueue_try_nosync(mArena, pWork); + pTile->enqueue_try_nosync(mArena, pWork); } void MacroTileMgr::markTileComplete(uint32_t id) { - SWR_ASSERT(mTiles.find(id) != mTiles.end()); - MacroTileQueue &tile = mTiles[id]; + SWR_ASSERT(mTiles.size() > id); + MacroTileQueue &tile = *mTiles[id]; uint32_t 
numTiles = tile.mWorkItemsFE; InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles); diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h index 2831010b12f..8392db1b05f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h +++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h @@ -31,6 +31,7 @@ #include #include #include "common/formats.h" +#include "common/intrin.h" #include "fifo.hpp" #include "context.h" #include "format_traits.h" @@ -41,7 +42,7 @@ struct MacroTileQueue { MacroTileQueue() { } - ~MacroTileQueue() { } + ~MacroTileQueue() { destroy(); } ////////////////////////////////////////////////////////////////////////// /// @brief Returns number of work items queued for this tile. @@ -110,9 +111,9 @@ public: MacroTileMgr(CachingArena& arena); ~MacroTileMgr() { - for (auto &tile : mTiles) + for (auto *pTile : mTiles) { - tile.second.destroy(); + delete pTile; } } @@ -136,13 +137,20 @@ public: static INLINE void getTileIndices(uint32_t tileID, uint32_t &x, uint32_t &y) { - y = tileID & 0xffff; - x = (tileID >> 16) & 0xffff; + // Morton / Z order of tiles + x = pext_u32(tileID, 0x55555555); + y = pext_u32(tileID, 0xAAAAAAAA); + } + + static INLINE uint32_t getTileId(uint32_t x, uint32_t y) + { + // Morton / Z order of tiles + return pdep_u32(x, 0x55555555) | pdep_u32(y, 0xAAAAAAAA); } private: CachingArena& mArena; - std::unordered_map mTiles; + std::vector mTiles; // Any tile that has work queued to it is a dirty tile. std::vector mDirtyTiles; diff --git a/src/gallium/drivers/swr/rasterizer/core/tileset.h b/src/gallium/drivers/swr/rasterizer/core/tileset.h new file mode 100644 index 00000000000..3eb4c5d1f00 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/tileset.h @@ -0,0 +1,105 @@ +/**************************************************************************** +* Copyright (C) 2018 Intel Corporation. All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
+*
+* @file tileset.h
+*
+* @brief Custom bitset class for managing locked tiles
+*
+******************************************************************************/
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <cstring>
+
+// NOTE(review): INLINE, AlignedMalloc and AlignedFree are not declared in this
+// header; it still relies on the including file to have provided them first.
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Growable bitset indexed by tile ID. Bits are packed into an
+///        array of size_t words that is reallocated whenever a bit past
+///        the current capacity is set.
+struct TileSet
+{
+    TileSet() = default;
+
+    // The set owns m_bits through a raw pointer, so a member-wise copy would
+    // leave two objects freeing the same allocation. Copying is disabled.
+    TileSet(const TileSet&) = delete;
+    TileSet& operator=(const TileSet&) = delete;
+
+    ~TileSet()
+    {
+        // m_bits stays null until the first set() call.
+        if (m_bits)
+        {
+            AlignedFree(m_bits);
+        }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Sets bit idx, growing the storage first when idx lies past
+    ///        the current capacity.
+    /// @param idx - index of the bit to set.
+    INLINE void set(size_t idx)
+    {
+        _grow(idx);
+        _get_word(idx) |= _bit_mask(idx);
+        // Track how far bits have been written so clear() can stop there.
+        m_maxSet = std::max(m_maxSet, idx + 1);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Tests bit idx.
+    /// @param idx - index of the bit to test.
+    /// @return true if the bit is set; false if it is clear or if idx is
+    ///         beyond the currently allocated capacity.
+    INLINE bool get(size_t idx) const
+    {
+        if (idx >= m_size)
+        {
+            return false;
+        }
+        return 0 != (m_bits[idx / BITS_PER_WORD] & _bit_mask(idx));
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Clears every bit. Only the words up to the highest index
+    ///        set since the previous clear() are zeroed, and the
+    ///        allocation is kept.
+    INLINE void clear()
+    {
+        if (m_maxSet)
+        {
+            const size_t num_words = (m_maxSet + BITS_OFFSET) / BITS_PER_WORD;
+            std::memset(m_bits, 0, sizeof(size_t) * num_words);
+            m_maxSet = 0;
+        }
+    }
+
+private:
+    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
+    static const size_t BITS_OFFSET = BITS_PER_WORD - 1; // mask for the bit position inside a word
+
+    size_t m_size = 0;        // capacity in bits, always a whole number of words
+    size_t m_maxSet = 0;      // 1 + highest index set since the last clear(), 0 if none
+    size_t* m_bits = nullptr; // word storage, null until the first _grow()
+
+    /// @brief Single-bit mask selecting idx's position inside its word.
+    static INLINE size_t _bit_mask(size_t idx)
+    {
+        return size_t(1) << (idx & BITS_OFFSET);
+    }
+
+    /// @brief Word holding bit idx. The caller guarantees idx < m_size.
+    INLINE size_t& _get_word(size_t idx)
+    {
+        return m_bits[idx / BITS_PER_WORD];
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Ensures the storage covers bit idx. Existing bits are
+    ///        preserved and newly added words are zeroed.
+    /// @param idx - index that must become addressable.
+    void _grow(size_t idx)
+    {
+        if (idx < m_size)
+        {
+            return;
+        }
+
+        // Round the required bit count (idx + 1) up to a whole number of words.
+        const size_t new_size  = (1 + idx + BITS_OFFSET) & ~BITS_OFFSET;
+        const size_t new_words = new_size / BITS_PER_WORD;
+        // m_size is 0 before the first allocation, so nothing is copied then.
+        const size_t old_words = m_size / BITS_PER_WORD;
+
+        // NOTE(review): the AlignedMalloc result is used unchecked, as before.
+        // 64 is its second argument (presumably the byte alignment).
+        size_t* newBits = (size_t*)AlignedMalloc(sizeof(size_t) * new_words, 64);
+
+        if (m_bits)
+        {
+            std::memcpy(newBits, m_bits, old_words * sizeof(size_t));
+            AlignedFree(m_bits);
+        }
+
+        // Zero only the tail that did not exist in the old buffer.
+        std::memset(newBits + old_words, 0, sizeof(size_t) * (new_words - old_words));
+
+        m_bits = newBits;
+        m_size = new_size;
+    }
+};