swr/rast: Thread locked tiles improvement

author George Kyriazis <george.kyriazis@intel.com>

Wed, 2 May 2018 00:33:38 +0000 (19:33 -0500)

committer George Kyriazis <george.kyriazis@intel.com>

Fri, 11 May 2018 16:26:35 +0000 (11:26 -0500)
author George Kyriazis <george.kyriazis@intel.com>
Wed, 2 May 2018 00:33:38 +0000 (19:33 -0500)
committer George Kyriazis <george.kyriazis@intel.com>
Fri, 11 May 2018 16:26:35 +0000 (11:26 -0500)
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp

index 3458793fd8d9001392c32aba76e5ce0c29f16174..47f3633d54ba9c7d12f3ac352c3eaa08764c7799 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -42,6 +42,7 @@
  #include "core/tilemgr.h"
  #include "core/clip.h"
  #include "core/utils.h"
+#include "core/tileset.h"
  
  #include "common/os.h"
  
@@ -139,6 +140,11 @@ HANDLE SwrCreateContext(
          BindApiThread(pContext, 0);
      }
  
+    if (pContext->threadInfo.SINGLE_THREADED)
+    {
+        pContext->pSingleThreadLockedTiles = new TileSet();
+    }
+
      pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads];
      pContext->pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64);
  
@@ -245,7 +251,7 @@ void QueueWork(SWR_CONTEXT *pContext)
          {
              uint32_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
              WorkOnFifoFE(pContext, 0, curDraw[0]);
-            WorkOnFifoBE(pContext, 0, curDraw[1], pContext->singleThreadLockedTiles, 0, 0);
+            WorkOnFifoBE(pContext, 0, curDraw[1], *pContext->pSingleThreadLockedTiles, 0, 0);
          }
          else
          {
@@ -427,7 +433,8 @@ void SwrDestroyContext(HANDLE hContext)
      delete[] pContext->ppScratch;
      AlignedFree(pContext->pStats);
  
-    delete(pContext->pHotTileMgr);
+    delete pContext->pHotTileMgr;
+    delete pContext->pSingleThreadLockedTiles;
  
      pContext->~SWR_CONTEXT();
      AlignedFree(GetContext(hContext));
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h

index af8f4b8db4f352a1bdc0c67599eeec88163f818f..2cd61e4abbb283826e61194e6bb9327a5665a124 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -516,7 +516,7 @@ struct SWR_CONTEXT
  
      uint32_t lastFrameChecked;
      uint64_t lastDrawChecked;
-    TileSet singleThreadLockedTiles;
+    TileSet* pSingleThreadLockedTiles;
  
      // ArchRast thread contexts.
      HANDLE* pArContext;
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp

index 9e16246c3f407cf23998232bc7a03fdf1c48b09f..f77ae22a80a9d26563ac3e35f03d177116c14753 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -49,6 +49,7 @@
  #include "rasterizer.h"
  #include "rdtsc_core.h"
  #include "tilemgr.h"
+#include "tileset.h"
  
  
  
@@ -587,7 +588,7 @@ bool WorkOnFifoBE(
              }
  
              // can only work on this draw if it's not in use by other threads
-            if (lockedTiles.find(tileID) != lockedTiles.end())
+            if (lockedTiles.get(tileID))
              {
                  continue;
              }
@@ -645,7 +646,7 @@ bool WorkOnFifoBE(
              else
              {
                  // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
-                lockedTiles.insert(tileID);
+                lockedTiles.set(tileID);
              }
          }
      }
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h

index cb918ddb60d76c707e20b8264ee7ce3f8791169b..0489a3cc6cf4b83d59089a7f1d0763ce068f6b6c 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -62,7 +62,7 @@ struct THREAD_POOL
      THREAD_DATA *pApiThreadData;
  };
  
-typedef std::unordered_set<uint32_t> TileSet;
+struct TileSet;
  
  void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
  void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool);
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp

index 28fa78771140875cf86b81e11cb443424b0a8c66..1bdef4bd7dd1b709b07aae9b94a06f39e4049748 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -33,8 +33,6 @@
  #include "core/multisample.h"
  #include "rdtsc_core.h"
  
-#define TILE_ID(x,y) ((x << 16 | y))
-
  MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena)
  {
  }
@@ -50,26 +48,35 @@ void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork)
          return;
      }
  
-    uint32_t id = TILE_ID(x, y);
+    uint32_t id = getTileId(x, y);
+
+    if (id >= mTiles.size())
+    {
+        mTiles.resize((16 + id) * 2);
+    }
  
-    MacroTileQueue &tile = mTiles[id];
-    tile.mWorkItemsFE++;
-    tile.mId = id;
+    MacroTileQueue *pTile = mTiles[id];
+    if (!pTile)
+    {
+        pTile = mTiles[id] = new MacroTileQueue();
+    }
+    pTile->mWorkItemsFE++;
+    pTile->mId = id;
  
-    if (tile.mWorkItemsFE == 1)
+    if (pTile->mWorkItemsFE == 1)
      {
-        tile.clear(mArena);
-        mDirtyTiles.push_back(&tile);
+        pTile->clear(mArena);
+        mDirtyTiles.push_back(pTile);
      }
  
      mWorkItemsProduced++;
-    tile.enqueue_try_nosync(mArena, pWork);
+    pTile->enqueue_try_nosync(mArena, pWork);
  }
  
  void MacroTileMgr::markTileComplete(uint32_t id)
  {
-    SWR_ASSERT(mTiles.find(id) != mTiles.end());
-    MacroTileQueue &tile = mTiles[id];
+    SWR_ASSERT(mTiles.size() > id);
+    MacroTileQueue &tile = *mTiles[id];
      uint32_t numTiles = tile.mWorkItemsFE;
      InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles);
  
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h

index 2831010b12f2718a87670ae86892b16f72583c21..8392db1b05f9d2c93bf0de9e078f8c7dbd873ed4 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -31,6 +31,7 @@
  #include <set>
  #include <unordered_map>
  #include "common/formats.h"
+#include "common/intrin.h"
  #include "fifo.hpp"
  #include "context.h"
  #include "format_traits.h"
@@ -41,7 +42,7 @@
  struct MacroTileQueue
  {
      MacroTileQueue() { }
-    ~MacroTileQueue() { }
+    ~MacroTileQueue() { destroy(); }
  
      //////////////////////////////////////////////////////////////////////////
      /// @brief Returns number of work items queued for this tile.
@@ -110,9 +111,9 @@ public:
      MacroTileMgr(CachingArena& arena);
      ~MacroTileMgr()
      {
-        for (auto &tile : mTiles)
+        for (auto *pTile : mTiles) // mTiles holds raw pointers allocated with new in enqueue(); slots never enqueued stay nullptr
          {
-            tile.second.destroy();
+            delete pTile; // deleting nullptr is a no-op; ~MacroTileQueue() calls destroy() for live entries
          }
      }
  
@@ -136,13 +137,20 @@ public:
  
      static INLINE void getTileIndices(uint32_t tileID, uint32_t &x, uint32_t &y)
      {
-        y = tileID & 0xffff;
-        x = (tileID >> 16) & 0xffff;
+        // Morton / Z order of tiles: x is read back from the even bits of tileID, y from the odd bits.
+        x = pext_u32(tileID, 0x55555555); // mask selects the even bit positions; assumes pext_u32 follows BMI2 PEXT semantics - TODO confirm
+        y = pext_u32(tileID, 0xAAAAAAAA); // mask selects the odd bit positions
+    }
+
+    static INLINE uint32_t getTileId(uint32_t x, uint32_t y)
+    {
+        // Morton / Z order of tiles: x is spread over the even bits, y over the odd bits (each mask has 16 set bits).
+        return pdep_u32(x, 0x55555555) | pdep_u32(y, 0xAAAAAAAA); // assumes pdep_u32 follows BMI2 PDEP semantics - TODO confirm
      }
  
  private:
      CachingArena& mArena;
-    std::unordered_map<uint32_t, MacroTileQueue> mTiles;
+    std::vector<MacroTileQueue*> mTiles;
  
      // Any tile that has work queued to it is a dirty tile.
      std::vector<MacroTileQueue*> mDirtyTiles;
diff --git a/src/gallium/drivers/swr/rasterizer/core/tileset.h b/src/gallium/drivers/swr/rasterizer/core/tileset.h

new file mode 100644 (file)

index 0000000..3eb4c5d
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/tileset.h
@@ -0,0 +1,105 @@
+/****************************************************************************
+* Copyright (C) 2018 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file tileset.h
+*
+* @brief Custom bitset class for managing locked tiles
+*
+******************************************************************************/
+#pragma once
+
+struct TileSet // Growable bitset indexed by tile ID; storage is allocated lazily by set().
+{
+    ~TileSet() // Frees the word array if one was ever allocated.
+    {
+        if (m_bits)
+        {
+            AlignedFree(m_bits);
+        }
+    }
+    INLINE void set(size_t idx) // Sets bit idx, growing the storage first when idx is beyond the current capacity.
+    {
+        _grow(idx);
+        size_t& word = _get_word(idx);
+        word |= (size_t(1) << (idx & BITS_OFFSET)); // idx & BITS_OFFSET is the bit position inside the word
+        m_maxSet = std::max(m_maxSet, idx + 1); // NOTE(review): std::max needs <algorithm>, but this header includes nothing itself
+    }
+    INLINE bool get(size_t idx) // Returns true when bit idx is set; never allocates.
+    {
+        if (idx >= m_size) // beyond capacity, so the bit cannot have been set
+        {
+            return false;
+        }
+        size_t word = _get_word(idx);
+        return 0 != (word & (size_t(1) << (idx & BITS_OFFSET)));
+    }
+
+    INLINE void clear() // Resets every set bit; m_bits and m_size are kept for reuse.
+    {
+        if (m_maxSet)
+        {
+            size_t num_words = (m_maxSet + BITS_OFFSET) / BITS_PER_WORD; // words touched since the last clear(), rounded up
+            memset(m_bits, 0, sizeof(size_t) * num_words);
+            m_maxSet = 0;
+        }
+    }
+
+private:
+    static const size_t BITS_PER_WORD = sizeof(size_t) * 8; // number of bits stored per size_t word
+    static const size_t BITS_OFFSET = BITS_PER_WORD - 1; // mask for the in-word bit position; also used for rounding up
+
+    size_t              m_size = 0; // capacity in bits; _grow() only ever assigns whole-word sizes
+    size_t              m_maxSet = 0; // one past the highest bit set since the last clear(); 0 when nothing is set
+    size_t*             m_bits = nullptr; // owned word array. NOTE(review): copy operations are left implicit, so a copied TileSet would free this twice
+
+    INLINE size_t& _get_word(size_t idx) // Word that contains bit idx; performs no bounds check.
+    {
+        return m_bits[idx / BITS_PER_WORD];
+    }
+
+    void _grow(size_t idx) // Makes bit idx addressable by reallocating to the smallest whole-word capacity covering it.
+    {
+        if (idx < m_size)
+        {
+            return;
+        }
+
+        size_t new_size = (1 + idx + BITS_OFFSET) & ~BITS_OFFSET; // idx + 1 rounded up to a multiple of BITS_PER_WORD
+        size_t num_words = new_size / BITS_PER_WORD;
+        size_t* newBits = (size_t*)AlignedMalloc(sizeof(size_t) * num_words, 64); // NOTE(review): result is used without a null check
+        size_t copy_words = 0;
+
+        if (m_bits)
+        {
+            copy_words = (m_size + BITS_OFFSET) / BITS_PER_WORD; // number of words in the old array
+            num_words -= copy_words; // from here on: count of newly added words only
+            memcpy(newBits, m_bits, copy_words * sizeof(size_t));
+
+            AlignedFree(m_bits);
+        }
+
+        m_bits = newBits;
+        m_size = new_size;
+
+        memset(&m_bits[copy_words], 0, sizeof(size_t) * num_words); // zero only the new tail; copied words keep their bits
+    }
+};
author	George Kyriazis <george.kyriazis@intel.com>
	Wed, 2 May 2018 00:33:38 +0000 (19:33 -0500)
committer	George Kyriazis <george.kyriazis@intel.com>
	Fri, 11 May 2018 16:26:35 +0000 (11:26 -0500)
src/gallium/drivers/swr/rasterizer/core/api.cpp		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/core/context.h		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/core/threads.cpp		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/core/threads.h		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/core/tilemgr.h		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/core/tileset.h	[new file with mode: 0644]	patch \| blob