swr/rasterizer: do not mark tiles dirty until actually rendered
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / tilemgr.cpp
index a6c54ab86e84e4c489c958e697d37a1fb7f32561..a23de56a0a5d9878bc27017bb5218318da4a8d1d 100644 (file)
@@ -1,31 +1,31 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file tilemgr.cpp
-*
-* @brief Implementation for Macro Tile Manager which provides the facilities
-*        for threads to work on an macro tile.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file tilemgr.cpp
+ *
+ * @brief Implementation for Macro Tile Manager which provides the facilities
+ *        for threads to work on an macro tile.
+ *
+ ******************************************************************************/
 #include <unordered_map>
 
 #include "fifo.hpp"
 #include "core/multisample.h"
 #include "rdtsc_core.h"
 
-#define TILE_ID(x,y) ((x << 16 | y))
+MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena) {}
 
-MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena)
-{
-}
-
-void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork)
+void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK* pWork)
 {
     // Should not enqueue more then what we have backing for in the hot tile manager.
     SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
     SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
 
-    if ((x & ~(KNOB_NUM_HOT_TILES_X-1)) | (y & ~(KNOB_NUM_HOT_TILES_Y-1)))
+    if ((x & ~(KNOB_NUM_HOT_TILES_X - 1)) | (y & ~(KNOB_NUM_HOT_TILES_Y - 1)))
     {
         return;
     }
 
-    uint32_t id = TILE_ID(x, y);
+    uint32_t id = getTileId(x, y);
+
+    if (id >= mTiles.size())
+    {
+        mTiles.resize((16 + id) * 2);
+    }
 
-    MacroTileQueue &tile = mTiles[id];
-    tile.mWorkItemsFE++;
-    tile.mId = id;
+    MacroTileQueue* pTile = mTiles[id];
+    if (!pTile)
+    {
+        pTile = mTiles[id] = new MacroTileQueue();
+    }
+    pTile->mWorkItemsFE++;
+    pTile->mId = id;
 
-    if (tile.mWorkItemsFE == 1)
+    if (pTile->mWorkItemsFE == 1)
     {
-        tile.clear(mArena);
-        mDirtyTiles.push_back(&tile);
+        pTile->clear(mArena);
+        mDirtyTiles.push_back(pTile);
     }
 
     mWorkItemsProduced++;
-    tile.enqueue_try_nosync(mArena, pWork);
+    pTile->enqueue_try_nosync(mArena, pWork);
 }
 
 void MacroTileMgr::markTileComplete(uint32_t id)
 {
-    SWR_ASSERT(mTiles.find(id) != mTiles.end());
-    MacroTileQueue &tile = mTiles[id];
-    uint32_t numTiles = tile.mWorkItemsFE;
+    SWR_ASSERT(mTiles.size() > id);
+    MacroTileQueue& tile     = *mTiles[id];
+    uint32_t        numTiles = tile.mWorkItemsFE;
     InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles);
 
     _ReadWriteBarrier();
@@ -83,8 +88,14 @@ void MacroTileMgr::markTileComplete(uint32_t id)
     tile.mWorkItemsBE = 0;
 }
 
-HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples,
-    uint32_t renderTargetArrayIndex)
+HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT*                pContext,
+                                DRAW_CONTEXT*               pDC,
+                                HANDLE                      hWorkerPrivateData,
+                                uint32_t                    macroID,
+                                SWR_RENDERTARGET_ATTACHMENT attachment,
+                                bool                        create,
+                                uint32_t                    numSamples,
+                                uint32_t                    renderTargetArrayIndex)
 {
     uint32_t x, y;
     MacroTileMgr::getTileIndices(macroID, x, y);
@@ -92,17 +103,18 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
     SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
     SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
 
-    HotTileSet &tile = mHotTiles[x][y];
-    HOTTILE& hotTile = tile.Attachment[attachment];
+    HotTileSet& tile    = mHotTiles[x][y];
+    HOTTILE&    hotTile = tile.Attachment[attachment];
     if (hotTile.pBuffer == NULL)
     {
         if (create)
         {
-            uint32_t size = numSamples * mHotTileSize[attachment];
+            uint32_t size     = numSamples * mHotTileSize[attachment];
             uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
-            hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode);
-            hotTile.state = HOTTILE_INVALID;
-            hotTile.numSamples = numSamples;
+            hotTile.pBuffer =
+                (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
+            hotTile.state                  = HOTTILE_INVALID;
+            hotTile.numSamples             = numSamples;
             hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
         }
         else
@@ -115,22 +127,22 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
         // free the old tile and create a new one with enough space to hold all samples
         if (numSamples > hotTile.numSamples)
         {
-            // tile should be either uninitialized or resolved if we're deleting and switching to a 
+            // tile should be either uninitialized or resolved if we're deleting and switching to a
             // new sample count
-            SWR_ASSERT((hotTile.state == HOTTILE_INVALID) ||
-                (hotTile.state == HOTTILE_RESOLVED) ||
-                (hotTile.state == HOTTILE_CLEAR));
+            SWR_ASSERT((hotTile.state == HOTTILE_INVALID) || (hotTile.state == HOTTILE_RESOLVED) ||
+                       (hotTile.state == HOTTILE_CLEAR));
             FreeHotTileMem(hotTile.pBuffer);
 
-            uint32_t size = numSamples * mHotTileSize[attachment];
+            uint32_t size     = numSamples * mHotTileSize[attachment];
             uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
-            hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode);
-            hotTile.state = HOTTILE_INVALID;
+            hotTile.pBuffer =
+                (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
+            hotTile.state      = HOTTILE_INVALID;
             hotTile.numSamples = numSamples;
         }
 
-        // if requested render target array index isn't currently loaded, need to store out the current hottile 
-        // and load the requested array slice
+        // if requested render target array index isn't currently loaded, need to store out the
+        // current hottile and load the requested array slice
         if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex)
         {
             SWR_FORMAT format;
@@ -143,10 +155,19 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
             case SWR_ATTACHMENT_COLOR4:
             case SWR_ATTACHMENT_COLOR5:
             case SWR_ATTACHMENT_COLOR6:
-            case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break;
-            case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break;
-            case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break;
-            default: SWR_INVALID("Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break;
+            case SWR_ATTACHMENT_COLOR7:
+                format = KNOB_COLOR_HOT_TILE_FORMAT;
+                break;
+            case SWR_ATTACHMENT_DEPTH:
+                format = KNOB_DEPTH_HOT_TILE_FORMAT;
+                break;
+            case SWR_ATTACHMENT_STENCIL:
+                format = KNOB_STENCIL_HOT_TILE_FORMAT;
+                break;
+            default:
+                SWR_INVALID("Unknown attachment: %d", attachment);
+                format = KNOB_COLOR_HOT_TILE_FORMAT;
+                break;
             }
 
             if (hotTile.state == HOTTILE_CLEAR)
@@ -163,23 +184,38 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
 
             if (hotTile.state == HOTTILE_DIRTY)
             {
-                pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment,
-                    x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer);
+                pContext->pfnStoreTile(GetPrivateState(pDC),
+                                       hWorkerPrivateData,
+                                       format,
+                                       attachment,
+                                       x * KNOB_MACROTILE_X_DIM,
+                                       y * KNOB_MACROTILE_Y_DIM,
+                                       hotTile.renderTargetArrayIndex,
+                                       hotTile.pBuffer);
             }
 
-            pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment,
-                x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer);
+            pContext->pfnLoadTile(GetPrivateState(pDC),
+                                  hWorkerPrivateData,
+                                  format,
+                                  attachment,
+                                  x * KNOB_MACROTILE_X_DIM,
+                                  y * KNOB_MACROTILE_Y_DIM,
+                                  renderTargetArrayIndex,
+                                  hotTile.pBuffer);
 
             hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
-            hotTile.state = HOTTILE_DIRTY;
+            hotTile.state = HOTTILE_RESOLVED;
         }
     }
     return &tile.Attachment[attachment];
 }
 
-HOTTILE* HotTileMgr::GetHotTileNoLoad(
-    SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID,
-    SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples)
+HOTTILE* HotTileMgr::GetHotTileNoLoad(SWR_CONTEXT*                pContext,
+                                      DRAW_CONTEXT*               pDC,
+                                      uint32_t                    macroID,
+                                      SWR_RENDERTARGET_ATTACHMENT attachment,
+                                      bool                        create,
+                                      uint32_t                    numSamples)
 {
     uint32_t x, y;
     MacroTileMgr::getTileIndices(macroID, x, y);
@@ -187,16 +223,16 @@ HOTTILE* HotTileMgr::GetHotTileNoLoad(
     SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
     SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
 
-    HotTileSet &tile = mHotTiles[x][y];
-    HOTTILE& hotTile = tile.Attachment[attachment];
+    HotTileSet& tile    = mHotTiles[x][y];
+    HOTTILE&    hotTile = tile.Attachment[attachment];
     if (hotTile.pBuffer == NULL)
     {
         if (create)
         {
-            uint32_t size = numSamples * mHotTileSize[attachment];
-            hotTile.pBuffer = (uint8_t*)AlignedMalloc(size, 64);
-            hotTile.state = HOTTILE_INVALID;
-            hotTile.numSamples = numSamples;
+            uint32_t size                  = numSamples * mHotTileSize[attachment];
+            hotTile.pBuffer                = (uint8_t*)AlignedMalloc(size, 64);
+            hotTile.state                  = HOTTILE_INVALID;
+            hotTile.numSamples             = numSamples;
             hotTile.renderTargetArrayIndex = 0;
         }
         else
@@ -208,24 +244,25 @@ HOTTILE* HotTileMgr::GetHotTileNoLoad(
     return &hotTile;
 }
 
-#if USE_8x2_TILE_BACKEND
-void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile)  // clear a macro tile from float4 clear data.
+void HotTileMgr::ClearColorHotTile(
+    const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
 {
     // Load clear color into SIMD register...
-    float *pClearData = (float *)(pHotTile->clearData);
-    simd16scalar valR = _simd16_broadcast_ss(&pClearData[0]);
-    simd16scalar valG = _simd16_broadcast_ss(&pClearData[1]);
-    simd16scalar valB = _simd16_broadcast_ss(&pClearData[2]);
-    simd16scalar valA = _simd16_broadcast_ss(&pClearData[3]);
+    float*       pClearData = (float*)(pHotTile->clearData);
+    simd16scalar valR       = _simd16_broadcast_ss(&pClearData[0]);
+    simd16scalar valG       = _simd16_broadcast_ss(&pClearData[1]);
+    simd16scalar valB       = _simd16_broadcast_ss(&pClearData[2]);
+    simd16scalar valA       = _simd16_broadcast_ss(&pClearData[3]);
 
-    float *pfBuf = (float *)pHotTile->pBuffer;
+    float*   pfBuf      = (float*)pHotTile->pBuffer;
     uint32_t numSamples = pHotTile->numSamples;
 
     for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
     {
         for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
         {
-            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM)
+            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples);
+                 si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM)
             {
                 _simd16_store_ps(pfBuf, valR);
                 pfBuf += KNOB_SIMD16_WIDTH;
@@ -243,20 +280,22 @@ void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile)  // clear a macro ti
     }
 }
 
-void HotTileMgr::ClearDepthHotTile(const HOTTILE* pHotTile)  // clear a macro tile from float4 clear data.
+void HotTileMgr::ClearDepthHotTile(
+    const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
 {
     // Load clear color into SIMD register...
-    float *pClearData = (float *)(pHotTile->clearData);
-    simd16scalar valZ = _simd16_broadcast_ss(&pClearData[0]);
+    float*       pClearData = (float*)(pHotTile->clearData);
+    simd16scalar valZ       = _simd16_broadcast_ss(&pClearData[0]);
 
-    float *pfBuf = (float *)pHotTile->pBuffer;
+    float*   pfBuf      = (float*)pHotTile->pBuffer;
     uint32_t numSamples = pHotTile->numSamples;
 
     for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
     {
         for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
         {
-            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM)
+            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples);
+                 si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM)
             {
                 _simd16_store_ps(pfBuf, valZ);
                 pfBuf += KNOB_SIMD16_WIDTH;
@@ -269,18 +308,19 @@ void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)
 {
     // convert from F32 to U8.
     uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]);
-    //broadcast 32x into __m256i...
+    // broadcast 32x into __m256i...
     simd16scalari valS = _simd16_set1_epi8(clearVal);
 
-    simd16scalari *pBuf = (simd16scalari *)pHotTile->pBuffer;
-    uint32_t numSamples = pHotTile->numSamples;
+    simd16scalari* pBuf       = (simd16scalari*)pHotTile->pBuffer;
+    uint32_t       numSamples = pHotTile->numSamples;
 
     for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
     {
         for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
         {
             // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
-            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM * 4)
+            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples);
+                 si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM * 4)
             {
                 _simd16_store_si(pBuf, valS);
                 pBuf += 1;
@@ -289,85 +329,6 @@ void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)
     }
 }
 
-#else
-void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile)  // clear a macro tile from float4 clear data.
-{
-    // Load clear color into SIMD register...
-    float *pClearData = (float*)(pHotTile->clearData);
-    simdscalar valR = _simd_broadcast_ss(&pClearData[0]);
-    simdscalar valG = _simd_broadcast_ss(&pClearData[1]);
-    simdscalar valB = _simd_broadcast_ss(&pClearData[2]);
-    simdscalar valA = _simd_broadcast_ss(&pClearData[3]);
-
-    float *pfBuf = (float*)pHotTile->pBuffer;
-    uint32_t numSamples = pHotTile->numSamples;
-
-    for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
-    {
-        for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
-        {
-            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++)
-            {
-                _simd_store_ps(pfBuf, valR);
-                pfBuf += KNOB_SIMD_WIDTH;
-                _simd_store_ps(pfBuf, valG);
-                pfBuf += KNOB_SIMD_WIDTH;
-                _simd_store_ps(pfBuf, valB);
-                pfBuf += KNOB_SIMD_WIDTH;
-                _simd_store_ps(pfBuf, valA);
-                pfBuf += KNOB_SIMD_WIDTH;
-            }
-        }
-    }
-}
-
-void HotTileMgr::ClearDepthHotTile(const HOTTILE* pHotTile)  // clear a macro tile from float4 clear data.
-{
-    // Load clear color into SIMD register...
-    float *pClearData = (float*)(pHotTile->clearData);
-    simdscalar valZ = _simd_broadcast_ss(&pClearData[0]);
-
-    float *pfBuf = (float*)pHotTile->pBuffer;
-    uint32_t numSamples = pHotTile->numSamples;
-
-    for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
-    {
-        for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
-        {
-            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM)
-            {
-                _simd_store_ps(pfBuf, valZ);
-                pfBuf += KNOB_SIMD_WIDTH;
-            }
-        }
-    }
-}
-
-void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)
-{
-    // convert from F32 to U8.
-    uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]);
-    //broadcast 32x into __m256i...
-    simdscalari valS = _simd_set1_epi8(clearVal);
-
-    simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer;
-    uint32_t numSamples = pHotTile->numSamples;
-
-    for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
-    {
-        for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
-        {
-            // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
-            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4)
-            {
-                _simd_store_si(pBuf, valS);
-                pBuf += 1;
-            }
-        }
-    }
-}
-
-#endif
 //////////////////////////////////////////////////////////////////////////
 /// @brief InitializeHotTiles
 /// for draw calls, we initialize the active hot tiles and perform deferred
@@ -376,9 +337,13 @@ void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)
 /// to avoid unnecessary setup every triangle
 /// @todo support deferred clear
 /// @param pCreateInfo - pointer to creation info.
-void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroID)
+void HotTileMgr::InitializeHotTiles(SWR_CONTEXT*  pContext,
+                                    DRAW_CONTEXT* pDC,
+                                    uint32_t      workerId,
+                                    uint32_t      macroID)
 {
-    const API_STATE& state = GetApiState(pDC);
+    const API_STATE& state    = GetApiState(pDC);
+    HANDLE hWorkerPrivateData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
 
     uint32_t x, y;
     MacroTileMgr::getTileIndices(macroID, x, y);
@@ -388,27 +353,41 @@ void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, ui
     uint32_t numSamples = GetNumSamples(state.rastState.sampleCount);
 
     // check RT if enabled
-    unsigned long rtSlot = 0;
-    uint32_t colorHottileEnableMask = state.colorHottileEnable;
+    unsigned long rtSlot                 = 0;
+    uint32_t      colorHottileEnableMask = state.colorHottileEnable;
     while (_BitScanForward(&rtSlot, colorHottileEnableMask))
     {
-        HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples);
+        HOTTILE* pHotTile =
+            GetHotTile(pContext,
+                       pDC,
+                       hWorkerPrivateData,
+                       macroID,
+                       (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot),
+                       true,
+                       numSamples);
 
         if (pHotTile->state == HOTTILE_INVALID)
         {
-            AR_BEGIN(BELoadTiles, pDC->drawId);
+            RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId);
             // invalid hottile before draw requires a load from surface before we can draw to it
-            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
-            pHotTile->state = HOTTILE_DIRTY;
-            AR_END(BELoadTiles, 0);
+            pContext->pfnLoadTile(GetPrivateState(pDC),
+                                  hWorkerPrivateData,
+                                  KNOB_COLOR_HOT_TILE_FORMAT,
+                                  (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot),
+                                  x,
+                                  y,
+                                  pHotTile->renderTargetArrayIndex,
+                                  pHotTile->pBuffer);
+            pHotTile->state = HOTTILE_RESOLVED;
+            RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0);
         }
         else if (pHotTile->state == HOTTILE_CLEAR)
         {
-            AR_BEGIN(BELoadTiles, pDC->drawId);
+            RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId);
             // Clear the tile.
             ClearColorHotTile(pHotTile);
             pHotTile->state = HOTTILE_DIRTY;
-            AR_END(BELoadTiles, 0);
+            RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0);
         }
         colorHottileEnableMask &= ~(1 << rtSlot);
     }
@@ -416,44 +395,60 @@ void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, ui
     // check depth if enabled
     if (state.depthHottileEnable)
     {
-        HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
+        HOTTILE* pHotTile = GetHotTile(
+            pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
         if (pHotTile->state == HOTTILE_INVALID)
         {
-            AR_BEGIN(BELoadTiles, pDC->drawId);
+            RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId);
             // invalid hottile before draw requires a load from surface before we can draw to it
-            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
+            pContext->pfnLoadTile(GetPrivateState(pDC),
+                                  hWorkerPrivateData,
+                                  KNOB_DEPTH_HOT_TILE_FORMAT,
+                                  SWR_ATTACHMENT_DEPTH,
+                                  x,
+                                  y,
+                                  pHotTile->renderTargetArrayIndex,
+                                  pHotTile->pBuffer);
             pHotTile->state = HOTTILE_DIRTY;
-            AR_END(BELoadTiles, 0);
+            RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0);
         }
         else if (pHotTile->state == HOTTILE_CLEAR)
         {
-            AR_BEGIN(BELoadTiles, pDC->drawId);
+            RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId);
             // Clear the tile.
             ClearDepthHotTile(pHotTile);
             pHotTile->state = HOTTILE_DIRTY;
-            AR_END(BELoadTiles, 0);
+            RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0);
         }
     }
 
     // check stencil if enabled
     if (state.stencilHottileEnable)
     {
-        HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
+        HOTTILE* pHotTile = GetHotTile(
+            pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
         if (pHotTile->state == HOTTILE_INVALID)
         {
-            AR_BEGIN(BELoadTiles, pDC->drawId);
+            RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId);
             // invalid hottile before draw requires a load from surface before we can draw to it
-            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
+            pContext->pfnLoadTile(GetPrivateState(pDC),
+                                  hWorkerPrivateData,
+                                  KNOB_STENCIL_HOT_TILE_FORMAT,
+                                  SWR_ATTACHMENT_STENCIL,
+                                  x,
+                                  y,
+                                  pHotTile->renderTargetArrayIndex,
+                                  pHotTile->pBuffer);
             pHotTile->state = HOTTILE_DIRTY;
-            AR_END(BELoadTiles, 0);
+            RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0);
         }
         else if (pHotTile->state == HOTTILE_CLEAR)
         {
-            AR_BEGIN(BELoadTiles, pDC->drawId);
+            RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId);
             // Clear the tile.
             ClearStencilHotTile(pHotTile);
             pHotTile->state = HOTTILE_DIRTY;
-            AR_END(BELoadTiles, 0);
+            RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0);
         }
     }
 }