swr/rast: Split backend.cpp to improve compile time
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / backend.cpp
index ca9a8b404493838db703494a2e84e8732fa7539f..fe11cdfd2f9a181fa2e83d936e06483e59858f90 100644 (file)
 #include <smmintrin.h>
 
 #include "backend.h"
+#include "backend_impl.h"
 #include "tilemgr.h"
 #include "memory/tilingtraits.h"
 #include "core/multisample.h"
+#include "backends/gen_BackendPixelRate.hpp"
 
 #include <algorithm>
 
-typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, uint32_t, DWORD[4], const SWR_RECT& rect);
-static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
-
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Process compute work.
@@ -103,238 +102,6 @@ void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
     SWR_ASSERT(x == 0 && y == 0);
 }
 
-template<SWR_FORMAT format>
-void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
-{
-    auto lambda = [&](int32_t comp)
-    {
-        FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
-
-        pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
-    };
-
-    const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
-
-    for (uint32_t i = 0; i < numIter; ++i)
-    {
-        UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
-    }
-}
-
-#if USE_8x2_TILE_BACKEND
-template<SWR_FORMAT format>
-void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value)
-{
-    auto lambda = [&](int32_t comp)
-    {
-        FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
-
-        pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
-    };
-
-    const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);
-
-    for (uint32_t i = 0; i < numIter; ++i)
-    {
-        UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
-    }
-}
-
-#endif
-template<SWR_FORMAT format>
-INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, uint32_t renderTargetArrayIndex, DWORD clear[4], const SWR_RECT& rect)
-{
-    // convert clear color to hottile format
-    // clear color is in RGBA float/uint32
-#if USE_8x2_TILE_BACKEND
-    simd16vector vClear;
-    for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
-    {
-        simd16scalar vComp;
-        vComp = _simd16_load1_ps((const float*)&clear[comp]);
-        if (FormatTraits<format>::isNormalized(comp))
-        {
-            vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp)));
-            vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp));
-        }
-        vComp = FormatTraits<format>::pack(comp, vComp);
-        vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
-    }
-
-#else
-    simdvector vClear;
-    for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
-    {
-        simdscalar vComp;
-        vComp = _simd_load1_ps((const float*)&clear[comp]);
-        if (FormatTraits<format>::isNormalized(comp))
-        {
-            vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
-            vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
-        }
-        vComp = FormatTraits<format>::pack(comp, vComp);
-        vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
-    }
-
-#endif
-    uint32_t tileX, tileY;
-    MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
-
-    // Init to full macrotile
-    SWR_RECT clearTile =
-    {
-        KNOB_MACROTILE_X_DIM * int32_t(tileX),
-        KNOB_MACROTILE_Y_DIM * int32_t(tileY),
-        KNOB_MACROTILE_X_DIM * int32_t(tileX + 1),
-        KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1),
-    };
-
-    // intersect with clear rect
-    clearTile &= rect;
-
-    // translate to local hottile origin
-    clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, -int32_t(tileY) * KNOB_MACROTILE_Y_DIM);
-
-    // Make maximums inclusive (needed for convert to raster tiles)
-    clearTile.xmax -= 1;
-    clearTile.ymax -= 1;
-
-    // convert to raster tiles
-    clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT);
-    clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT);
-    clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT);
-    clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT);
-
-    const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
-    // compute steps between raster tile samples / raster tiles / macro tile rows
-    const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
-    const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
-    const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
-    const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
-
-    HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples, renderTargetArrayIndex);
-    uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, clearTile.xmin, clearTile.ymin)) * numSamples;
-    uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
-
-    // loop over all raster tiles in the current hot tile
-    for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y)
-    {
-        uint8_t* pRasterTile = pRasterTileRow;
-        for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x)
-        {
-            for( int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++)
-            {
-                ClearRasterTile<format>(pRasterTile, vClear);
-                pRasterTile += rasterTileSampleStep;
-            }
-        }
-        pRasterTileRow += macroTileRowStep;
-    }
-
-    pHotTile->state = HOTTILE_DIRTY;
-}
-
-
-void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
-{
-    SWR_CONTEXT *pContext = pDC->pContext;
-
-    if (KNOB_FAST_CLEAR)
-    {
-        CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
-        SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
-        uint32_t numSamples = GetNumSamples(sampleCount);
-
-        SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.
-
-        AR_BEGIN(BEClear, pDC->drawId);
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
-        {
-            unsigned long rt = 0;
-            uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
-            while (_BitScanForward(&rt, mask))
-            {
-                mask &= ~(1 << rt);
-
-                HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)rt, true, numSamples, pClear->renderTargetArrayIndex);
-
-                // All we want to do here is to mark the hot tile as being in a "needs clear" state.
-                pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
-                pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
-                pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
-                pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
-                pHotTile->state = HOTTILE_CLEAR;
-            }
-        }
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
-        {
-            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples, pClear->renderTargetArrayIndex);
-            pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
-            pHotTile->state = HOTTILE_CLEAR;
-        }
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
-        {
-            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples, pClear->renderTargetArrayIndex);
-
-            pHotTile->clearData[0] = pClear->clearStencil;
-            pHotTile->state = HOTTILE_CLEAR;
-        }
-
-        AR_END(BEClear, 1);
-    }
-    else
-    {
-        // Legacy clear
-        CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
-        AR_BEGIN(BEClear, pDC->drawId);
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
-        {
-            DWORD clearData[4];
-            clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
-            clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
-            clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
-            clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
-
-            PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
-            SWR_ASSERT(pfnClearTiles != nullptr);
-
-            unsigned long rt = 0;
-            uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
-            while (_BitScanForward(&rt, mask))
-            {
-                mask &= ~(1 << rt);
-
-                pfnClearTiles(pDC, (SWR_RENDERTARGET_ATTACHMENT)rt, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
-            }
-        }
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
-        {
-            DWORD clearData[4];
-            clearData[0] = *(DWORD*)&pClear->clearDepth;
-            PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
-            SWR_ASSERT(pfnClearTiles != nullptr);
-
-            pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
-        }
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
-        {
-            DWORD clearData[4];
-            clearData[0] = pClear->clearStencil;
-            PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
-
-            pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
-        }
-
-        AR_END(BEClear, 1);
-    }
-}
-
 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc, 
     SWR_RENDERTARGET_ATTACHMENT attachment)
 {
@@ -368,7 +135,7 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile
         // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
         if (pHotTile->state == HOTTILE_CLEAR)
         {
-            PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat];
+            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat];
             SWR_ASSERT(pfnClearTiles != nullptr);
 
             pfnClearTiles(pDC, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect);
@@ -386,7 +153,10 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile
 
         if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
         {
-            pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
+            if (!(pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY && pHotTile->state == HOTTILE_RESOLVED))
+            {
+                pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
+            }
         }
     }
     AR_END(BEStoreTiles, 1);
@@ -426,457 +196,6 @@ void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint3
     }
 }
 
-#if KNOB_SIMD_WIDTH == 8
-const simdscalar vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
-const simdscalar vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
-const simdscalar vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
-const simdscalar vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
-#else
-#error Unsupported vector width
-#endif
-
-simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
-{
-    simdscalar vClipMask = _simd_setzero_ps();
-    uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
-
-    for (uint32_t i = 0; i < numClipDistance; ++i)
-    {
-        // pull triangle clip distance values from clip buffer
-        simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
-        simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
-        simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
-
-        // interpolate
-        simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
-        
-        // clip if interpolated clip distance is < 0 || NAN
-        simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
-
-        vClipMask = _simd_or_ps(vClipMask, vCull);
-    }
-
-    return _simd_movemask_ps(vClipMask);
-}
-
-template<typename T>
-void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
-{
-    SWR_CONTEXT *pContext = pDC->pContext;
-
-    AR_BEGIN(BESingleSampleBackend, pDC->drawId);
-    AR_BEGIN(BESetup, pDC->drawId);
-
-    const API_STATE &state = GetApiState(pDC);
-
-    BarycentricCoeffs coeffs;
-    SetupBarycentricCoeffs(&coeffs, work);
-
-    uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
-    SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
-
-    SWR_PS_CONTEXT psContext;
-    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
-    SetupPixelShaderContext<T>(&psContext, samplePos, work);
-
-    AR_END(BESetup, 1);
-
-    psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
-    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
-
-    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
-
-    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
-    {
-        psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
-        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
-
-        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
-
-        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
-        {
-#if USE_8x2_TILE_BACKEND
-            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
-
-#endif
-            simdmask coverageMask = work.coverageMask[0] & MASK;
-
-            if (coverageMask)
-            {
-                if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
-                {
-                    static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
-
-                    const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer));
-
-                    const float minz = state.depthBoundsState.depthBoundsTestMinValue;
-                    const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
-
-                    coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
-                }
-
-                if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
-                {
-                    const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
-
-                    generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
-                }
-
-                AR_BEGIN(BEBarycentric, pDC->drawId);
-
-                CalcPixelBarycentrics(coeffs, psContext);
-
-                CalcCentroid<T, true>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
-
-                // interpolate and quantize z
-                psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
-                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-
-                AR_END(BEBarycentric, 1);
-
-                // interpolate user clip distance if available
-                if (state.rastState.clipDistanceMask)
-                {
-                    coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
-                }
-
-                simdscalar vCoverageMask = vMask(coverageMask);
-                simdscalar depthPassMask = vCoverageMask;
-                simdscalar stencilPassMask = vCoverageMask;
-
-                // Early-Z?
-                if (T::bCanEarlyZ)
-                {
-                    AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
-                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
-                                                     psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
-                    AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
-                    AR_END(BEEarlyDepthTest, 0);
-
-                    // early-exit if no pixels passed depth or earlyZ is forced on
-                    if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
-                    {
-                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                            pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
-
-                        if (!_simd_movemask_ps(depthPassMask))
-                        {
-                            goto Endtile;
-                        }
-                    }
-                }
-
-                psContext.sampleIndex = 0;
-                psContext.activeMask = _simd_castps_si(vCoverageMask);
-
-                // execute pixel shader
-                AR_BEGIN(BEPixelShader, pDC->drawId);
-                UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
-                state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
-                AR_END(BEPixelShader, 0);
-
-                vCoverageMask = _simd_castsi_ps(psContext.activeMask);
-
-                // late-Z
-                if (!T::bCanEarlyZ)
-                {
-                    AR_BEGIN(BELateDepthTest, pDC->drawId);
-                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
-                                                        psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
-                    AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
-                    AR_END(BELateDepthTest, 0);
-
-                    if (!_simd_movemask_ps(depthPassMask))
-                    {
-                        // need to call depth/stencil write for stencil write
-                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                            pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
-                        goto Endtile;
-                    }
-                } else {
-                    // for early z, consolidate discards from shader
-                    // into depthPassMask
-                    depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
-                }
-
-                uint32_t statMask = _simd_movemask_ps(depthPassMask);
-                uint32_t statCount = _mm_popcnt_u32(statMask);
-                UPDATE_STAT_BE(DepthPassCount, statCount);
-
-                // output merger
-                AR_BEGIN(BEOutputMerger, pDC->drawId);
-#if USE_8x2_TILE_BACKEND
-                OutputMerger8x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
-#else
-                OutputMerger4x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
-#endif
-
-                // do final depth write after all pixel kills
-                if (!state.psState.forceEarlyZ)
-                {
-                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                        pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
-                }
-                AR_END(BEOutputMerger, 0);
-            }
-
-Endtile:
-            AR_BEGIN(BEEndTile, pDC->drawId);
-
-            work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
-            {
-                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            }
-
-#if USE_8x2_TILE_BACKEND
-            if (useAlternateOffset)
-            {
-                for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
-                {
-                    pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
-                }
-            }
-#else
-            for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
-            {
-                pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
-            }
-#endif
-            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
-            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
-
-            AR_END(BEEndTile, 0);
-
-            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
-            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
-        }
-
-        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
-        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
-    }
-
-    AR_END(BESingleSampleBackend, 0);
-}
-
-template<typename T>
-void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
-{
-    SWR_CONTEXT *pContext = pDC->pContext;
-
-    AR_BEGIN(BESampleRateBackend, pDC->drawId);
-    AR_BEGIN(BESetup, pDC->drawId);
-
-    const API_STATE &state = GetApiState(pDC);
-
-    BarycentricCoeffs coeffs;
-    SetupBarycentricCoeffs(&coeffs, work);
-
-    uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
-    SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
-
-    SWR_PS_CONTEXT psContext;
-    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
-    SetupPixelShaderContext<T>(&psContext, samplePos, work);
-
-    AR_END(BESetup, 0);
-
-    psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
-    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
-
-    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
-
-    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
-    {
-        psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
-        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
-
-        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
-
-        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
-        {
-#if USE_8x2_TILE_BACKEND
-            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
-
-#endif
-            if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
-            {
-                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
-
-                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
-            }
-
-            AR_BEGIN(BEBarycentric, pDC->drawId);
-
-            CalcPixelBarycentrics(coeffs, psContext);
-
-            CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
-
-            AR_END(BEBarycentric, 0);
-
-            for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
-            {
-                simdmask coverageMask = work.coverageMask[sample] & MASK;
-
-                if (coverageMask)
-                {
-                    // offset depth/stencil buffers current sample
-                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
-                    uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
-
-                    if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
-                    {
-                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
-
-                        const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
-
-                        const float minz = state.depthBoundsState.depthBoundsTestMinValue;
-                        const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
-
-                        coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
-                    }
-
-                    AR_BEGIN(BEBarycentric, pDC->drawId);
-
-                    // calculate per sample positions
-                    psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
-                    psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
-
-                    CalcSampleBarycentrics(coeffs, psContext);
-
-                    // interpolate and quantize z
-                    psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
-                    psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-
-                    AR_END(BEBarycentric, 0);
-
-                    // interpolate user clip distance if available
-                    if (state.rastState.clipDistanceMask)
-                    {
-                        coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
-                    }
-
-                    simdscalar vCoverageMask = vMask(coverageMask);
-                    simdscalar depthPassMask = vCoverageMask;
-                    simdscalar stencilPassMask = vCoverageMask;
-
-                    // Early-Z?
-                    if (T::bCanEarlyZ)
-                    {
-                        AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
-                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
-                                              psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
-                        AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
-                        AR_END(BEEarlyDepthTest, 0);
-
-                        // early-exit if no samples passed depth or earlyZ is forced on.
-                        if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
-                        {
-                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                                pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
-
-                            if (!_simd_movemask_ps(depthPassMask))
-                            {
-                                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-                                continue;
-                            }
-                        }
-                    }
-
-                    psContext.sampleIndex = sample;
-                    psContext.activeMask = _simd_castps_si(vCoverageMask);
-
-                    // execute pixel shader
-                    AR_BEGIN(BEPixelShader, pDC->drawId);
-                    UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
-                    state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
-                    AR_END(BEPixelShader, 0);
-
-                    vCoverageMask = _simd_castsi_ps(psContext.activeMask);
-
-                    // late-Z
-                    if (!T::bCanEarlyZ)
-                    {
-                        AR_BEGIN(BELateDepthTest, pDC->drawId);
-                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
-                                              psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
-                        AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
-                        AR_END(BELateDepthTest, 0);
-
-                        if (!_simd_movemask_ps(depthPassMask))
-                        {
-                            // need to call depth/stencil write for stencil write
-                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                                pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
-
-                            work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-                            continue;
-                        }
-                    }
-
-                    uint32_t statMask = _simd_movemask_ps(depthPassMask);
-                    uint32_t statCount = _mm_popcnt_u32(statMask);
-                    UPDATE_STAT_BE(DepthPassCount, statCount);
-
-                    // output merger
-                    AR_BEGIN(BEOutputMerger, pDC->drawId);
-#if USE_8x2_TILE_BACKEND
-                    OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
-#else
-                    OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
-#endif
-
-                    // do final depth write after all pixel kills
-                    if (!state.psState.forceEarlyZ)
-                    {
-                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                            pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
-                    }
-                    AR_END(BEOutputMerger, 0);
-                }
-                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            }
-
-Endtile:
-            ATTR_UNUSED;
-
-            AR_BEGIN(BEEndTile, pDC->drawId);
-
-            if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
-            {
-                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            }
-
-#if USE_8x2_TILE_BACKEND
-            if (useAlternateOffset)
-            {
-                for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
-                {
-                    pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
-                }
-            }
-#else
-            for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
-            {
-                pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
-            }
-#endif
-            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
-            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
-
-            AR_END(BEEndTile, 0);
-
-            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
-            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
-        }
-
-        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
-        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
-    }
-
-    AR_END(BESampleRateBackend, 0);
-}
-// optimized backend flow with NULL PS
 template<uint32_t sampleCountT>
 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
 {
@@ -974,7 +293,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
                     UPDATE_STAT_BE(DepthPassCount, statCount);
                 }
 
-Endtile:
+            Endtile:
                 ATTR_UNUSED;
                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
             }
@@ -991,17 +310,7 @@ Endtile:
     AR_END(BENullBackend, 0);
 }
 
-void InitClearTilesTable()
-{
-    memset(sClearTilesTable, 0, sizeof(sClearTilesTable));
-
-    sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
-    sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
-    sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
-    sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
-    sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
-}
-
+PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS] = {};
 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
 PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT]
                                      [2] // centroid
@@ -1020,113 +329,10 @@ PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
                                         [2] // canEarlyZ
                                         = {};
 
-// Recursive template used to auto-nest conditionals.  Converts dynamic enum function
-// arguments to static template arguments.
-template <uint32_t... ArgsT>
-struct BEChooser
-{
-    // Last Arg Terminator
-    static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
-    {
-        switch(tArg)
-        {
-        case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
-        case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<SwrBackendTraits<ArgsT...>>; break;
-        case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
-        default:
-            SWR_ASSERT(0 && "Invalid backend func\n");
-            return nullptr;
-            break;
-        }
-    }
-
-    // Recursively parse args
-    template <typename... TArgsT>
-    static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
-    {
-        switch(tArg)
-        {
-        case SWR_INPUT_COVERAGE_NONE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
-        case SWR_INPUT_COVERAGE_NORMAL: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
-        case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
-        default:
-        SWR_ASSERT(0 && "Invalid sample pattern\n");
-        return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
-        break;
-        }
-    }
-
-    // Recursively parse args
-    template <typename... TArgsT>
-    static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
-    {
-        switch(tArg)
-        {
-        case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
-        default:
-        SWR_ASSERT(0 && "Invalid sample count\n");
-        return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
-        break;
-        }
-    }
-
-    // Recursively parse args
-    template <typename... TArgsT>
-    static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
-    {
-        if(tArg == true)
-        {
-            return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
-        }
-
-        return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
-    }
-};
-
-void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
-{
-    for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
-    {
-        for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
-        {
-            for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
-            {
-                table[inputCoverage][isCentroid][canEarlyZ] =
-                    BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, false, (SWR_INPUT_COVERAGE)inputCoverage,
-                                         (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
-            }
-        }
-    }
-}
-
-void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
-{
-    for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
-    {
-        for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
-        {
-            for(uint32_t centroid = 0; centroid < 2; centroid++)
-            {
-                for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
-                {
-                    table[sampleCount][inputCoverage][centroid][canEarlyZ] =
-                        BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, false, (SWR_INPUT_COVERAGE)inputCoverage, 
-                                             (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
-                }
-            }
-        }
-    }
-}
-
-void InitBackendPixelRate0();
 void InitBackendFuncTables()
 {    
+    InitBackendPixelRate();
     InitBackendSingleFuncTable(gBackendSingleSample);
-    InitBackendPixelRate0();
     InitBackendSampleFuncTable(gBackendSampleRateTable);
 
     gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;