swr: [rasterizer] warning cleanup
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / backend.cpp
index 8c1858b92911596b5ae6218194f677e51d8616aa..d2547f398888b15f51cf410bca41c8a570e61f3f 100644 (file)
@@ -29,7 +29,6 @@
 
 #include <smmintrin.h>
 
-#include "rdtsc_core.h"
 #include "backend.h"
 #include "depthstencil.h"
 #include "tilemgr.h"
 
 #include <algorithm>
 
-const __m128 vTileOffsetsX = {0.5, KNOB_TILE_X_DIM - 0.5, 0.5, KNOB_TILE_X_DIM - 0.5};
-const __m128 vTileOffsetsY = {0.5, 0.5, KNOB_TILE_Y_DIM - 0.5, KNOB_TILE_Y_DIM - 0.5};
-
-/// @todo move to common lib
-#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
-static const __m128 gMaskToVec[] = {
-    MASKTOVEC(0,0,0,0),
-    MASKTOVEC(0,0,0,1),
-    MASKTOVEC(0,0,1,0),
-    MASKTOVEC(0,0,1,1),
-    MASKTOVEC(0,1,0,0),
-    MASKTOVEC(0,1,0,1),
-    MASKTOVEC(0,1,1,0),
-    MASKTOVEC(0,1,1,1),
-    MASKTOVEC(1,0,0,0),
-    MASKTOVEC(1,0,0,1),
-    MASKTOVEC(1,0,1,0),
-    MASKTOVEC(1,0,1,1),
-    MASKTOVEC(1,1,0,0),
-    MASKTOVEC(1,1,0,1),
-    MASKTOVEC(1,1,1,0),
-    MASKTOVEC(1,1,1,1),
-};
-
 typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, DWORD[4]);
 static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
 
@@ -70,7 +45,7 @@ static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
 /// @param pDC - pointer to draw context (dispatch).
 /// @param workerId - The unique worker ID that is assigned to this thread.
 /// @param threadGroupId - the linear index for the thread group within the dispatch.
-void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId)
+void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
 {
     RDTSC_START(BEDispatch);
 
@@ -80,10 +55,10 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup
     SWR_ASSERT(pTaskData != nullptr);
 
     // Ensure spill fill memory has been allocated.
-    if (pDC->pSpillFill[workerId] == nullptr)
+    size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
+    if (spillFillSize && pSpillFillBuffer == nullptr)
     {
-        ///@todo Add state which indicates the spill fill size.
-        pDC->pSpillFill[workerId] = (uint8_t*)pDC->pArena->AllocAlignedSync(4096 * 1024, sizeof(float) * 8);
+        pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES);
     }
 
     const API_STATE& state = GetApiState(pDC);
@@ -94,7 +69,7 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup
     csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
     csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
     csContext.pTGSM = pContext->pScratch[workerId];
-    csContext.pSpillFillBuffer = pDC->pSpillFill[workerId];
+    csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
 
     state.pfnCsFunc(GetPrivateState(pDC), &csContext);
 
@@ -156,7 +131,7 @@ void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTil
 }
 
 template<SWR_FORMAT format>
-void ClearRasterTile(BYTE *pTileBuffer, simdvector &value)
+void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
 {
     auto lambda = [&](int comp)
     {
@@ -299,10 +274,10 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo
             /// @todo clear data should come in as RGBA32_FLOAT
             DWORD clearData[4];
             float clearFloat[4];
-            clearFloat[0] = ((BYTE*)(&pClear->clearRTColor))[0] / 255.0f;
-            clearFloat[1] = ((BYTE*)(&pClear->clearRTColor))[1] / 255.0f;
-            clearFloat[2] = ((BYTE*)(&pClear->clearRTColor))[2] / 255.0f;
-            clearFloat[3] = ((BYTE*)(&pClear->clearRTColor))[3] / 255.0f;
+            clearFloat[0] = ((uint8_t*)(&pClear->clearRTColor))[0] / 255.0f;
+            clearFloat[1] = ((uint8_t*)(&pClear->clearRTColor))[1] / 255.0f;
+            clearFloat[2] = ((uint8_t*)(&pClear->clearRTColor))[2] / 255.0f;
+            clearFloat[3] = ((uint8_t*)(&pClear->clearRTColor))[3] / 255.0f;
             clearData[0] = *(DWORD*)&clearFloat[0];
             clearData[1] = *(DWORD*)&clearFloat[1];
             clearData[2] = *(DWORD*)&clearFloat[2];
@@ -399,30 +374,32 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile
 }
 
 
-void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
+void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
 {
-    INVALIDATE_TILES_DESC *pDesc = (INVALIDATE_TILES_DESC*)pData;
+    DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData;
     SWR_CONTEXT *pContext = pDC->pContext;
 
+    const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
+
     for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
     {
         if (pDesc->attachmentMask & (1 << i))
         {
-            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, false);
+            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(
+                pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples);
             if (pHotTile)
             {
-                pHotTile->state = HOTTILE_INVALID;
+                pHotTile->state = (HOTTILE_STATE)pDesc->newTileState;
             }
         }
     }
 }
 
 #if KNOB_SIMD_WIDTH == 8
-const __m256 vQuadCenterOffsetsX = { 0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5 };
-const __m256 vQuadCenterOffsetsY = { 0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5 };
-const __m256 vQuadULOffsetsX ={0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
-const __m256 vQuadULOffsetsY ={0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
-#define MASK 0xff
+const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
+const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
+const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
+const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
 #else
 #error Unsupported vector width
 #endif
@@ -457,390 +434,17 @@ simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscala
     return _simd_movemask_ps(vClipMask);
 }
 
-template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
-INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
-{
-
-    // will need to update for avx512
-    assert(KNOB_SIMD_WIDTH == 8);
-
-    __m256i mask[2];
-    __m256i sampleCoverage[2];
-    if(bIsStandardPattern)
-    {
-        __m256i src = _mm256_set1_epi32(0);
-        __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
-
-        if(MultisampleTraits<sampleCountT>::numSamples == 1)
-        {
-            mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 2)
-        {
-            mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 4)
-        {
-            mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 8)
-        {
-            mask[0] = _mm256_set1_epi32(-1);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 16)
-        {
-            mask[0] = _mm256_set1_epi32(-1);
-            mask[1] = _mm256_set1_epi32(-1);
-            index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
-        }
-
-        // gather coverage for samples 0-7
-        sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
-        if(MultisampleTraits<sampleCountT>::numSamples > 8)
-        {
-            // gather coverage for samples 8-15
-            sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
-        }
-    }
-    else
-    {
-        // center coverage is the same for all samples; just broadcast to the sample slots
-        uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
-        if(MultisampleTraits<sampleCountT>::numSamples == 1)
-        {
-            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 2)
-        {
-            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 4)
-        {
-            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 8)
-        {
-            sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 16)
-        {
-            sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
-            sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
-        }
-    }
-
-    mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
-                              -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
-    // pull out the the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
-    __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
-
-    __m256i packedCoverage1;
-    if(MultisampleTraits<sampleCountT>::numSamples > 8)
-    {
-        // pull out the the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
-        packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
-    }
-
-#if (KNOB_ARCH == KNOB_ARCH_AVX)
-    // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 
-    __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
-    __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
-    packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
-
-    __m256i packedSampleCoverage;
-    if(MultisampleTraits<sampleCountT>::numSamples > 8)
-    {
-        // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
-        hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
-        shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
-        shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
-        packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
-        packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
-    }
-    else
-    {
-        packedSampleCoverage = packedCoverage0;
-    }
-#else
-    __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
-    // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 
-    packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
-
-    __m256i packedSampleCoverage;
-    if(MultisampleTraits<sampleCountT>::numSamples > 8)
-    {
-        permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
-        // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
-        packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
-
-        // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
-        packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
-    }
-    else
-    {
-        packedSampleCoverage = packedCoverage0;
-    }
-#endif
-
-    for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
-    {
-        // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
-        inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
-
-        if(!bForcedSampleCount)
-        {
-            // input coverage has to be anded with sample mask if MSAA isn't forced on
-            inputMask[i] &= sampleMask;
-        }
-
-        // shift to the next pixel in the 4x2
-        packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
-    }
-}
-
-template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
-INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
-{
-    uint32_t inputMask[KNOB_SIMD_WIDTH]; 
-    generateInputCoverage<sampleCountT, bIsStandardPattern, bForcedSampleCount>(coverageMask, inputMask, sampleMask);
-    inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
-}
-
-template<bool perspMask>
-INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
-{
-    if(perspMask)
-    {
-        // evaluate I,J
-        psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
-        psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
-        psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
-        psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
-
-        // interpolate 1/w
-        psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
-    }
-}
-
-template<bool perspMask>
-INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
-{
-    if(perspMask)
-    {
-        // evaluate I,J
-        psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
-        psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
-        psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
-        psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
-
-        // interpolate 1/w
-        psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
-    }
-}
-
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Centroid behaves exactly as follows :
-// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to 
-//     have a sample location there).
-// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the 
-//     coverage with the SampleMask Rasterizer State.
-// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is 
-//     evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the 
-//     SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-template<SWR_MULTISAMPLE_COUNT sampleCount, bool bForcedSampleCount>
-INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const coverageMask, const uint32_t sampleMask,
-                            const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
-{
-    uint32_t inputMask[KNOB_SIMD_WIDTH];
-
-    generateInputCoverage<sampleCount, 1, bForcedSampleCount>(coverageMask, inputMask, sampleMask);
-
-    // Case (2) - partially covered pixel
-
-    // scan for first covered sample per pixel in the 4x2 span
-    unsigned long sampleNum[KNOB_SIMD_WIDTH];
-    (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
-    (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
-    (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
-    (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
-    (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
-    (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
-    (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
-    (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
-
-    // look up and set the sample offsets from UL pixel corner for first covered sample 
-    __m256 vXSample = _mm256_set_ps(MultisampleTraits<sampleCount>::X(sampleNum[7]),
-                                    MultisampleTraits<sampleCount>::X(sampleNum[6]),
-                                    MultisampleTraits<sampleCount>::X(sampleNum[5]),
-                                    MultisampleTraits<sampleCount>::X(sampleNum[4]),
-                                    MultisampleTraits<sampleCount>::X(sampleNum[3]),
-                                    MultisampleTraits<sampleCount>::X(sampleNum[2]),
-                                    MultisampleTraits<sampleCount>::X(sampleNum[1]),
-                                    MultisampleTraits<sampleCount>::X(sampleNum[0]));
-
-    __m256 vYSample = _mm256_set_ps(MultisampleTraits<sampleCount>::Y(sampleNum[7]),
-                                    MultisampleTraits<sampleCount>::Y(sampleNum[6]),
-                                    MultisampleTraits<sampleCount>::Y(sampleNum[5]),
-                                    MultisampleTraits<sampleCount>::Y(sampleNum[4]),
-                                    MultisampleTraits<sampleCount>::Y(sampleNum[3]),
-                                    MultisampleTraits<sampleCount>::Y(sampleNum[2]),
-                                    MultisampleTraits<sampleCount>::Y(sampleNum[1]),
-                                    MultisampleTraits<sampleCount>::Y(sampleNum[0]));
-    // add sample offset to UL pixel corner
-    vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
-    vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
-
-    // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
-    static const __m256i vFullyCoveredMask = MultisampleTraits<sampleCount>::FullSampleMask();
-    __m256i vInputCoveragei =  _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
-    __m256i vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
-
-    static const __m256i vZero = _simd_setzero_si();
-    const __m256i vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
-    __m256i vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
-    __m256i vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
-    __m256i vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
-
-    __m256i vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
-
-    // set the centroid position based on results from above
-    psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
-    psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
-
-    // Case (3a) No samples covered and partial sample mask
-    __m256i vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
-    // sample mask should never be all 0's for this case, but handle it anyways
-    unsigned long firstCoveredSampleMaskSample = 0;
-    (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
-
-    __m256i vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
-
-    vXSample = _simd_set1_ps(MultisampleTraits<sampleCount>::X(firstCoveredSampleMaskSample));
-    vYSample = _simd_set1_ps(MultisampleTraits<sampleCount>::Y(firstCoveredSampleMaskSample));
-
-    // blend in case 3a pixel locations
-    psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
-    psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
-}
-
-template<uint32_t sampleCount, uint32_t persp, uint32_t standardPattern, uint32_t forcedMultisampleCount>
-INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
-                                     const uint64_t *const coverageMask, const uint32_t sampleMask,
-                                     const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
-{
-    static const bool bPersp = (bool)persp;
-    static const bool bIsStandardPattern = (bool)standardPattern;
-    static const bool bForcedMultisampleCount = (bool)forcedMultisampleCount;
-
-    // calculate centroid positions
-    if(bPersp)
-    {
-        if(bIsStandardPattern)
-        {
-            ///@ todo: don't need to generate input coverage 2x if input coverage and centroid
-            CalcCentroidPos<(SWR_MULTISAMPLE_COUNT)sampleCount, bForcedMultisampleCount>(psContext, coverageMask, sampleMask, vXSamplePosUL, vYSamplePosUL);
-        }
-        else
-        {
-            static const __m256 pixelCenter = _simd_set1_ps(0.5f);
-            psContext.vX.centroid = _simd_add_ps(vXSamplePosUL, pixelCenter);
-            psContext.vY.centroid = _simd_add_ps(vYSamplePosUL, pixelCenter);
-        }
-        // evaluate I,J
-        psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
-        psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
-        psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
-        psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
-
-        // interpolate 1/w
-        psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
-    }
-}
-
-template<uint32_t NumRT, uint32_t sampleCountT>
-void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
-                  const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask)
-{
-    // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
-    static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
-    uint32_t rasterTileColorOffset = MultisampleTraits<sampleCount>::RasterTileColorOffset(sample);
-    simdvector blendOut;
-
-    for(uint32_t rt = 0; rt < NumRT; ++rt)
-    {
-        uint8_t *pColorSample;
-        if(sampleCount == SWR_MULTISAMPLE_1X)
-        {
-            pColorSample = pColorBase[rt];
-        }
-        else
-        {
-            pColorSample = pColorBase[rt] + rasterTileColorOffset;
-        }
-
-        const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
-        // pfnBlendFunc may not update all channels.  Initialize with PS output.
-        /// TODO: move this into the blend JIT.
-        blendOut = psContext.shaded[rt];
-
-        // Blend outputs and update coverage mask for alpha test
-        if(pfnBlendFunc[rt] != nullptr)
-        {
-            pfnBlendFunc[rt](
-                pBlendState,
-                psContext.shaded[rt],
-                psContext.shaded[1],
-                sample,
-                pColorSample,
-                blendOut,
-                &psContext.oMask,
-                (simdscalari*)&coverageMask);
-        }
-
-        // final write mask 
-        simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
-
-        ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
-        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
-
-        const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
-
-        // store with color mask
-        if(!pRTBlend->writeDisableRed)
-        {
-            _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
-        }
-        if(!pRTBlend->writeDisableGreen)
-        {
-            _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
-        }
-        if(!pRTBlend->writeDisableBlue)
-        {
-            _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
-        }
-        if(!pRTBlend->writeDisableAlpha)
-        {
-            _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
-        }
-    }
-}
-
-template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount>
+template<typename T>
 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
 {
+    RDTSC_START(BESingleSampleBackend);
     RDTSC_START(BESetup);
-    // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
-    static const bool bInputCoverage = (bool)inputCoverage;
-    static const bool bCentroidPos = (bool)centroidPos;
 
     SWR_CONTEXT *pContext = pDC->pContext;
     const API_STATE& state = GetApiState(pDC);
     const SWR_RASTSTATE& rastState = state.rastState;
     const SWR_PS_STATE *pPSState = &state.psState;
     const SWR_BLEND_STATE *pBlendState = &state.blendState;
-    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
     uint64_t coverageMask = work.coverageMask[0];
 
     // broadcast scalars
@@ -883,33 +487,33 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
     psContext.J = work.J;
     psContext.recipDet = work.recipDet;
     psContext.pRecipW = work.pRecipW;
-    psContext.pSamplePosX = (const float*)&MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosX;
-    psContext.pSamplePosY = (const float*)&MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosY;
+    psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
+    psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
 
     for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
         // UL pixel corner
-        psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
+        psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
         // pixel center
-        psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+        psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
 
         for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
-            if(bInputCoverage)
-            {
-                generateInputCoverage<SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, false>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
-            }
-
             if(coverageMask & MASK)
             {
-                RDTSC_START(BEBarycentric);
-                psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+                psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
                 // pixel center
-                psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+                psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
 
-                backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
+                if(T::bInputCoverage)
+                {
+                    generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
+                }
 
-                if(bCentroidPos)
+                RDTSC_START(BEBarycentric);
+                CalcPixelBarycentrics(coeffs, psContext);
+
+                if(T::bCentroidPos)
                 {
                     // for 1x case, centroid is pixel center
                     psContext.vX.centroid = psContext.vX.center;
@@ -919,12 +523,12 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                     psContext.vOneOverW.centroid = psContext.vOneOverW.center;
                 }
 
-                // interpolate z
+                // interpolate and quantize z
                 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
                 RDTSC_STOP(BEBarycentric, 0, 0);
 
                 simdmask clipCoverageMask = coverageMask & MASK;
-
                 // interpolate user clip distance if available
                 if(rastState.clipDistanceMask)
                 {
@@ -937,10 +541,10 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 simdscalar stencilPassMask = vCoverageMask;
 
                 // Early-Z?
-                if(CanEarlyZ(pPSState))
+                if(T::bCanEarlyZ)
                 {
                     RDTSC_START(BEEarlyDepthTest);
-                    depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
                                                         psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
                     RDTSC_STOP(BEEarlyDepthTest, 0, 0);
 
@@ -969,10 +573,10 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
 
                 // late-Z
-                if(!CanEarlyZ(pPSState))
+                if(!T::bCanEarlyZ)
                 {
                     RDTSC_START(BELateDepthTest);
-                    depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
                                                         psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
                     RDTSC_STOP(BELateDepthTest, 0, 0);
 
@@ -991,8 +595,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
 
                 // output merger
                 RDTSC_START(BEOutputMerger);
-                backendFuncs.pfnOutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc,
-                                             vCoverageMask, depthPassMask);
+                OutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets);
 
                 // do final depth write after all pixel kills
                 if (!pPSState->forceEarlyZ)
@@ -1016,16 +619,13 @@ Endtile:
             RDTSC_STOP(BEEndTile, 0, 0);
         }
     }
+    RDTSC_STOP(BESingleSampleBackend, 0, 0);
 }
 
-template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount>
+template<typename T>
 void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
 {
-    // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
-    static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
-    static const bool bInputCoverage = (bool)inputCoverage;
-    static const bool bCentroidPos = (bool)centroidPos;
-
+    RDTSC_START(BESampleRateBackend);
     RDTSC_START(BESetup);
 
     SWR_CONTEXT *pContext = pDC->pContext;
@@ -1033,7 +633,6 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
     const SWR_RASTSTATE& rastState = state.rastState;
     const SWR_PS_STATE *pPSState = &state.psState;
     const SWR_BLEND_STATE *pBlendState = &state.blendState;
-    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
 
     // broadcast scalars
     BarycentricCoeffs coeffs;
@@ -1075,58 +674,54 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
     psContext.I = work.I;
     psContext.J = work.J;
     psContext.recipDet = work.recipDet;
-    psContext.pSamplePosX = (const float*)&MultisampleTraits<sampleCount>::samplePosX;
-    psContext.pSamplePosY = (const float*)&MultisampleTraits<sampleCount>::samplePosY;
-    const uint32_t numSamples = MultisampleTraits<sampleCount>::numSamples;
+    psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
+    psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
 
     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
         // UL pixel corner
-        psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
+        psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
         // pixel center
-        psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+        psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
         
         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
-            psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+            psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
             // pixel center
-            psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+            psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
 
             RDTSC_START(BEBarycentric);
-            backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
+            CalcPixelBarycentrics(coeffs, psContext);
             RDTSC_STOP(BEBarycentric, 0, 0);
 
-            if(bInputCoverage)
+            if(T::bInputCoverage)
             {
-                generateInputCoverage<sampleCount, SWR_MSAA_STANDARD_PATTERN, false>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
+                generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
             }
 
-            if(bCentroidPos)
+            if(T::bCentroidPos)
             {
                 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
                 RDTSC_START(BEBarycentric);
-                backendFuncs.pfnCalcCentroidBarycentrics(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
+                CalcCentroidBarycentrics<T>(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
                 RDTSC_STOP(BEBarycentric, 0, 0);
             }
 
-            for(uint32_t sample = 0; sample < numSamples; sample++)
+            for(uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
             {
-                if (work.coverageMask[sample] & MASK)
+                simdmask coverageMask = work.coverageMask[sample] & MASK;
+                if (coverageMask)
                 {
                     RDTSC_START(BEBarycentric);
-
                     // calculate per sample positions
-                    psContext.vX.sample = _simd_add_ps(psContext.vX.UL, MultisampleTraits<sampleCount>::vX(sample));
-                    psContext.vY.sample = _simd_add_ps(psContext.vY.UL, MultisampleTraits<sampleCount>::vY(sample));
-                    
-                    simdmask coverageMask = work.coverageMask[sample] & MASK;
-                    simdscalar vCoverageMask = vMask(coverageMask);
+                    psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
+                    psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
 
-                    backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
+                    CalcSampleBarycentrics(coeffs, psContext);
 
-                    // interpolate z
+                    // interpolate and quantize z
                     psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
-
+                    psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
                     RDTSC_STOP(BEBarycentric, 0, 0);
 
                     // interpolate user clip distance if available
@@ -1135,19 +730,20 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                         coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
                             psContext.vI.sample, psContext.vJ.sample);
                     }
-                    
+
+                    simdscalar vCoverageMask = vMask(coverageMask);
                     simdscalar depthPassMask = vCoverageMask;
                     simdscalar stencilPassMask = vCoverageMask;
 
                     // offset depth/stencil buffers current sample
-                    uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
-                    uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
+                    uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
+                    uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
 
                     // Early-Z?
-                    if (CanEarlyZ(pPSState))
+                    if (T::bCanEarlyZ)
                     {
                         RDTSC_START(BEEarlyDepthTest);
-                        depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
                                               psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                         RDTSC_STOP(BEEarlyDepthTest, 0, 0);
 
@@ -1176,11 +772,11 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
 
                     vCoverageMask = _simd_castsi_ps(psContext.activeMask);
 
-                    //// late-Z
-                    if (!CanEarlyZ(pPSState))
+                    // late-Z
+                    if (!T::bCanEarlyZ)
                     {
                         RDTSC_START(BELateDepthTest);
-                        depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
                                               psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                         RDTSC_STOP(BELateDepthTest, 0, 0);
 
@@ -1201,8 +797,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
 
                     // output merger
                     RDTSC_START(BEOutputMerger);
-                    backendFuncs.pfnOutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, 
-                                                 vCoverageMask, depthPassMask);
+                    OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets);
 
                     // do final depth write after all pixel kills
                     if (!pPSState->forceEarlyZ)
@@ -1225,18 +820,13 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
             RDTSC_STOP(BEEndTile, 0, 0);
         }
     }
+    RDTSC_STOP(BESampleRateBackend, 0, 0);
 }
 
-template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount>
+template<typename T>
 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
 {
-    // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
-    static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
-    static const bool bIsStandardPattern = (bool)samplePattern;
-    static const bool bInputCoverage = (bool)inputCoverage;
-    static const bool bCentroidPos = (bool)centroidPos;
-    static const bool bForcedSampleCount = (bool)forcedSampleCount;
-
+    RDTSC_START(BEPixelRateBackend);
     RDTSC_START(BESetup);
 
     SWR_CONTEXT *pContext = pDC->pContext;
@@ -1244,7 +834,6 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
     const SWR_RASTSTATE& rastState = state.rastState;
     const SWR_PS_STATE *pPSState = &state.psState;
     const SWR_BLEND_STATE *pBlendState = &state.blendState;
-    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
 
     // broadcast scalars
     BarycentricCoeffs coeffs;
@@ -1285,260 +874,152 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
     psContext.I = work.I;
     psContext.J = work.J;
     psContext.recipDet = work.recipDet;
-    psContext.pSamplePosX = (const float*)&MultisampleTraits<sampleCount>::samplePosX;
-    psContext.pSamplePosY = (const float*)&MultisampleTraits<sampleCount>::samplePosY;
+    psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
+    psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
     psContext.sampleIndex = 0;
-
-    uint32_t numCoverageSamples;
-    if(bIsStandardPattern)
-    {
-        numCoverageSamples = MultisampleTraits<sampleCount>::numSamples;
-    }
-    else
-    {
-        numCoverageSamples = 1;
-    }
-
-    uint32_t numOMSamples;
-    // RT has to be single sample if we're in forcedMSAA mode
-    if(bForcedSampleCount && (sampleCount > SWR_MULTISAMPLE_1X))
-    {
-        numOMSamples = 1;
-    }
-    // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
-    else if(bForcedSampleCount && (sampleCount == SWR_MULTISAMPLE_1X))
-    {
-        numOMSamples = GetNumSamples(pBlendState->sampleCount);
-    }
-    // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
-    else
-    {
-        numOMSamples = MultisampleTraits<sampleCount>::numSamples;
-    }
     
+    PixelRateZTestLoop<T> PixelRateZTest(pDC, work, coeffs, state, pDepthBase, pStencilBase, rastState.clipDistanceMask);
+
     for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
-        psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
-        psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+        psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
+        psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
         for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
-            simdscalar vZ[MultisampleTraits<sampleCount>::numSamples];
-            psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+            if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
+
+            psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
             // set pixel center positions
-            psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+            psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
 
-            if (bInputCoverage)
+            RDTSC_START(BEBarycentric);
+            CalcPixelBarycentrics(coeffs, psContext);
+            RDTSC_STOP(BEBarycentric, 0, 0);
+
+            if (T::bInputCoverage)
             {
-                generateInputCoverage<sampleCount, bIsStandardPattern, bForcedSampleCount>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
+                generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
             }
 
-            if(bCentroidPos)
+            if(T::bCentroidPos)
             {
                 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
                 RDTSC_START(BEBarycentric);
-                backendFuncs.pfnCalcCentroidBarycentrics(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
+                CalcCentroidBarycentrics<T>(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
                 RDTSC_STOP(BEBarycentric, 0, 0);
             }
 
-            // if oDepth written to, or there is a potential to discard any samples, we need to 
-            // run the PS early, then interp or broadcast Z and test
-            if(pPSState->writesODepth || pPSState->killsPixel)
+                       simdscalar activeLanes;
+            if(T::bForcedSampleCount)
             {
-                RDTSC_START(BEBarycentric);
-                backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
-
-                // interpolate z
-                psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
-                RDTSC_STOP(BEBarycentric, 0, 0);
+                // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
+                const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si()));
+                activeLanes = _simd_and_ps(vMask(work.anyCoveredSamples & MASK), vSampleMask);
+            }
 
-                // execute pixel shader
-                RDTSC_START(BEPixelShader);
-                state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
-                RDTSC_STOP(BEPixelShader, 0, 0);
+            // Early-Z?
+            if(T::bCanEarlyZ && !T::bForcedSampleCount)
+            {
+                activeLanes = _simd_setzero_ps();
+                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
+                UPDATE_STAT(DepthPassCount, depthPassCount);
             }
-            else
+            // if we can't do early z, set the active mask to any samples covered in the current simd
+            else if(!T::bCanEarlyZ && !T::bForcedSampleCount)
             {
-                               psContext.activeMask = _simd_set1_epi32(-1);
+                activeLanes = vMask(work.anyCoveredSamples & MASK);
             }
 
-            // need to declare enough space for all samples
-            simdscalar vCoverageMask[MultisampleTraits<sampleCount>::numSamples];
-            simdscalar depthPassMask[MultisampleTraits<sampleCount>::numSamples]; 
-            simdscalar stencilPassMask[MultisampleTraits<sampleCount>::numSamples];
-            simdscalar anyDepthSamplePassed = _simd_setzero_ps();
-            simdscalar anyStencilSamplePassed = _simd_setzero_ps();
-            for(uint32_t sample = 0; sample < numCoverageSamples; sample++)
+            // if we have no covered samples that passed depth at this point, go to next tile
+            if(!_simd_movemask_ps(activeLanes))
             {
-                vCoverageMask[sample] = vMask(work.coverageMask[sample] & MASK);
-
-                // pull mask back out for any discards and and with coverage
-                vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_castsi_ps(psContext.activeMask));
-
-                if (!_simd_movemask_ps(vCoverageMask[sample]))
-                {
-                    vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] =  _simd_setzero_ps();
-                    continue;
-                }
-
-                if(bForcedSampleCount)
-                {
-                    // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
-                    const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si()));
-                    anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, _simd_and_ps(vCoverageMask[sample], vSampleMask));
-                    continue;
-                }
-
-                depthPassMask[sample] = vCoverageMask[sample];
-
-                // if oDepth isn't written to, we need to interpolate Z for each sample
-                // if clip distances are enabled, we need to interpolate for each sample
-                if(!pPSState->writesODepth || rastState.clipDistanceMask)
-                {
-                    RDTSC_START(BEBarycentric);
-                    if(bIsStandardPattern)
-                    {
-                        // calculate per sample positions
-                        psContext.vX.sample = _simd_add_ps(psContext.vX.UL, MultisampleTraits<sampleCount>::vX(sample));
-                        psContext.vY.sample = _simd_add_ps(psContext.vY.UL, MultisampleTraits<sampleCount>::vY(sample));
-                    }
-                    else
-                    {
-                        psContext.vX.sample = psContext.vX.center;
-                        psContext.vY.sample = psContext.vY.center;
-                    }
-
-                    // calc I & J per sample
-                    backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
-
-                    // interpolate z
-                    if (!pPSState->writesODepth)
-                    {
-                        vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
-                    }
-                    
-                    ///@todo: perspective correct vs non-perspective correct clipping?
-                    // interpolate clip distances
-                    if (rastState.clipDistanceMask)
-                    {
-                        uint8_t clipMask = ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
-                            psContext.vI.sample, psContext.vJ.sample);
-                        vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
-                    }
-                    RDTSC_STOP(BEBarycentric, 0, 0);
-                }
-                // else 'broadcast' and test psContext.vZ written from the PS each sample
-                else
-                {
-                    vZ[sample] = psContext.vZ;
-                }
-
-                // offset depth/stencil buffers current sample
-                uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
-                uint8_t * pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
-
-                // ZTest for this sample
-                RDTSC_START(BEEarlyDepthTest);
-                stencilPassMask[sample] = vCoverageMask[sample];
-                depthPassMask[sample] = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
-                                        vZ[sample], pDepthSample, vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]);
-                RDTSC_STOP(BEEarlyDepthTest, 0, 0);
-
-                anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
-                anyStencilSamplePassed = _simd_or_ps(anyStencilSamplePassed, stencilPassMask[sample]);
-                uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
-                uint32_t statCount = _mm_popcnt_u32(statMask);
-                UPDATE_STAT(DepthPassCount, statCount);
+                goto Endtile;
             }
 
-            // if we didn't have to execute the PS early, and at least 1 sample passed the depth test, run the PS
-            if(!pPSState->writesODepth && !pPSState->killsPixel && _simd_movemask_ps(anyDepthSamplePassed))
+            if(pPSState->usesSourceDepth)
             {
                 RDTSC_START(BEBarycentric);
-                backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
-                // interpolate z
+                // interpolate and quantize z
                 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
                 RDTSC_STOP(BEBarycentric, 0, 0);
+            }
 
-                // execute pixel shader
-                RDTSC_START(BEPixelShader);
-                state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
-                RDTSC_STOP(BEPixelShader, 0, 0);
+            // pixels that are currently active
+            psContext.activeMask = _simd_castps_si(activeLanes);
+            psContext.oMask = T::MultisampleT::FullSampleMask();
+
+            // execute pixel shader
+            RDTSC_START(BEPixelShader);
+            state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+            UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
+            RDTSC_STOP(BEPixelShader, 0, 0);
+
+            // update active lanes to remove any discarded or oMask'd pixels
+            activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
+            if(!_simd_movemask_ps(activeLanes))
+            {
+                goto Endtile;
             }
-            ///@todo: make sure this works for kill pixel
-            else if(!_simd_movemask_ps(anyStencilSamplePassed))
+
+            // late-Z
+            if(!T::bCanEarlyZ && !T::bForcedSampleCount)
+            {
+                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
+                UPDATE_STAT(DepthPassCount, depthPassCount);
+            }
+
+            // if we have no covered samples that passed depth at this point, skip OM and go to next tile
+            if(!_simd_movemask_ps(activeLanes))
             {
                 goto Endtile;
             }
 
+            // output merger
             // loop over all samples, broadcasting the results of the PS to all passing pixels
-            for(uint32_t sample = 0; sample < numOMSamples; sample++)
+            for(uint32_t sample = 0; sample < GetNumOMSamples<T>(pBlendState->sampleCount); sample++)
             {
-                uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
-                uint8_t * pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
-
-                // output merger
                 RDTSC_START(BEOutputMerger);
-
-                // skip if none of the pixels for this sample passed
-                simdscalar coverageMaskSample;
-                simdscalar depthMaskSample;
-                simdscalar stencilMaskSample;
-                simdscalar vInterpolatedZ;
-
-                // forcedSampleCount outputs to any pixels with covered samples not masked off by SampleMask
-                // depth test is disabled, so just set the z val to 0.
-                if(bForcedSampleCount)
+                // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
+                uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0;
+                simdscalar coverageMask, depthMask;
+                if(T::bForcedSampleCount)
                 {
-                    coverageMaskSample = depthMaskSample = anyDepthSamplePassed;
-                    vInterpolatedZ = _simd_setzero_ps();
-                }
-                else if(bIsStandardPattern)
-                {
-                    if(!_simd_movemask_ps(depthPassMask[sample]))
-                    {
-                        depthPassMask[sample] = _simd_setzero_ps();
-                        DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample], pDepthSample, depthPassMask[sample],
-                                          vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
-                        continue;
-                    }
-                    coverageMaskSample = vCoverageMask[sample];
-                    depthMaskSample = depthPassMask[sample];
-                    stencilMaskSample = stencilPassMask[sample];
-                    vInterpolatedZ = vZ[sample];
+                    coverageMask = depthMask = activeLanes;
                 }
                 else
                 {
-                    // center pattern only needs to use a single depth test as all samples are at the same position
-                    if(!_simd_movemask_ps(depthPassMask[0]))
+                    coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
+                    depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
+                    if(!_simd_movemask_ps(depthMask))
                     {
-                        depthPassMask[0] = _simd_setzero_ps();
-                        DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[0], pDepthSample, depthPassMask[0],
-                                          vCoverageMask[0], pStencilSample, stencilPassMask[0]);
+                        // stencil should already have been written in early/lateZ tests
+                        RDTSC_STOP(BEOutputMerger, 0, 0);
                         continue;
                     }
-                    coverageMaskSample = (vCoverageMask[0]);
-                    depthMaskSample = depthPassMask[0];
-                    stencilMaskSample = stencilPassMask[0];
-                    vInterpolatedZ = vZ[0];
                 }
+                
+                // broadcast the results of the PS to all passing pixels
+                OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, coverageMask, depthMask, pPSState->numRenderTargets);
 
-                // output merger
-                RDTSC_START(BEOutputMerger);
-                backendFuncs.pfnOutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc,
-                                             coverageMaskSample, depthMaskSample);
+                if(!pPSState->forceEarlyZ && !T::bForcedSampleCount)
+                {
+                    uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
+                    uint8_t * pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
 
-                DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vInterpolatedZ, pDepthSample, depthMaskSample,
-                                  coverageMaskSample, pStencilSample, stencilMaskSample);
-                RDTSC_STOP(BEOutputMerger, 0, 0);
+                    DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
+                                      pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
+                }
+                RDTSC_STOP(BEOutputMerger, 0, 0);        
             }
-
 Endtile:
             RDTSC_START(BEEndTile);
-            for(uint32_t sample = 0; sample < numCoverageSamples; sample++)
+            for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
             {
                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
             }
 
+            work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
             pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
             pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
 
@@ -1549,17 +1030,20 @@ Endtile:
             RDTSC_STOP(BEEndTile, 0, 0);
         }
     }
+    RDTSC_STOP(BEPixelRateBackend, 0, 0);
 }
 // optimized backend flow with NULL PS
 template<uint32_t sampleCountT>
 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
 {
+    RDTSC_START(BENullBackend);
+    ///@todo: handle center multisample pattern
+    typedef SwrBackendTraits<sampleCountT, SWR_MSAA_STANDARD_PATTERN> T;
     RDTSC_START(BESetup);
 
-    static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
     SWR_CONTEXT *pContext = pDC->pContext;
     const API_STATE& state = GetApiState(pDC);
-    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
+    const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
 
     // broadcast scalars
     BarycentricCoeffs coeffs;
@@ -1577,7 +1061,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
 
     coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
 
-    BYTE *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
+    uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
 
     RDTSC_STOP(BESetup, 0, 0);
 
@@ -1585,12 +1069,12 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
         // UL pixel corner
-        simdscalar vYSamplePosUL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
+        simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
 
         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
             // UL pixel corners
-            simdscalar vXSamplePosUL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+            simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
 
             // iterate over active samples
             unsigned long sample = 0;
@@ -1598,29 +1082,38 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
             while (_BitScanForward(&sample, sampleMask))
             {
                 sampleMask &= ~(1 << sample);
-                if (work.coverageMask[sample] & MASK)
+                simdmask coverageMask = work.coverageMask[sample] & MASK;
+                if (coverageMask)
                 {
                     RDTSC_START(BEBarycentric);
                     // calculate per sample positions
-                    psContext.vX.sample = _simd_add_ps(vXSamplePosUL, MultisampleTraits<sampleCount>::vX(sample));
-                    psContext.vY.sample = _simd_add_ps(vYSamplePosUL, MultisampleTraits<sampleCount>::vY(sample));
+                    psContext.vX.sample = _simd_add_ps(vXSamplePosUL, T::MultisampleT::vX(sample));
+                    psContext.vY.sample = _simd_add_ps(vYSamplePosUL, T::MultisampleT::vY(sample));
 
-                    backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
+                    CalcSampleBarycentrics(coeffs, psContext);
 
-                    // interpolate z
+                    // interpolate and quantize z
                     psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+                    psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
 
                     RDTSC_STOP(BEBarycentric, 0, 0);
 
-                    simdscalar vCoverageMask = vMask(work.coverageMask[sample] & MASK);
+                    // interpolate user clip distance if available
+                    if (rastState.clipDistanceMask)
+                    {
+                        coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
+                            psContext.vI.sample, psContext.vJ.sample);
+                    }
+
+                    simdscalar vCoverageMask = vMask(coverageMask);
                     simdscalar stencilPassMask = vCoverageMask;
 
                     // offset depth/stencil buffers current sample
-                    uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
-                    uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
+                    uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
+                    uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
 
                     RDTSC_START(BEEarlyDepthTest);
-                    simdscalar depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                    simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
                         psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                     DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                         pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
@@ -1636,6 +1129,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
             pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
         }
     }
+    RDTSC_STOP(BENullBackend, 0, 0);
 }
 
 void InitClearTilesTable()
@@ -1650,90 +1144,37 @@ void InitClearTilesTable()
 }
 
 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
-PFN_BACKEND_FUNC gBackendSingleSample[2][2] = {};
-PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2] = {};
-PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2] = {};
-PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX] = {};
-PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2] = {};
-PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2] = {};
-PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2] = {};
+PFN_BACKEND_FUNC gBackendSingleSample[2] // input coverage
+                                     [2] // centroid
+                                     [2] // canEarlyZ
+                                     = {};
+PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX]
+                                       [SWR_MSAA_SAMPLE_PATTERN_MAX]
+                                       [SWR_INPUT_COVERAGE_MAX]
+                                       [2] // centroid
+                                       [2] // forcedSampleCount
+                                       [2] // canEarlyZ
+                                       = {};
+PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX]
+                                        [2] // centroid
+                                        [2] // canEarlyZ
+                                        = {};
 
 // Recursive template used to auto-nest conditionals.  Converts dynamic enum function
 // arguments to static template arguments.
 template <uint32_t... ArgsT>
-struct OMChooser
-{
-    // Last Arg Terminator
-    static PFN_OUTPUT_MERGER GetFunc(SWR_MULTISAMPLE_COUNT tArg)
-    {
-        switch(tArg)
-        {
-        case SWR_MULTISAMPLE_1X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_1X>; break;
-        case SWR_MULTISAMPLE_2X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_2X>; break;
-        case SWR_MULTISAMPLE_4X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_4X>; break;
-        case SWR_MULTISAMPLE_8X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_8X>; break;
-        case SWR_MULTISAMPLE_16X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_16X>; break;
-        default:
-            SWR_ASSERT(0 && "Invalid sample count\n");
-            return nullptr;
-            break;
-        }
-    }
-
-    // Recursively parse args
-    template <typename... TArgsT>
-    static PFN_OUTPUT_MERGER GetFunc(uint32_t tArg, TArgsT... remainingArgs)
-    {
-        switch(tArg)
-        {
-        case 0: return OMChooser<ArgsT..., 0>::GetFunc(remainingArgs...); break;
-        case 1: return OMChooser<ArgsT..., 1>::GetFunc(remainingArgs...); break;
-        case 2: return OMChooser<ArgsT..., 2>::GetFunc(remainingArgs...); break;
-        case 3: return OMChooser<ArgsT..., 3>::GetFunc(remainingArgs...); break;
-        case 4: return OMChooser<ArgsT..., 4>::GetFunc(remainingArgs...); break;
-        case 5: return OMChooser<ArgsT..., 5>::GetFunc(remainingArgs...); break;
-        case 6: return OMChooser<ArgsT..., 6>::GetFunc(remainingArgs...); break;
-        case 7: return OMChooser<ArgsT..., 7>::GetFunc(remainingArgs...); break;
-        case 8: return OMChooser<ArgsT..., 8>::GetFunc(remainingArgs...); break;
-        default:
-            SWR_ASSERT(0 && "Invalid RT index\n");
-            return nullptr;
-            break;
-        }
-    }
-};
-
-// Recursive template used to auto-nest conditionals.  Converts dynamic enum function
-// arguments to static template arguments.
-template <uint32_t... ArgsT>
-struct BECentroidBarycentricChooser
+struct BEChooser
 {
-
     // Last Arg Terminator
-    template <typename... TArgsT>
-    static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(uint32_t tArg)
-    {
-        if(tArg > 0)
-        {
-            return CalcCentroidBarycentrics<ArgsT..., 1>;
-        }
-
-        return CalcCentroidBarycentrics<ArgsT..., 0>;
-    }
-
-    // Recursively parse args
-    template <typename... TArgsT>
-    static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
+    static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
     {
         switch(tArg)
         {
-        case SWR_MULTISAMPLE_1X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_2X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_4X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_8X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_16X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
+        case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
+        case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<SwrBackendTraits<ArgsT...>>; break;
+        case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
         default:
-            SWR_ASSERT(0 && "Invalid sample count\n");
+            SWR_ASSERT(0 && "Invalid backend func\n");
             return nullptr;
             break;
         }
@@ -1741,38 +1182,19 @@ struct BECentroidBarycentricChooser
 
     // Recursively parse args
     template <typename... TArgsT>
-    static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(uint32_t tArg, TArgsT... remainingArgs)
-    {
-        if(tArg > 0)
-        {
-            return BECentroidBarycentricChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
-        }
-
-        return BECentroidBarycentricChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
-    }
-};
-
-// Recursive template used to auto-nest conditionals.  Converts dynamic enum function
-// arguments to static template arguments.
-template <uint32_t... ArgsT>
-struct BEChooser
-{
-    // Last Arg Terminator
-    static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
+    static PFN_BACKEND_FUNC GetFunc(SWR_MSAA_SAMPLE_PATTERN tArg, TArgsT... remainingArgs)
     {
         switch(tArg)
         {
-        case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<ArgsT...>; break;
-        case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<ArgsT...>; break;
-        case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<ArgsT...>; break;
+        case SWR_MSAA_CENTER_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_CENTER_PATTERN>::GetFunc(remainingArgs...); break;
+        case SWR_MSAA_STANDARD_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...); break;
         default:
-            SWR_ASSERT(0 && "Invalid backend func\n");
-            return nullptr;
-            break;
+        SWR_ASSERT(0 && "Invalid sample pattern\n");
+        return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...);
+        break;
         }
     }
 
-
     // Recursively parse args
     template <typename... TArgsT>
     static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
@@ -1785,116 +1207,91 @@ struct BEChooser
         case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
         case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
         default:
-            SWR_ASSERT(0 && "Invalid sample count\n");
-            return nullptr;
-            break;
+        SWR_ASSERT(0 && "Invalid sample count\n");
+        return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
+        break;
         }
     }
 
     // Recursively parse args
     template <typename... TArgsT>
-    static PFN_BACKEND_FUNC GetFunc(uint32_t tArg, TArgsT... remainingArgs)
+    static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
     {
-        if(tArg > 0)
+        if(tArg == true)
         {
             return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
         }
 
-         return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
+        return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
     }
 };
 
-template <uint32_t numRenderTargets, SWR_MULTISAMPLE_COUNT numSampleRates>
-void InitBackendOMFuncTable(PFN_OUTPUT_MERGER (&table)[numRenderTargets][numSampleRates])
+void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[2][2][2])
 {
-    for(uint32_t rtNum = SWR_ATTACHMENT_COLOR0; rtNum < numRenderTargets; rtNum++)
+    for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < SWR_INPUT_COVERAGE_MAX; inputCoverage++)
     {
-        for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
+        for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
         {
-            table[rtNum][sampleCount] =
-                OMChooser<>::GetFunc((SWR_RENDERTARGET_ATTACHMENT)rtNum, (SWR_MULTISAMPLE_COUNT)sampleCount);
+            for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
+            {
+                table[inputCoverage][isCentroid][canEarlyZ] =
+                    BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL),
+                                         (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
+            }
         }
     }
 }
 
-template <SWR_MULTISAMPLE_COUNT numSampleRates>
-void InitBackendBarycentricsTables(PFN_CALC_PIXEL_BARYCENTRICS (&pixelTable)[2], 
-                                   PFN_CALC_SAMPLE_BARYCENTRICS (&sampleTable)[2],
-                                   PFN_CALC_CENTROID_BARYCENTRICS (&centroidTable)[numSampleRates][2][2][2])
+void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX]
+                                                        [2][2][2])
 {
-    pixelTable[0] = CalcPixelBarycentrics<0>;
-    pixelTable[1] = CalcPixelBarycentrics<1>;
-
-    sampleTable[0] = CalcSampleBarycentrics<0>;
-    sampleTable[1] = CalcSampleBarycentrics<1>;
-
-    for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
+    for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_MAX; sampleCount++)
     {
-        for(uint32_t baryMask = 0; baryMask < 2; baryMask++)
+        for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < SWR_MSAA_SAMPLE_PATTERN_MAX; samplePattern++)
         {
-            for(uint32_t patternNum = 0; patternNum < 2; patternNum++)
+            for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < SWR_INPUT_COVERAGE_MAX; inputCoverage++)
             {
-                for(uint32_t forcedSampleEnable = 0; forcedSampleEnable < 2; forcedSampleEnable++)
+                for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
                 {
-                    centroidTable[sampleCount][baryMask][patternNum][forcedSampleEnable]=
-                        BECentroidBarycentricChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, baryMask, patternNum, forcedSampleEnable);
+                    for(uint32_t forcedSampleCount = 0; forcedSampleCount < 2; forcedSampleCount++)
+                    {
+                        for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
+                        {
+                            table[sampleCount][samplePattern][inputCoverage][isCentroid][forcedSampleCount][canEarlyZ] =
+                                BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), 
+                                                        (isCentroid > 0), (forcedSampleCount > 0), (canEarlyZ > 0), SWR_BACKEND_MSAA_PIXEL_RATE);
+                        }
+                    }
                 }
             }
         }
     }
 }
 
-void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[2][2])
+void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2][2])
 {
-    gBackendSingleSample[0][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NONE, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
-    gBackendSingleSample[0][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NONE, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
-    gBackendSingleSample[1][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NORMAL, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
-    gBackendSingleSample[1][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NORMAL, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
-}
-
-template <SWR_MULTISAMPLE_COUNT numSampleRates, SWR_MSAA_SAMPLE_PATTERN numSamplePatterns, SWR_INPUT_COVERAGE numCoverageModes>
-void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numSamplePatterns][numCoverageModes][2][2])
-{
-    for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
+    for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_MAX; sampleCount++)
     {
-        for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < numSamplePatterns; samplePattern++)
+        for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < SWR_INPUT_COVERAGE_MAX; inputCoverage++)
         {
-            for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < numCoverageModes; inputCoverage++)
+            for(uint32_t centroid = 0; centroid < 2; centroid++)
             {
-                for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
+                for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
                 {
-                    table[sampleCount][samplePattern][inputCoverage][isCentroid][0] =
-                        BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, samplePattern, inputCoverage, isCentroid, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_PIXEL_RATE);
-                    table[sampleCount][samplePattern][inputCoverage][isCentroid][1] =
-                        BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, samplePattern, inputCoverage, isCentroid, 1, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_PIXEL_RATE);
+                    table[sampleCount][inputCoverage][centroid][canEarlyZ] =
+                        BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), 
+                                             (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
                 }
             }
         }
     }
 }
 
-template <uint32_t numSampleRates, uint32_t numCoverageModes>
-void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numCoverageModes][2])
-{
-    for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
-    {
-        for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < numCoverageModes; inputCoverage++)
-        {
-            table[sampleCount][inputCoverage][0] =
-                BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, inputCoverage, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
-            table[sampleCount][inputCoverage][1] =
-                BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, inputCoverage, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
-        }
-    }
-}
-
 void InitBackendFuncTables()
 {    
-    InitBackendSampleFuncTable(gBackendSingleSample);
-    InitBackendPixelFuncTable<(SWR_MULTISAMPLE_COUNT)SWR_MULTISAMPLE_TYPE_MAX, SWR_MSAA_SAMPLE_PATTERN_MAX, SWR_INPUT_COVERAGE_MAX>(gBackendPixelRateTable);
-    InitBackendSampleFuncTable<SWR_MULTISAMPLE_TYPE_MAX, SWR_INPUT_COVERAGE_MAX>(gBackendSampleRateTable);
-    InitBackendOMFuncTable<SWR_NUM_RENDERTARGETS+1, SWR_MULTISAMPLE_TYPE_MAX>(gBackendOutputMergerTable);
-    InitBackendBarycentricsTables<(SWR_MULTISAMPLE_COUNT)(SWR_MULTISAMPLE_TYPE_MAX)>(gPixelBarycentricTable, gSampleBarycentricTable, gCentroidBarycentricTable);
+    InitBackendSingleFuncTable(gBackendSingleSample);
+    InitBackendPixelFuncTable(gBackendPixelRateTable);
+    InitBackendSampleFuncTable(gBackendSampleRateTable);
 
     gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
     gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ;