swr: [rasterizer core] Refactor/cleanup backends
authorGeorge Kyriazis <george.kyriazis@intel.com>
Tue, 18 Oct 2016 20:42:33 +0000 (15:42 -0500)
committerTim Rowley <timothy.o.rowley@intel.com>
Mon, 31 Oct 2016 14:06:15 +0000 (09:06 -0500)
Used for common code reuse and simplification

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
src/gallium/drivers/swr/rasterizer/core/backend.cpp
src/gallium/drivers/swr/rasterizer/core/backend.h

index f71c2b2d3452ef10d6afd590aef4debcaa5740a7..3b228925053a55e6152e3dbe28f43cfcf3119903 100644 (file)
@@ -451,134 +451,95 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
     AR_BEGIN(BESingleSampleBackend, pDC->drawId);
     AR_BEGIN(BESetup, pDC->drawId);
 
-    const API_STATE& state = GetApiState(pDC);
-    const SWR_RASTSTATE& rastState = state.rastState;
-    const SWR_PS_STATE *pPSState = &state.psState;
-    const SWR_BLEND_STATE *pBlendState = &state.blendState;
-    uint64_t coverageMask = work.coverageMask[0];
+    const API_STATE &state = GetApiState(pDC);
 
-    // broadcast scalars
     BarycentricCoeffs coeffs;
-    coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
-    coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
-    coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
-
-    coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
-    coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
-    coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
-
-    coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
-    coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
-    coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
+    SetupBarycentricCoeffs(&coeffs, work);
 
-    coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
+    uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
+    SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
 
-    coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
-    coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
-    coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
+    SWR_PS_CONTEXT psContext;
+    SetupPixelShaderContext<T>(&psContext, work);
 
-    uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
-    uint32_t NumRT = state.psState.numRenderTargets;
-    for(uint32_t rt = 0; rt < NumRT; ++rt)
-    {
-        pColorBase[rt] = renderBuffers.pColor[rt];
-    }
-    uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
     AR_END(BESetup, 1);
 
-    SWR_PS_CONTEXT psContext;
-    psContext.pAttribs = work.pAttribs;
-    psContext.pPerspAttribs = work.pPerspAttribs;
-    psContext.frontFace = work.triFlags.frontFacing;
-    psContext.primID = work.triFlags.primID;
-
-    // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
-    psContext.I = work.I;
-    psContext.J = work.J;
-    psContext.recipDet = work.recipDet;
-    psContext.pRecipW = work.pRecipW;
-    psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
-    psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
-    psContext.rasterizerSampleCount = T::MultisampleT::numSamples;
+    psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
+    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
 
-    for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
+    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
+
+    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
-        // UL pixel corner
-        psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
-        // pixel center
-        psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
+        psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
+        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
 
-        for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
+        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
+
+        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
 #if USE_8x2_TILE_BACKEND
             const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
 
 #endif
-            if(coverageMask & MASK)
+            simdmask coverageMask = work.coverageMask[0] & MASK;
+
+            if (coverageMask)
             {
-                psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
-                // pixel center
-                psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
+                if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
+                {
+                    static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
+
+                    const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer));
 
-                if(T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
+                    const float minz = state.depthBoundsState.depthBoundsTestMinValue;
+                    const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
+
+                    coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
+                }
+
+                if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
                 {
-                    const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : 
-                                                    &work.coverageMask[0];
-                    generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, pBlendState->sampleMask);
+                    const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
+
+                    generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
                 }
 
                 AR_BEGIN(BEBarycentric, pDC->drawId);
+
                 CalcPixelBarycentrics(coeffs, psContext);
 
-                // for 1x case, centroid is pixel center
-                psContext.vX.centroid = psContext.vX.center;
-                psContext.vY.centroid = psContext.vY.center;
-                psContext.vI.centroid = psContext.vI.center;
-                psContext.vJ.centroid = psContext.vJ.center;
-                psContext.vOneOverW.centroid = psContext.vOneOverW.center;
+                CalcCentroid<T, true>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
 
                 // interpolate and quantize z
                 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
                 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
+
                 AR_END(BEBarycentric, 1);
 
-                simdmask clipCoverageMask = coverageMask & MASK;
                 // interpolate user clip distance if available
-                if (rastState.clipDistanceMask)
-                {
-                    clipCoverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
-                        psContext.vI.center, psContext.vJ.center);
-                }
-
-                if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
+                if (state.rastState.clipDistanceMask)
                 {
-                    static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
-
-                    const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBase));
-
-                    const float minz = state.depthBoundsState.depthBoundsTestMinValue;
-                    const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
-
-                    clipCoverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
+                    coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
                 }
 
-                simdscalar vCoverageMask = vMask(clipCoverageMask);
+                simdscalar vCoverageMask = vMask(coverageMask);
                 simdscalar depthPassMask = vCoverageMask;
                 simdscalar stencilPassMask = vCoverageMask;
 
                 // Early-Z?
-                if(T::bCanEarlyZ)
+                if (T::bCanEarlyZ)
                 {
                     AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
                     depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
-                                                     psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
+                                                     psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
                     AR_END(BEEarlyDepthTest, 0);
 
                     // early-exit if no pixels passed depth or earlyZ is forced on
-                    if(pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
+                    if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
                     {
                         DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                                            pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
+                            pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
 
                         if (!_simd_movemask_ps(depthPassMask))
                         {
@@ -599,18 +560,18 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
 
                 // late-Z
-                if(!T::bCanEarlyZ)
+                if (!T::bCanEarlyZ)
                 {
                     AR_BEGIN(BELateDepthTest, pDC->drawId);
                     depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
-                                                        psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
+                                                        psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
                     AR_END(BELateDepthTest, 0);
 
-                    if(!_simd_movemask_ps(depthPassMask))
+                    if (!_simd_movemask_ps(depthPassMask))
                     {
                         // need to call depth/stencil write for stencil write
                         DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                                            pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
+                            pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
                         goto Endtile;
                     }
                 }
@@ -622,47 +583,56 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 // output merger
                 AR_BEGIN(BEOutputMerger, pDC->drawId);
 #if USE_8x2_TILE_BACKEND
-                OutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets, useAlternateOffset);
+                OutputMerger(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, useAlternateOffset);
 #else
-                OutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets);
+                OutputMerger(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
 #endif
 
                 // do final depth write after all pixel kills
-                if (!pPSState->forceEarlyZ)
+                if (!state.psState.forceEarlyZ)
                 {
                     DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                        pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
+                        pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
                 }
                 AR_END(BEOutputMerger, 0);
             }
 
 Endtile:
             AR_BEGIN(BEEndTile, pDC->drawId);
-            coverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+
+            work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
             if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
             {
                 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
             }
-            pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
-            pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
 
 #if USE_8x2_TILE_BACKEND
             if (useAlternateOffset)
             {
-                for (uint32_t rt = 0; rt < NumRT; ++rt)
+                for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
                 {
-                    pColorBase[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+                    pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
                 }
             }
 #else
-            for (uint32_t rt = 0; rt < NumRT; ++rt)
+            for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
             {
-                pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+                pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
             }
 #endif
+            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
+            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+
             AR_END(BEEndTile, 0);
+
+            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
+            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
         }
+
+        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
+        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
     }
+
     AR_END(BESingleSampleBackend, 0);
 }
 
@@ -674,111 +644,76 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
     AR_BEGIN(BESampleRateBackend, pDC->drawId);
     AR_BEGIN(BESetup, pDC->drawId);
 
-    const API_STATE& state = GetApiState(pDC);
-    const SWR_RASTSTATE& rastState = state.rastState;
-    const SWR_PS_STATE *pPSState = &state.psState;
-    const SWR_BLEND_STATE *pBlendState = &state.blendState;
+    const API_STATE &state = GetApiState(pDC);
 
-    // broadcast scalars
     BarycentricCoeffs coeffs;
-    coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
-    coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
-    coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
+    SetupBarycentricCoeffs(&coeffs, work);
 
-    coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
-    coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
-    coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
+    uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
+    SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
 
-    coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
-    coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
-    coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
-
-    coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
-
-    coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
-    coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
-    coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
+    SWR_PS_CONTEXT psContext;
+    SetupPixelShaderContext<T>(&psContext, work);
 
-    uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
-    uint32_t NumRT = state.psState.numRenderTargets;
-    for(uint32_t rt = 0; rt < NumRT; ++rt)
-    {
-        pColorBase[rt] = renderBuffers.pColor[rt];
-    }
-    uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
     AR_END(BESetup, 0);
 
-    SWR_PS_CONTEXT psContext;
-    psContext.pAttribs = work.pAttribs;
-    psContext.pPerspAttribs = work.pPerspAttribs;
-    psContext.pRecipW = work.pRecipW;
-    psContext.frontFace = work.triFlags.frontFacing;
-    psContext.primID = work.triFlags.primID;
-
-    // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
-    psContext.I = work.I;
-    psContext.J = work.J;
-    psContext.recipDet = work.recipDet;
-    psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
-    psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
-    psContext.rasterizerSampleCount = T::MultisampleT::numSamples;
+    psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
+    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
+
+    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
 
     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
-        // UL pixel corner
-        psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
-        // pixel center
-        psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
-        
+        psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
+        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
+
+        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
+
         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
 #if USE_8x2_TILE_BACKEND
             const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
 
 #endif
-            psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
-            // pixel center
-            psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
+            if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
+            {
+                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
+
+                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
+            }
 
             AR_BEGIN(BEBarycentric, pDC->drawId);
+
             CalcPixelBarycentrics(coeffs, psContext);
-            AR_END(BEBarycentric, 0);
 
-            if(T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
-            {
-                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask :
-                                                &work.coverageMask[0];
-                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, pBlendState->sampleMask);
-            }
+            CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
 
-            if(T::bCentroidPos)
-            {
-                ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
-                AR_BEGIN(BEBarycentric, pDC->drawId);
-                if(T::bIsStandardPattern)
-                {
-                    CalcCentroidPos<T>(psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
-                }
-                else
-                {
-                    psContext.vX.centroid = _simd_add_ps(psContext.vX.UL, _simd_set1_ps(0.5f));
-                    psContext.vY.centroid = _simd_add_ps(psContext.vY.UL, _simd_set1_ps(0.5f));
-                }
-                CalcCentroidBarycentrics(coeffs, psContext, psContext.vX.UL, psContext.vY.UL);
-                AR_END(BEBarycentric, 0);
-            }
-            else
-            {
-                psContext.vX.centroid = psContext.vX.sample;
-                psContext.vY.centroid = psContext.vY.sample;
-            }
+            AR_END(BEBarycentric, 0);
 
-            for(uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
+            for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
             {
                 simdmask coverageMask = work.coverageMask[sample] & MASK;
+
                 if (coverageMask)
                 {
+                    // offset depth/stencil buffers current sample
+                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
+                    uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
+
+                    if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
+                    {
+                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
+
+                        const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
+
+                        const float minz = state.depthBoundsState.depthBoundsTestMinValue;
+                        const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
+
+                        coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
+                    }
+
                     AR_BEGIN(BEBarycentric, pDC->drawId);
+
                     // calculate per sample positions
                     psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
                     psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
@@ -788,29 +723,13 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                     // interpolate and quantize z
                     psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
                     psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
+
                     AR_END(BEBarycentric, 0);
 
                     // interpolate user clip distance if available
-                    if (rastState.clipDistanceMask)
+                    if (state.rastState.clipDistanceMask)
                     {
-                        coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
-                            psContext.vI.sample, psContext.vJ.sample);
-                    }
-
-                    // offset depth/stencil buffers current sample
-                    uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
-                    uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
-
-                    if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
-                    {
-                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
-
-                        const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
-
-                        const float minz = state.depthBoundsState.depthBoundsTestMinValue;
-                        const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
-
-                        coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
+                        coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
                     }
 
                     simdscalar vCoverageMask = vMask(coverageMask);
@@ -826,7 +745,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                         AR_END(BEEarlyDepthTest, 0);
 
                         // early-exit if no samples passed depth or earlyZ is forced on.
-                        if (pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
+                        if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
                         {
                             DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                                 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
@@ -876,13 +795,13 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                     // output merger
                     AR_BEGIN(BEOutputMerger, pDC->drawId);
 #if USE_8x2_TILE_BACKEND
-                    OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets, useAlternateOffset);
+                    OutputMerger(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, useAlternateOffset);
 #else
-                    OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets);
+                    OutputMerger(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
 #endif
 
                     // do final depth write after all pixel kills
-                    if (!pPSState->forceEarlyZ)
+                    if (!state.psState.forceEarlyZ)
                     {
                         DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                             pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
@@ -894,31 +813,41 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
 
 Endtile:
             ATTR_UNUSED;
+
             AR_BEGIN(BEEndTile, pDC->drawId);
+
             if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
             {
                 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
             }
-            pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
-            pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
 
 #if USE_8x2_TILE_BACKEND
             if (useAlternateOffset)
             {
-                for (uint32_t rt = 0; rt < NumRT; ++rt)
+                for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
                 {
-                    pColorBase[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+                    pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
                 }
             }
 #else
-            for (uint32_t rt = 0; rt < NumRT; ++rt)
+            for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
             {
-                pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+                pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
             }
 #endif
+            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
+            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+
             AR_END(BEEndTile, 0);
+
+            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
+            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
         }
+
+        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
+        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
     }
+
     AR_END(BESampleRateBackend, 0);
 }
 
@@ -930,62 +859,33 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
     AR_BEGIN(BEPixelRateBackend, pDC->drawId);
     AR_BEGIN(BESetup, pDC->drawId);
 
-    const API_STATE& state = GetApiState(pDC);
-    const SWR_RASTSTATE& rastState = state.rastState;
-    const SWR_PS_STATE *pPSState = &state.psState;
-    const SWR_BLEND_STATE *pBlendState = &state.blendState;
+    const API_STATE &state = GetApiState(pDC);
 
-    // broadcast scalars
     BarycentricCoeffs coeffs;
-    coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
-    coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
-    coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
+    SetupBarycentricCoeffs(&coeffs, work);
 
-    coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
-    coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
-    coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
+    uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
+    SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
 
-    coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
-    coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
-    coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
+    SWR_PS_CONTEXT psContext;
+    SetupPixelShaderContext<T>(&psContext, work);
 
-    coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
+    AR_END(BESetup, 0);
 
-    coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
-    coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
-    coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
+    PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask);
 
-    uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
-    uint32_t NumRT = state.psState.numRenderTargets;
-    for(uint32_t rt = 0; rt < NumRT; ++rt)
-    {
-        pColorBase[rt] = renderBuffers.pColor[rt];
-    }
-    uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
-    AR_END(BESetup, 0);
+    psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
+    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
 
-    SWR_PS_CONTEXT psContext;
-    psContext.pAttribs = work.pAttribs;
-    psContext.pPerspAttribs = work.pPerspAttribs;
-    psContext.frontFace = work.triFlags.frontFacing;
-    psContext.primID = work.triFlags.primID;
-    psContext.pRecipW = work.pRecipW;
-    // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
-    psContext.I = work.I;
-    psContext.J = work.J;
-    psContext.recipDet = work.recipDet;
-    psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
-    psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
-    psContext.rasterizerSampleCount = T::MultisampleT::numSamples;
-
-    psContext.sampleIndex = 0;
-    
-    PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBase, pStencilBase, rastState.clipDistanceMask);
+    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
 
     for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
-        psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
-        psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
+        psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
+        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
+
+        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
+
         for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
 #if USE_8x2_TILE_BACKEND
@@ -996,48 +896,25 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
             if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
             activeLanes = vMask(work.anyCoveredSamples & MASK);
 
-            psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
-            // set pixel center positions
-            psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
-
-            AR_BEGIN(BEBarycentric, pDC->drawId);
-            CalcPixelBarycentrics(coeffs, psContext);
-            AR_END(BEBarycentric, 0);
-
             if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
             {
-                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask :
-                                                &work.coverageMask[0];
-                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, pBlendState->sampleMask);
+                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
+
+                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
             }
 
-            if(T::bCentroidPos)
-            {
-                ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
-                AR_BEGIN(BEBarycentric, pDC->drawId);
-                if(T::bIsStandardPattern)
-                {
-                    CalcCentroidPos<T>(psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
-                }
-                else
-                {
-                    psContext.vX.centroid = _simd_add_ps(psContext.vX.UL, _simd_set1_ps(0.5f));
-                    psContext.vY.centroid = _simd_add_ps(psContext.vY.UL, _simd_set1_ps(0.5f));
-                }
+            AR_BEGIN(BEBarycentric, pDC->drawId);
 
-                CalcCentroidBarycentrics(coeffs, psContext, psContext.vX.UL, psContext.vY.UL);
-                AR_END(BEBarycentric, 0);
-            }
-            else
-            {
-                psContext.vX.centroid = _simd_add_ps(psContext.vX.UL, _simd_set1_ps(0.5f));
-                psContext.vY.centroid = _simd_add_ps(psContext.vY.UL, _simd_set1_ps(0.5f));
-            }
+            CalcPixelBarycentrics(coeffs, psContext);
+
+            CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
+
+            AR_END(BEBarycentric, 0);
 
             if(T::bForcedSampleCount)
             {
                 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
-                const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si()));
+                const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
                 activeLanes = _simd_and_ps(activeLanes, vSampleMask);
             }
 
@@ -1051,7 +928,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
             // if we have no covered samples that passed depth at this point, go to next tile
             if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
 
-            if(pPSState->usesSourceDepth)
+            if(state.psState.usesSourceDepth)
             {
                 AR_BEGIN(BEBarycentric, pDC->drawId);
                 // interpolate and quantize z
@@ -1086,7 +963,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
 
             // output merger
             // loop over all samples, broadcasting the results of the PS to all passing pixels
-            for(uint32_t sample = 0; sample < GetNumOMSamples<T>(pBlendState->sampleCount); sample++)
+            for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
             {
                 AR_BEGIN(BEOutputMerger, pDC->drawId);
                 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
@@ -1110,15 +987,15 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
                 
                 // broadcast the results of the PS to all passing pixels
 #if USE_8x2_TILE_BACKEND
-                OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, coverageMask, depthMask, pPSState->numRenderTargets, useAlternateOffset);
+                OutputMerger(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, useAlternateOffset);
 #else
-                OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, coverageMask, depthMask, pPSState->numRenderTargets);
+                OutputMerger(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets);
 #endif
 
-                if(!pPSState->forceEarlyZ && !T::bForcedSampleCount)
+                if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
                 {
-                    uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
-                    uint8_t * pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
+                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
+                    uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
 
                     DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
                                       pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
@@ -1127,6 +1004,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
             }
 Endtile:
             AR_BEGIN(BEEndTile, pDC->drawId);
+
             for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
             {
                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
@@ -1137,26 +1015,34 @@ Endtile:
                 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
             }
             work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
-            pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
 
 #if USE_8x2_TILE_BACKEND
             if (useAlternateOffset)
             {
-                for (uint32_t rt = 0; rt < NumRT; ++rt)
+                for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
                 {
-                    pColorBase[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+                    pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
                 }
             }
 #else
-            for(uint32_t rt = 0; rt < NumRT; ++rt)
+            for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
             {
-                pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+                pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
             }
+            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
+            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
 #endif
+
             AR_END(BEEndTile, 0);
+
+            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
+            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
         }
+
+        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
+        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
     }
+
     AR_END(BEPixelRateBackend, 0);
 }
 // optimized backend flow with NULL PS
@@ -1170,50 +1056,60 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
     typedef SwrBackendTraits<sampleCountT, SWR_MSAA_STANDARD_PATTERN> T;
     AR_BEGIN(BESetup, pDC->drawId);
 
-    const API_STATE& state = GetApiState(pDC);
-    const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
+    const API_STATE &state = GetApiState(pDC);
 
-    // broadcast scalars
     BarycentricCoeffs coeffs;
-    coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
-    coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
-    coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
+    SetupBarycentricCoeffs(&coeffs, work);
 
-    coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
-    coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
-    coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
+    uint8_t *pDepthBuffer, *pStencilBuffer;
+    SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, renderBuffers);
 
-    coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
-    coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
-    coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
+    SWR_PS_CONTEXT psContext;
+    // skip SetupPixelShaderContext(&psContext, ...); // not needed here
 
-    coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
+    AR_END(BESetup, 0);
 
-    uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
+    simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
 
-    AR_END(BESetup, 0);
+    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
 
-    SWR_PS_CONTEXT psContext;
     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
-        // UL pixel corner
-        simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
+        simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
+
+        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
 
         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
-            // UL pixel corners
-            simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
-
             // iterate over active samples
             unsigned long sample = 0;
             uint32_t sampleMask = state.blendState.sampleMask;
             while (_BitScanForward(&sample, sampleMask))
             {
                 sampleMask &= ~(1 << sample);
+
                 simdmask coverageMask = work.coverageMask[sample] & MASK;
+
                 if (coverageMask)
                 {
+                    // offset depth/stencil buffers current sample
+                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
+                    uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
+
+                    if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
+                    {
+                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
+
+                        const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
+
+                        const float minz = state.depthBoundsState.depthBoundsTestMinValue;
+                        const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
+
+                        coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
+                    }
+
                     AR_BEGIN(BEBarycentric, pDC->drawId);
+
                     // calculate per sample positions
                     psContext.vX.sample = _simd_add_ps(vXSamplePosUL, T::MultisampleT::vX(sample));
                     psContext.vY.sample = _simd_add_ps(vYSamplePosUL, T::MultisampleT::vY(sample));
@@ -1227,26 +1123,9 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
                     AR_END(BEBarycentric, 0);
 
                     // interpolate user clip distance if available
-                    if (rastState.clipDistanceMask)
+                    if (state.rastState.clipDistanceMask)
                     {
-                        coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
-                            psContext.vI.sample, psContext.vJ.sample);
-                    }
-
-                    // offset depth/stencil buffers current sample
-                    uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
-                    uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
-
-                    if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
-                    {
-                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
-
-                        const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
-
-                        const float minz = state.depthBoundsState.depthBoundsTestMinValue;
-                        const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
-
-                        coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
+                        coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
                     }
 
                     simdscalar vCoverageMask = vMask(coverageMask);
@@ -1268,10 +1147,16 @@ Endtile:
                 ATTR_UNUSED;
                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
             }
-            pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
-            pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+
+            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
+            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+
+            vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx);
         }
+
+        vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy);
     }
+
     AR_END(BENullBackend, 0);
 }
 
index 53222eabccd03f11d7d0efc5d036a121b0187e8c..dc0be9063577418a10dcfe14c35abf0be56846cb 100644 (file)
@@ -438,13 +438,117 @@ INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
     }
 }
 
+inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE_DESC &work)
+{
+    // broadcast scalars
+
+    coeffs->vIa = _simd_broadcast_ss(&work.I[0]);
+    coeffs->vIb = _simd_broadcast_ss(&work.I[1]);
+    coeffs->vIc = _simd_broadcast_ss(&work.I[2]);
+
+    coeffs->vJa = _simd_broadcast_ss(&work.J[0]);
+    coeffs->vJb = _simd_broadcast_ss(&work.J[1]);
+    coeffs->vJc = _simd_broadcast_ss(&work.J[2]);
+
+    coeffs->vZa = _simd_broadcast_ss(&work.Z[0]);
+    coeffs->vZb = _simd_broadcast_ss(&work.Z[1]);
+    coeffs->vZc = _simd_broadcast_ss(&work.Z[2]);
+
+    coeffs->vRecipDet = _simd_broadcast_ss(&work.recipDet);
+
+    coeffs->vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
+    coeffs->vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
+    coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
+}
+
+inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorBufferCount, RenderOutputBuffers &renderBuffers)
+{
+    SWR_ASSERT(colorBufferCount <= SWR_NUM_RENDERTARGETS);
+
+    if (pColorBuffer)
+    {
+        for (uint32_t index = 0; index < colorBufferCount; index += 1)
+        {
+            pColorBuffer[index] = renderBuffers.pColor[index];
+        }
+    }
+
+    if (pDepthBuffer)
+    {
+        *pDepthBuffer = renderBuffers.pDepth;
+    }
+
+    if (pStencilBuffer)
+    {
+        *pStencilBuffer = renderBuffers.pStencil;;
+    }
+}
+
+template<typename T>
+void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_TRIANGLE_DESC &work)
+{
+    psContext->pAttribs = work.pAttribs;
+    psContext->pPerspAttribs = work.pPerspAttribs;
+    psContext->frontFace = work.triFlags.frontFacing;
+    psContext->primID = work.triFlags.primID;
+
+    // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
+    psContext->I = work.I;
+    psContext->J = work.J;
+
+    psContext->recipDet = work.recipDet;
+    psContext->pRecipW = work.pRecipW;
+    psContext->pSamplePosX = reinterpret_cast<const float *>(&T::MultisampleT::samplePosX);
+    psContext->pSamplePosY = reinterpret_cast<const float *>(&T::MultisampleT::samplePosY);
+    psContext->rasterizerSampleCount = T::MultisampleT::numSamples;
+    psContext->sampleIndex = 0;
+}
+
+template<typename T, bool IsSingleSample>
+void CalcCentroid(SWR_PS_CONTEXT *psContext, const BarycentricCoeffs &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask)
+{
+    if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different
+    {
+        // for 1x case, centroid is pixel center
+        psContext->vX.centroid = psContext->vX.center;
+        psContext->vY.centroid = psContext->vY.center;
+        psContext->vI.centroid = psContext->vI.center;
+        psContext->vJ.centroid = psContext->vJ.center;
+        psContext->vOneOverW.centroid = psContext->vOneOverW.center;
+    }
+    else
+    {
+        if (T::bCentroidPos)
+        {
+            ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
+            if (T::bIsStandardPattern)
+            {
+                // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'..
+                CalcCentroidPos<T>(*psContext, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL);
+            }
+            else
+            {
+                psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f));
+                psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f));
+            }
+
+            CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL);
+        }
+        else
+        {
+            psContext->vX.centroid = psContext->vX.sample;
+            psContext->vY.centroid = psContext->vY.sample;
+        }
+    }
+}
+
 template<typename T>
 struct PixelRateZTestLoop
 {
     PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState,
-                       uint8_t*& depthBase, uint8_t*& stencilBase, const uint8_t ClipDistanceMask) :
+                       uint8_t*& depthBuffer, uint8_t*& stencilBuffer, const uint8_t ClipDistanceMask) :
                        pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
-                       clipDistanceMask(ClipDistanceMask), pDepthBase(depthBase), pStencilBase(stencilBase) {};
+                       clipDistanceMask(ClipDistanceMask), pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer) {};
            
     INLINE
     uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext, 
@@ -465,7 +569,24 @@ struct PixelRateZTestLoop
                 continue;
             }
 
+            // offset depth/stencil buffers current sample
+            uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
+            uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
+
+            if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
+            {
+                static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
+
+                const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
+
+                const float minz = state.depthBoundsState.depthBoundsTestMinValue;
+                const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
+
+                vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(CalcDepthBoundsAcceptMask(z, minz, maxz)));
+            }
+
             AR_BEGIN(BEBarycentric, pDC->drawId);
+
             // calculate per sample positions
             psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
             psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
@@ -483,31 +604,16 @@ struct PixelRateZTestLoop
                 vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
                 vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
             }
+
             AR_END(BEBarycentric, 0);
 
             ///@todo: perspective correct vs non-perspective correct clipping?
             // if clip distances are enabled, we need to interpolate for each sample
             if(clipDistanceMask)
             {
-                uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer,
-                                                       psContext.vI.sample, psContext.vJ.sample);
-                vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
-            }
-
-            // offset depth/stencil buffers current sample
-            uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
-            uint8_t * pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
+                uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
 
-            if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
-            {
-                static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
-
-                const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
-
-                const float minz = state.depthBoundsState.depthBoundsTestMinValue;
-                const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
-
-                vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(CalcDepthBoundsAcceptMask(z, minz, maxz)));
+                vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
             }
 
             // ZTest for this sample
@@ -557,8 +663,8 @@ private:
     const API_STATE& state;
     const SWR_PS_STATE& psState;
     const uint8_t clipDistanceMask;
-    uint8_t*& pDepthBase;
-    uint8_t*& pStencilBase;
+    uint8_t*& pDepthBuffer;
+    uint8_t*& pStencilBuffer;
 };
 
 INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)