swr: [rasterizer core] backend refactor
authorTim Rowley <timothy.o.rowley@intel.com>
Thu, 14 Apr 2016 23:03:16 +0000 (17:03 -0600)
committerTim Rowley <timothy.o.rowley@intel.com>
Wed, 27 Apr 2016 15:40:44 +0000 (10:40 -0500)
Lump all template args into a bundle of traits, and add some
functionality to the MSAA traits.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
src/gallium/drivers/swr/rasterizer/core/api.cpp
src/gallium/drivers/swr/rasterizer/core/backend.cpp
src/gallium/drivers/swr/rasterizer/core/backend.h
src/gallium/drivers/swr/rasterizer/core/multisample.cpp
src/gallium/drivers/swr/rasterizer/core/multisample.h

index e950e92c874e099885ab7625bb4f7db2faf8e780..3b02d197111366f6462dba50f3e4492455efaa7f 100644 (file)
@@ -763,7 +763,6 @@ extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_IN
 extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS + 1][SWR_MULTISAMPLE_TYPE_MAX];
 extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2];
 extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2];
-extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2];
 void SetupPipeline(DRAW_CONTEXT *pDC)
 {
     DRAW_STATE* pState = pDC->pState;
@@ -827,9 +826,6 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
 
         bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_SAMPLE_MASK) > 0 ? 1 : 0;
         backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[bBarycentrics];
-
-        bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0 ? 1 : 0;
-        backendFuncs.pfnCalcCentroidBarycentrics = gCentroidBarycentricTable[rastState.sampleCount][bBarycentrics][rastState.samplePattern][forcedSampleCount];
     }
     
     PFN_PROCESS_PRIMS pfnBinner;
index a2212ba8aa4c6df428834e8293d88884cbfcd779..310a7edcde1abfc2bda64c9d6fc3eb9a9c862730 100644 (file)
@@ -459,10 +459,10 @@ simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscala
     return _simd_movemask_ps(vClipMask);
 }
 
-template<bool perspMask>
+template<bool bGenerateBarycentrics>
 INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
 {
-    if(perspMask)
+    if(bGenerateBarycentrics)
     {
         // evaluate I,J
         psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
@@ -475,10 +475,10 @@ INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEX
     }
 }
 
-template<bool perspMask>
+template<bool bGenerateBarycentrics>
 INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
 {
-    if(perspMask)
+    if(bGenerateBarycentrics)
     {
         // evaluate I,J
         psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
@@ -502,13 +502,12 @@ INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTE
 //     evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the 
 //     SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-template<SWR_MULTISAMPLE_COUNT sampleCount, bool bForcedSampleCount>
+template<typename T>
 INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const coverageMask, const uint32_t sampleMask,
                             const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
 {
     uint32_t inputMask[KNOB_SIMD_WIDTH];
-
-    generateInputCoverage<sampleCount, 1, bForcedSampleCount>(coverageMask, inputMask, sampleMask);
+    generateInputCoverage<T>(coverageMask, inputMask, sampleMask);
 
     // Case (2) - partially covered pixel
 
@@ -524,29 +523,29 @@ INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const cov
     (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
 
     // look up and set the sample offsets from UL pixel corner for first covered sample 
-    __m256 vXSample = _mm256_set_ps(MultisampleTraits<sampleCount>::X(sampleNum[7]),
-                                    MultisampleTraits<sampleCount>::X(sampleNum[6]),
-                                    MultisampleTraits<sampleCount>::X(sampleNum[5]),
-                                    MultisampleTraits<sampleCount>::X(sampleNum[4]),
-                                    MultisampleTraits<sampleCount>::X(sampleNum[3]),
-                                    MultisampleTraits<sampleCount>::X(sampleNum[2]),
-                                    MultisampleTraits<sampleCount>::X(sampleNum[1]),
-                                    MultisampleTraits<sampleCount>::X(sampleNum[0]));
-
-    __m256 vYSample = _mm256_set_ps(MultisampleTraits<sampleCount>::Y(sampleNum[7]),
-                                    MultisampleTraits<sampleCount>::Y(sampleNum[6]),
-                                    MultisampleTraits<sampleCount>::Y(sampleNum[5]),
-                                    MultisampleTraits<sampleCount>::Y(sampleNum[4]),
-                                    MultisampleTraits<sampleCount>::Y(sampleNum[3]),
-                                    MultisampleTraits<sampleCount>::Y(sampleNum[2]),
-                                    MultisampleTraits<sampleCount>::Y(sampleNum[1]),
-                                    MultisampleTraits<sampleCount>::Y(sampleNum[0]));
+    __m256 vXSample = _mm256_set_ps(T::MultisampleT::X(sampleNum[7]),
+                                    T::MultisampleT::X(sampleNum[6]),
+                                    T::MultisampleT::X(sampleNum[5]),
+                                    T::MultisampleT::X(sampleNum[4]),
+                                    T::MultisampleT::X(sampleNum[3]),
+                                    T::MultisampleT::X(sampleNum[2]),
+                                    T::MultisampleT::X(sampleNum[1]),
+                                    T::MultisampleT::X(sampleNum[0]));
+
+    __m256 vYSample = _mm256_set_ps(T::MultisampleT::Y(sampleNum[7]),
+                                    T::MultisampleT::Y(sampleNum[6]),
+                                    T::MultisampleT::Y(sampleNum[5]),
+                                    T::MultisampleT::Y(sampleNum[4]),
+                                    T::MultisampleT::Y(sampleNum[3]),
+                                    T::MultisampleT::Y(sampleNum[2]),
+                                    T::MultisampleT::Y(sampleNum[1]),
+                                    T::MultisampleT::Y(sampleNum[0]));
     // add sample offset to UL pixel corner
     vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
     vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
 
     // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
-    static const __m256i vFullyCoveredMask = MultisampleTraits<sampleCount>::FullSampleMask();
+    static const __m256i vFullyCoveredMask = T::MultisampleT::FullSampleMask();
     __m256i vInputCoveragei =  _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
     __m256i vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
 
@@ -570,46 +569,38 @@ INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const cov
 
     __m256i vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
 
-    vXSample = _simd_set1_ps(MultisampleTraits<sampleCount>::X(firstCoveredSampleMaskSample));
-    vYSample = _simd_set1_ps(MultisampleTraits<sampleCount>::Y(firstCoveredSampleMaskSample));
+    vXSample = _simd_set1_ps(T::MultisampleT::X(firstCoveredSampleMaskSample));
+    vYSample = _simd_set1_ps(T::MultisampleT::Y(firstCoveredSampleMaskSample));
 
     // blend in case 3a pixel locations
     psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
     psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
 }
 
-template<uint32_t sampleCount, uint32_t persp, uint32_t standardPattern, uint32_t forcedMultisampleCount>
+template<typename T>
 INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
                                      const uint64_t *const coverageMask, const uint32_t sampleMask,
                                      const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
 {
-    static const bool bPersp = (bool)persp;
-    static const bool bIsStandardPattern = (bool)standardPattern;
-    static const bool bForcedMultisampleCount = (bool)forcedMultisampleCount;
-
-    // calculate centroid positions
-    if(bPersp)
+    if(T::bIsStandardPattern)
     {
-        if(bIsStandardPattern)
-        {
-            ///@ todo: don't need to generate input coverage 2x if input coverage and centroid
-            CalcCentroidPos<(SWR_MULTISAMPLE_COUNT)sampleCount, bForcedMultisampleCount>(psContext, coverageMask, sampleMask, vXSamplePosUL, vYSamplePosUL);
-        }
-        else
-        {
-            static const __m256 pixelCenter = _simd_set1_ps(0.5f);
-            psContext.vX.centroid = _simd_add_ps(vXSamplePosUL, pixelCenter);
-            psContext.vY.centroid = _simd_add_ps(vYSamplePosUL, pixelCenter);
-        }
-        // evaluate I,J
-        psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
-        psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
-        psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
-        psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
-
-        // interpolate 1/w
-        psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
+        ///@ todo: don't need to generate input coverage 2x if input coverage and centroid
+        CalcCentroidPos<T>(psContext, coverageMask, sampleMask, vXSamplePosUL, vYSamplePosUL);
     }
+    else
+    {
+        static const __m256 pixelCenter = _simd_set1_ps(0.5f);
+        psContext.vX.centroid = _simd_add_ps(vXSamplePosUL, pixelCenter);
+        psContext.vY.centroid = _simd_add_ps(vYSamplePosUL, pixelCenter);
+    }
+    // evaluate I,J
+    psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
+    psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
+    psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
+    psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
+
+    // interpolate 1/w
+    psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
 }
 
 template<uint32_t NumRT, uint32_t sampleCountT>
@@ -680,13 +671,10 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND
     }
 }
 
-template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount>
+template<typename T>
 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
 {
     RDTSC_START(BESetup);
-    // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
-    static const bool bInputCoverage = (bool)inputCoverage;
-    static const bool bCentroidPos = (bool)centroidPos;
 
     SWR_CONTEXT *pContext = pDC->pContext;
     const API_STATE& state = GetApiState(pDC);
@@ -736,8 +724,8 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
     psContext.J = work.J;
     psContext.recipDet = work.recipDet;
     psContext.pRecipW = work.pRecipW;
-    psContext.pSamplePosX = (const float*)&MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosX;
-    psContext.pSamplePosY = (const float*)&MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosY;
+    psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
+    psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
 
     for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
@@ -748,9 +736,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
 
         for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
-            if(bInputCoverage)
+            if(T::bInputCoverage)
             {
-                generateInputCoverage<SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, false>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
+                generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
             }
 
             if(coverageMask & MASK)
@@ -762,7 +750,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
 
                 backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
 
-                if(bCentroidPos)
+                if(T::bCentroidPos)
                 {
                     // for 1x case, centroid is pixel center
                     psContext.vX.centroid = psContext.vX.center;
@@ -873,14 +861,9 @@ Endtile:
     }
 }
 
-template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount>
+template<typename T>
 void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
 {
-    // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
-    static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
-    static const bool bInputCoverage = (bool)inputCoverage;
-    static const bool bCentroidPos = (bool)centroidPos;
-
     RDTSC_START(BESetup);
 
     SWR_CONTEXT *pContext = pDC->pContext;
@@ -930,9 +913,9 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
     psContext.I = work.I;
     psContext.J = work.J;
     psContext.recipDet = work.recipDet;
-    psContext.pSamplePosX = (const float*)&MultisampleTraits<sampleCount>::samplePosX;
-    psContext.pSamplePosY = (const float*)&MultisampleTraits<sampleCount>::samplePosY;
-    const uint32_t numSamples = MultisampleTraits<sampleCount>::numSamples;
+    psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
+    psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
+    const uint32_t numSamples = T::MultisampleT::numSamples;
 
     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
@@ -951,16 +934,16 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
             backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
             RDTSC_STOP(BEBarycentric, 0, 0);
 
-            if(bInputCoverage)
+            if(T::bInputCoverage)
             {
-                generateInputCoverage<sampleCount, SWR_MSAA_STANDARD_PATTERN, false>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
+                generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
             }
 
-            if(bCentroidPos)
+            if(T::bCentroidPos)
             {
                 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
                 RDTSC_START(BEBarycentric);
-                backendFuncs.pfnCalcCentroidBarycentrics(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
+                CalcCentroidBarycentrics<T>(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
                 RDTSC_STOP(BEBarycentric, 0, 0);
             }
 
@@ -971,8 +954,8 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                     RDTSC_START(BEBarycentric);
 
                     // calculate per sample positions
-                    psContext.vX.sample = _simd_add_ps(psContext.vX.UL, MultisampleTraits<sampleCount>::vX(sample));
-                    psContext.vY.sample = _simd_add_ps(psContext.vY.UL, MultisampleTraits<sampleCount>::vY(sample));
+                    psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
+                    psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
                     
                     simdmask coverageMask = work.coverageMask[sample] & MASK;
                     simdscalar vCoverageMask = vMask(coverageMask);
@@ -996,8 +979,8 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                     simdscalar stencilPassMask = vCoverageMask;
 
                     // offset depth/stencil buffers current sample
-                    uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
-                    uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
+                    uint8_t *pDepthSample = pDepthBase + T::MultisampleT::RasterTileDepthOffset(sample);
+                    uint8_t *pStencilSample = pStencilBase + T::MultisampleT::RasterTileStencilOffset(sample);
 
                     // Early-Z?
                     if (CanEarlyZ(pPSState))
@@ -1032,7 +1015,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
 
                     vCoverageMask = _simd_castsi_ps(psContext.activeMask);
 
-                    //// late-Z
+                    // late-Z
                     if (!CanEarlyZ(pPSState))
                     {
                         RDTSC_START(BELateDepthTest);
@@ -1083,16 +1066,9 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
     }
 }
 
-template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount>
+template<typename T>
 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
 {
-    // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
-    static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
-    static const bool bIsStandardPattern = (bool)samplePattern;
-    static const bool bInputCoverage = (bool)inputCoverage;
-    static const bool bCentroidPos = (bool)centroidPos;
-    static const bool bForcedSampleCount = (bool)forcedSampleCount;
-
     RDTSC_START(BESetup);
 
     SWR_CONTEXT *pContext = pDC->pContext;
@@ -1141,35 +1117,25 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
     psContext.I = work.I;
     psContext.J = work.J;
     psContext.recipDet = work.recipDet;
-    psContext.pSamplePosX = (const float*)&MultisampleTraits<sampleCount>::samplePosX;
-    psContext.pSamplePosY = (const float*)&MultisampleTraits<sampleCount>::samplePosY;
+    psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
+    psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
     psContext.sampleIndex = 0;
 
-    uint32_t numCoverageSamples;
-    if(bIsStandardPattern)
-    {
-        numCoverageSamples = MultisampleTraits<sampleCount>::numSamples;
-    }
-    else
-    {
-        numCoverageSamples = 1;
-    }
-
     uint32_t numOMSamples;
     // RT has to be single sample if we're in forcedMSAA mode
-    if(bForcedSampleCount && (sampleCount > SWR_MULTISAMPLE_1X))
+    if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
     {
         numOMSamples = 1;
     }
     // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
-    else if(bForcedSampleCount && (sampleCount == SWR_MULTISAMPLE_1X))
+    else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
     {
         numOMSamples = GetNumSamples(pBlendState->sampleCount);
     }
     // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
     else
     {
-        numOMSamples = MultisampleTraits<sampleCount>::numSamples;
+        numOMSamples = T::MultisampleT::numSamples;
     }
     
     for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
@@ -1178,21 +1144,21 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
         psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
         for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
-            simdscalar vZ[MultisampleTraits<sampleCount>::numSamples]{ 0 };
+            simdscalar vZ[T::MultisampleT::numSamples]{ 0 };
             psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
             // set pixel center positions
             psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
 
-            if (bInputCoverage)
+            if (T::bInputCoverage)
             {
-                generateInputCoverage<sampleCount, bIsStandardPattern, bForcedSampleCount>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
+                generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
             }
 
-            if(bCentroidPos)
+            if(T::bCentroidPos)
             {
                 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
                 RDTSC_START(BEBarycentric);
-                backendFuncs.pfnCalcCentroidBarycentrics(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
+                CalcCentroidBarycentrics<T>(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
                 RDTSC_STOP(BEBarycentric, 0, 0);
             }
 
@@ -1219,12 +1185,12 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
             }
 
             // need to declare enough space for all samples
-            simdscalar vCoverageMask[MultisampleTraits<sampleCount>::numSamples];
-            simdscalar depthPassMask[MultisampleTraits<sampleCount>::numSamples]; 
-            simdscalar stencilPassMask[MultisampleTraits<sampleCount>::numSamples];
+            simdscalar vCoverageMask[T::MultisampleT::numSamples];
+            simdscalar depthPassMask[T::MultisampleT::numSamples]; 
+            simdscalar stencilPassMask[T::MultisampleT::numSamples];
             simdscalar anyDepthSamplePassed = _simd_setzero_ps();
             simdscalar anyStencilSamplePassed = _simd_setzero_ps();
-            for(uint32_t sample = 0; sample < numCoverageSamples; sample++)
+            for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
             {
                 vCoverageMask[sample] = vMask(work.coverageMask[sample] & MASK);
 
@@ -1237,7 +1203,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
                     continue;
                 }
 
-                if(bForcedSampleCount)
+                if(T::bForcedSampleCount)
                 {
                     // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
                     const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si()));
@@ -1252,11 +1218,11 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
                 if(!pPSState->writesODepth || rastState.clipDistanceMask)
                 {
                     RDTSC_START(BEBarycentric);
-                    if(bIsStandardPattern)
+                    if(T::bIsStandardPattern)
                     {
                         // calculate per sample positions
-                        psContext.vX.sample = _simd_add_ps(psContext.vX.UL, MultisampleTraits<sampleCount>::vX(sample));
-                        psContext.vY.sample = _simd_add_ps(psContext.vY.UL, MultisampleTraits<sampleCount>::vY(sample));
+                        psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
+                        psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
                     }
                     else
                     {
@@ -1291,8 +1257,8 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
                 }
 
                 // offset depth/stencil buffers current sample
-                uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
-                uint8_t * pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
+                uint8_t *pDepthSample = pDepthBase + T::MultisampleT::RasterTileDepthOffset(sample);
+                uint8_t * pStencilSample = pStencilBase + T::MultisampleT::RasterTileStencilOffset(sample);
 
                 // ZTest for this sample
                 RDTSC_START(BEEarlyDepthTest);
@@ -1332,8 +1298,8 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
             // loop over all samples, broadcasting the results of the PS to all passing pixels
             for(uint32_t sample = 0; sample < numOMSamples; sample++)
             {
-                uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
-                uint8_t * pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
+                uint8_t *pDepthSample = pDepthBase + T::MultisampleT::RasterTileDepthOffset(sample);
+                uint8_t * pStencilSample = pStencilBase + T::MultisampleT::RasterTileStencilOffset(sample);
 
                 // output merger
                 RDTSC_START(BEOutputMerger);
@@ -1346,12 +1312,12 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
 
                 // forcedSampleCount outputs to any pixels with covered samples not masked off by SampleMask
                 // depth test is disabled, so just set the z val to 0.
-                if(bForcedSampleCount)
+                if(T::bForcedSampleCount)
                 {
                     coverageMaskSample = depthMaskSample = anyDepthSamplePassed;
                     vInterpolatedZ = _simd_setzero_ps();
                 }
-                else if(bIsStandardPattern)
+                else if(T::bIsStandardPattern)
                 {
                     if(!_simd_movemask_ps(depthPassMask[sample]))
                     {
@@ -1393,7 +1359,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
 
 Endtile:
             RDTSC_START(BEEndTile);
-            for(uint32_t sample = 0; sample < numCoverageSamples; sample++)
+            for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
             {
                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
             }
@@ -1413,10 +1379,10 @@ Endtile:
 template<uint32_t sampleCountT>
 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
 {
+    ///@todo: handle center multisample pattern
+    typedef SwrBackendTraits<sampleCountT, SWR_MSAA_STANDARD_PATTERN> T;
     RDTSC_START(BESetup);
 
-    static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
-
     SWR_CONTEXT *pContext = pDC->pContext;
     const API_STATE& state = GetApiState(pDC);
     const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
@@ -1464,8 +1430,8 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
                 {
                     RDTSC_START(BEBarycentric);
                     // calculate per sample positions
-                    psContext.vX.sample = _simd_add_ps(vXSamplePosUL, MultisampleTraits<sampleCount>::vX(sample));
-                    psContext.vY.sample = _simd_add_ps(vYSamplePosUL, MultisampleTraits<sampleCount>::vY(sample));
+                    psContext.vX.sample = _simd_add_ps(vXSamplePosUL, T::MultisampleT::vX(sample));
+                    psContext.vY.sample = _simd_add_ps(vYSamplePosUL, T::MultisampleT::vY(sample));
 
                     backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
 
@@ -1486,8 +1452,8 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
                     simdscalar stencilPassMask = vCoverageMask;
 
                     // offset depth/stencil buffers current sample
-                    uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
-                    uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
+                    uint8_t *pDepthSample = pDepthBase + T::MultisampleT::RasterTileDepthOffset(sample);
+                    uint8_t *pStencilSample = pStencilBase + T::MultisampleT::RasterTileStencilOffset(sample);
 
                     RDTSC_START(BEEarlyDepthTest);
                     simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
@@ -1526,7 +1492,6 @@ PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COV
 PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX] = {};
 PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2] = {};
 PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2] = {};
-PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2] = {};
 
 // Recursive template used to auto-nest conditionals.  Converts dynamic enum function
 // arguments to static template arguments.
@@ -1576,34 +1541,18 @@ struct OMChooser
 // Recursive template used to auto-nest conditionals.  Converts dynamic enum function
 // arguments to static template arguments.
 template <uint32_t... ArgsT>
-struct BECentroidBarycentricChooser
+struct BEChooser
 {
-
     // Last Arg Terminator
-    template <typename... TArgsT>
-    static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(uint32_t tArg)
-    {
-        if(tArg > 0)
-        {
-            return CalcCentroidBarycentrics<ArgsT..., 1>;
-        }
-
-        return CalcCentroidBarycentrics<ArgsT..., 0>;
-    }
-
-    // Recursively parse args
-    template <typename... TArgsT>
-    static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
+    static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
     {
         switch(tArg)
         {
-        case SWR_MULTISAMPLE_1X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_2X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_4X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_8X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_16X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
+        case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
+        case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<SwrBackendTraits<ArgsT...>>; break;
+        case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
         default:
-            SWR_ASSERT(0 && "Invalid sample count\n");
+            SWR_ASSERT(0 && "Invalid backend func\n");
             return nullptr;
             break;
         }
@@ -1611,38 +1560,19 @@ struct BECentroidBarycentricChooser
 
     // Recursively parse args
     template <typename... TArgsT>
-    static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(uint32_t tArg, TArgsT... remainingArgs)
-    {
-        if(tArg > 0)
-        {
-            return BECentroidBarycentricChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
-        }
-
-        return BECentroidBarycentricChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
-    }
-};
-
-// Recursive template used to auto-nest conditionals.  Converts dynamic enum function
-// arguments to static template arguments.
-template <uint32_t... ArgsT>
-struct BEChooser
-{
-    // Last Arg Terminator
-    static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
+    static PFN_BACKEND_FUNC GetFunc(SWR_MSAA_SAMPLE_PATTERN tArg, TArgsT... remainingArgs)
     {
         switch(tArg)
         {
-        case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<ArgsT...>; break;
-        case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<ArgsT...>; break;
-        case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<ArgsT...>; break;
+        case SWR_MSAA_CENTER_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_CENTER_PATTERN>::GetFunc(remainingArgs...); break;
+        case SWR_MSAA_STANDARD_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...); break;
         default:
-            SWR_ASSERT(0 && "Invalid backend func\n");
-            return nullptr;
-            break;
+        SWR_ASSERT(0 && "Invalid sample pattern\n");
+        return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...);
+        break;
         }
     }
 
-
     // Recursively parse args
     template <typename... TArgsT>
     static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
@@ -1655,22 +1585,22 @@ struct BEChooser
         case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
         case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
         default:
-            SWR_ASSERT(0 && "Invalid sample count\n");
-            return nullptr;
-            break;
+        SWR_ASSERT(0 && "Invalid sample count\n");
+        return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
+        break;
         }
     }
 
     // Recursively parse args
     template <typename... TArgsT>
-    static PFN_BACKEND_FUNC GetFunc(uint32_t tArg, TArgsT... remainingArgs)
+    static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
     {
-        if(tArg > 0)
+        if(tArg == true)
         {
             return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
         }
 
-         return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
+        return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
     }
 };
 
@@ -1689,37 +1619,21 @@ void InitBackendOMFuncTable(PFN_OUTPUT_MERGER (&table)[numRenderTargets][numSamp
 
 template <SWR_MULTISAMPLE_COUNT numSampleRates>
 void InitBackendBarycentricsTables(PFN_CALC_PIXEL_BARYCENTRICS (&pixelTable)[2], 
-                                   PFN_CALC_SAMPLE_BARYCENTRICS (&sampleTable)[2],
-                                   PFN_CALC_CENTROID_BARYCENTRICS (&centroidTable)[numSampleRates][2][2][2])
+                                   PFN_CALC_SAMPLE_BARYCENTRICS (&sampleTable)[2])
 {
     pixelTable[0] = CalcPixelBarycentrics<0>;
     pixelTable[1] = CalcPixelBarycentrics<1>;
 
     sampleTable[0] = CalcSampleBarycentrics<0>;
     sampleTable[1] = CalcSampleBarycentrics<1>;
-
-    for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
-    {
-        for(uint32_t baryMask = 0; baryMask < 2; baryMask++)
-        {
-            for(uint32_t patternNum = 0; patternNum < 2; patternNum++)
-            {
-                for(uint32_t forcedSampleEnable = 0; forcedSampleEnable < 2; forcedSampleEnable++)
-                {
-                    centroidTable[sampleCount][baryMask][patternNum][forcedSampleEnable]=
-                        BECentroidBarycentricChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, baryMask, patternNum, forcedSampleEnable);
-                }
-            }
-        }
-    }
 }
 
 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[2][2])
 {
-    gBackendSingleSample[0][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NONE, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
-    gBackendSingleSample[0][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NONE, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
-    gBackendSingleSample[1][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NORMAL, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
-    gBackendSingleSample[1][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NORMAL, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
+    gBackendSingleSample[0][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, false, false, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
+    gBackendSingleSample[0][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, false, true, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
+    gBackendSingleSample[1][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, true, false, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
+    gBackendSingleSample[1][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, true, true, false, false,(SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
 }
 
 template <SWR_MULTISAMPLE_COUNT numSampleRates, SWR_MSAA_SAMPLE_PATTERN numSamplePatterns, SWR_INPUT_COVERAGE numCoverageModes>
@@ -1734,9 +1648,11 @@ void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numSamp
                 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
                 {
                     table[sampleCount][samplePattern][inputCoverage][isCentroid][0] =
-                        BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, samplePattern, inputCoverage, isCentroid, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_PIXEL_RATE);
+                        BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), (isCentroid > 0),
+                                                     false, false, SWR_BACKEND_MSAA_PIXEL_RATE);
                     table[sampleCount][samplePattern][inputCoverage][isCentroid][1] =
-                        BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, samplePattern, inputCoverage, isCentroid, 1, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_PIXEL_RATE);
+                        BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), (isCentroid > 0),
+                                             true, false, SWR_BACKEND_MSAA_PIXEL_RATE);
                 }
             }
         }
@@ -1751,9 +1667,9 @@ void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numCov
         for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < numCoverageModes; inputCoverage++)
         {
             table[sampleCount][inputCoverage][0] =
-                BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, inputCoverage, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
+                BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), false, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
             table[sampleCount][inputCoverage][1] =
-                BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, inputCoverage, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
+                BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), true, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
         }
     }
 }
@@ -1764,7 +1680,7 @@ void InitBackendFuncTables()
     InitBackendPixelFuncTable<(SWR_MULTISAMPLE_COUNT)SWR_MULTISAMPLE_TYPE_MAX, SWR_MSAA_SAMPLE_PATTERN_MAX, SWR_INPUT_COVERAGE_MAX>(gBackendPixelRateTable);
     InitBackendSampleFuncTable<SWR_MULTISAMPLE_TYPE_MAX, SWR_INPUT_COVERAGE_MAX>(gBackendSampleRateTable);
     InitBackendOMFuncTable<SWR_NUM_RENDERTARGETS+1, SWR_MULTISAMPLE_TYPE_MAX>(gBackendOutputMergerTable);
-    InitBackendBarycentricsTables<(SWR_MULTISAMPLE_COUNT)(SWR_MULTISAMPLE_TYPE_MAX)>(gPixelBarycentricTable, gSampleBarycentricTable, gCentroidBarycentricTable);
+    InitBackendBarycentricsTables<(SWR_MULTISAMPLE_COUNT)(SWR_MULTISAMPLE_TYPE_MAX)>(gPixelBarycentricTable, gSampleBarycentricTable);
 
     gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
     gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ;
index d0626b997af9c9274277d60e3ab5c1211a537ec1..022e60a9413d8378a282ff7e997e4834a9fc36eb 100644 (file)
@@ -60,7 +60,7 @@ extern const __m256 vULOffsetsY;
 #define MASK 0xff
 #endif
 
-template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
+template<typename T>
 INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
 {
 
@@ -69,28 +69,28 @@ INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (
 
     __m256i mask[2];
     __m256i sampleCoverage[2];
-    if(bIsStandardPattern)
+    if(T::bIsStandardPattern)
     {
         __m256i src = _mm256_set1_epi32(0);
         __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
 
-        if(MultisampleTraits<sampleCountT>::numSamples == 1)
+        if(T::MultisampleT::numSamples == 1)
         {
             mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
         }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 2)
+        else if(T::MultisampleT::numSamples == 2)
         {
             mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
         }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 4)
+        else if(T::MultisampleT::numSamples == 4)
         {
             mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
         }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 8)
+        else if(T::MultisampleT::numSamples == 8)
         {
             mask[0] = _mm256_set1_epi32(-1);
         }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 16)
+        else if(T::MultisampleT::numSamples == 16)
         {
             mask[0] = _mm256_set1_epi32(-1);
             mask[1] = _mm256_set1_epi32(-1);
@@ -99,7 +99,7 @@ INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (
 
         // gather coverage for samples 0-7
         sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
-        if(MultisampleTraits<sampleCountT>::numSamples > 8)
+        if(T::MultisampleT::numSamples > 8)
         {
             // gather coverage for samples 8-15
             sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
@@ -109,23 +109,23 @@ INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (
     {
         // center coverage is the same for all samples; just broadcast to the sample slots
         uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
-        if(MultisampleTraits<sampleCountT>::numSamples == 1)
+        if(T::MultisampleT::numSamples == 1)
         {
             sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
         }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 2)
+        else if(T::MultisampleT::numSamples == 2)
         {
             sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
         }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 4)
+        else if(T::MultisampleT::numSamples == 4)
         {
             sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
         }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 8)
+        else if(T::MultisampleT::numSamples == 8)
         {
             sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
         }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 16)
+        else if(T::MultisampleT::numSamples == 16)
         {
             sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
             sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
@@ -138,7 +138,7 @@ INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (
     __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
 
     __m256i packedCoverage1;
-    if(MultisampleTraits<sampleCountT>::numSamples > 8)
+    if(T::MultisampleT::numSamples > 8)
     {
         // pull out the the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
         packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
@@ -151,7 +151,7 @@ INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (
     packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
 
     __m256i packedSampleCoverage;
-    if(MultisampleTraits<sampleCountT>::numSamples > 8)
+    if(T::MultisampleT::numSamples > 8)
     {
         // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
         hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
@@ -170,7 +170,7 @@ INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (
     packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
 
     __m256i packedSampleCoverage;
-    if(MultisampleTraits<sampleCountT>::numSamples > 8)
+    if(T::MultisampleT::numSamples > 8)
     {
         permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
         // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
@@ -190,7 +190,7 @@ INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (
         // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
         inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
 
-        if(!bForcedSampleCount)
+        if(!T::bForcedSampleCount)
         {
             // input coverage has to be anded with sample mask if MSAA isn't forced on
             inputMask[i] &= sampleMask;
@@ -201,10 +201,22 @@ INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (
     }
 }
 
-template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
+template<typename T>
 INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
 {
     uint32_t inputMask[KNOB_SIMD_WIDTH]; 
-    generateInputCoverage<sampleCountT, bIsStandardPattern, bForcedSampleCount>(coverageMask, inputMask, sampleMask);
+    generateInputCoverage<T>(coverageMask, inputMask, sampleMask);
     inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
 }
+
+template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t samplePattern = SWR_MSAA_STANDARD_PATTERN,
+         uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t odepth = 0>
+struct SwrBackendTraits
+{
+    static const bool bIsStandardPattern = (samplePattern == SWR_MSAA_STANDARD_PATTERN);
+    static const bool bInputCoverage = (coverage == 1);
+    static const bool bCentroidPos = (centroid == 1);
+    static const bool bForcedSampleCount = (forced == 1);
+    static const bool bWritesODepth = (odepth == 1);
+    typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, (bIsStandardPattern) ? SWR_MSAA_STANDARD_PATTERN : SWR_MSAA_CENTER_PATTERN> MultisampleT;
+};
\ No newline at end of file
index d51a546b06347d464a2119ebf46688c5877dc1d0..643ba1338082766af039b36f270ba85ae9fa79d1 100644 (file)
@@ -49,3 +49,16 @@ const float MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosX[16]
 {0.5625, 0.4375, 0.3125, 0.7500, 0.1875, 0.6250, 0.8125, 0.6875, 0.3750, 0.5000, 0.2500, 0.1250, 0.0000, 0.9375, 0.8750, 0.0625};
 const float MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosY[16]
 {0.5625, 0.3125, 0.6250, 0.4375, 0.3750, 0.8125, 0.6875, 0.1875, 0.8750, 0.0625, 0.1250, 0.7500, 0.5000, 0.2500, 0.9375, 0.0000};
+
+const float MultisampleTraits<SWR_MULTISAMPLE_1X, SWR_MSAA_CENTER_PATTERN>::samplePosX{ 0.5f };
+const float MultisampleTraits<SWR_MULTISAMPLE_1X, SWR_MSAA_CENTER_PATTERN>::samplePosY{ 0.5f };
+const float MultisampleTraits<SWR_MULTISAMPLE_2X, SWR_MSAA_CENTER_PATTERN>::samplePosX[2]{ 0.5f, 0.5f};
+const float MultisampleTraits<SWR_MULTISAMPLE_2X, SWR_MSAA_CENTER_PATTERN>::samplePosY[2]{ 0.5f, 0.5f};
+const float MultisampleTraits<SWR_MULTISAMPLE_4X, SWR_MSAA_CENTER_PATTERN>::samplePosX[4]{ 0.5f, 0.5f, 0.5f, 0.5f};
+const float MultisampleTraits<SWR_MULTISAMPLE_4X, SWR_MSAA_CENTER_PATTERN>::samplePosY[4]{ 0.5f, 0.5f, 0.5f, 0.5f };
+const float MultisampleTraits<SWR_MULTISAMPLE_8X, SWR_MSAA_CENTER_PATTERN>::samplePosX[8]{ 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
+const float MultisampleTraits<SWR_MULTISAMPLE_8X, SWR_MSAA_CENTER_PATTERN>::samplePosY[8]{ 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
+const float MultisampleTraits<SWR_MULTISAMPLE_16X, SWR_MSAA_CENTER_PATTERN>::samplePosX[16]
+{ 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f };
+const float MultisampleTraits<SWR_MULTISAMPLE_16X, SWR_MSAA_CENTER_PATTERN>::samplePosY[16]
+{ 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f };
\ No newline at end of file
index 4ae777e2fc5cc1045d45b5af2030a39b76f034f8..c5096ed31c7720ffee498c7bcff701b23c404d43 100644 (file)
@@ -54,7 +54,7 @@ SWR_MULTISAMPLE_COUNT GetSampleCount(uint32_t numSamples)
 // hardcoded offsets based on Direct3d standard multisample positions
 // 8 x 8 pixel grid ranging from (0, 0) to (15, 15), with (0, 0) = UL pixel corner
 // coords are 0.8 fixed point offsets from (0, 0)
-template<SWR_MULTISAMPLE_COUNT sampleCount>
+template<SWR_MULTISAMPLE_COUNT sampleCount, SWR_MSAA_SAMPLE_PATTERN samplePattern = SWR_MSAA_STANDARD_PATTERN>
 struct MultisampleTraits
 {
     INLINE static __m128i vXi(uint32_t sampleNum) = delete;
@@ -74,7 +74,7 @@ struct MultisampleTraits
 };
 
 template<>
-struct MultisampleTraits<SWR_MULTISAMPLE_1X>
+struct MultisampleTraits<SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN>
 {
     INLINE static __m128i vXi(uint32_t sampleNum)
     {
@@ -143,10 +143,74 @@ struct MultisampleTraits<SWR_MULTISAMPLE_1X>
     static const float samplePosX;
     static const float samplePosY;
     static const uint32_t numSamples = 1;
+    static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_1X;
+    static const uint32_t numCoverageSamples = 1; 
 };
 
 template<>
-struct MultisampleTraits<SWR_MULTISAMPLE_2X>
+struct MultisampleTraits<SWR_MULTISAMPLE_1X, SWR_MSAA_CENTER_PATTERN>
+{
+    INLINE static __m128i vXi(uint32_t sampleNum)
+    {
+        return _mm_set1_epi32(0x80);
+    }
+
+    INLINE static __m128i vYi(uint32_t sampleNum)
+    {
+        return _mm_set1_epi32(0x80);
+    }
+
+    INLINE static simdscalar vX(uint32_t sampleNum)
+    {
+        return _simd_set1_ps(0.5f);
+    }
+
+    INLINE static simdscalar vY(uint32_t sampleNum)
+    {
+        return _simd_set1_ps(0.5f);
+    }
+
+    INLINE static float X(uint32_t sampleNum) {return 0.5f;};
+    INLINE static float Y(uint32_t sampleNum) {return 0.5f;};
+
+    INLINE static __m128i TileSampleOffsetsX()
+    {
+        // BR,            BL,           UR,            UL
+        return _mm_set1_epi32(0x80);
+    }
+
+    INLINE static __m128i TileSampleOffsetsY()
+    {
+        // BR,             BL,             UR,          UL
+        return _mm_set1_epi32(0x80);
+    }
+
+    INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
+    {
+        return 0;
+    }
+
+    INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
+    {
+        return 0;
+    }
+
+    INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
+    {
+        return 0;
+    }
+
+    INLINE static simdscalari FullSampleMask(){return _simd_set1_epi32(0x1);};
+    
+    static const uint32_t numSamples = 1;
+    static const float samplePosX;
+    static const float samplePosY;
+    static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_1X;
+    static const uint32_t numCoverageSamples = 1;
+};
+
+template<>
+struct MultisampleTraits<SWR_MULTISAMPLE_2X, SWR_MSAA_STANDARD_PATTERN>
 {
     INLINE static __m128i vXi(uint32_t sampleNum)
     {
@@ -238,10 +302,92 @@ struct MultisampleTraits<SWR_MULTISAMPLE_2X>
     static const float samplePosX[2];
     static const float samplePosY[2];
     static const uint32_t numSamples = 2;
+    static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_2X;
+    static const uint32_t numCoverageSamples = 2;
+};
+
+template<>
+struct MultisampleTraits<SWR_MULTISAMPLE_2X, SWR_MSAA_CENTER_PATTERN>
+{
+    INLINE static __m128i vXi(uint32_t sampleNum)
+    {
+        return _mm_set1_epi32(0x80);
+    }
+
+    INLINE static __m128i vYi(uint32_t sampleNum)
+    {
+        return _mm_set1_epi32(0x80);
+    }
+
+    INLINE static simdscalar vX(uint32_t sampleNum)
+    {
+        return _simd_set1_ps(0.5f);
+    }
+
+    INLINE static simdscalar vY(uint32_t sampleNum)
+    {
+        return _simd_set1_ps(0.5f);
+    }
+
+    INLINE static float X(uint32_t sampleNum) {return 0.5f;};
+    INLINE static float Y(uint32_t sampleNum) {return 0.5f;};
+
+    INLINE static __m128i TileSampleOffsetsX()
+    {
+        // BR,            BL,           UR,            UL
+        return _mm_set1_epi32(0x80);
+    }
+
+    INLINE static __m128i TileSampleOffsetsY()
+    {
+        // BR,             BL,             UR,          UL
+        return _mm_set1_epi32(0x80);
+    }
+
+    INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
+    {
+        static const uint32_t RasterTileColorOffsets[numSamples]
+        { 0,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8)
+        };
+        assert(sampleNum < numSamples);
+        return RasterTileColorOffsets[sampleNum];
+    }
+
+    INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
+    {
+        static const uint32_t RasterTileDepthOffsets[numSamples]
+        { 0,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8)
+        };
+        assert(sampleNum < numSamples);
+        return RasterTileDepthOffsets[sampleNum];
+    }
+
+    INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
+    {
+        static const uint32_t RasterTileStencilOffsets[numSamples]
+        { 0,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8)
+        };
+        assert(sampleNum < numSamples);
+        return RasterTileStencilOffsets[sampleNum];
+    }
+
+    INLINE static simdscalari FullSampleMask()
+    {
+         static const simdscalari mask =_simd_set1_epi32(0x3);
+         return mask;
+    }
+    static const uint32_t numSamples = 2;
+    static const float samplePosX[2];
+    static const float samplePosY[2];
+    static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_2X;
+    static const uint32_t numCoverageSamples = 1;
 };
 
 template<>
-struct MultisampleTraits<SWR_MULTISAMPLE_4X>
+struct MultisampleTraits<SWR_MULTISAMPLE_4X, SWR_MSAA_STANDARD_PATTERN>
 {
     INLINE static __m128i vXi(uint32_t sampleNum)
     {
@@ -343,10 +489,98 @@ struct MultisampleTraits<SWR_MULTISAMPLE_4X>
     static const float samplePosX[4];
     static const float samplePosY[4];
     static const uint32_t numSamples = 4;
+    static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_4X;
+    static const uint32_t numCoverageSamples = 4;
+};
+
+template<>
+struct MultisampleTraits<SWR_MULTISAMPLE_4X, SWR_MSAA_CENTER_PATTERN>
+{
+    INLINE static __m128i vXi(uint32_t sampleNum)
+    {
+        return _mm_set1_epi32(0x80);
+    }
+
+    INLINE static __m128i vYi(uint32_t sampleNum)
+    {
+        return _mm_set1_epi32(0x80);
+    }
+
+    INLINE static simdscalar vX(uint32_t sampleNum)
+    {
+        return _simd_set1_ps(0.5f);
+    }
+
+    INLINE static simdscalar vY(uint32_t sampleNum)
+    {
+        return _simd_set1_ps(0.5f);
+    }
+
+    INLINE static float X(uint32_t sampleNum) {return 0.5f;};
+    INLINE static float Y(uint32_t sampleNum) {return 0.5f;};
+
+    INLINE static __m128i TileSampleOffsetsX()
+    {
+        // BR,            BL,           UR,            UL
+        return _mm_set1_epi32(0x80);
+    }
+
+    INLINE static __m128i TileSampleOffsetsY()
+    {
+        // BR,             BL,             UR,          UL
+        return _mm_set1_epi32(0x80);
+    }
+
+    INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
+    {
+        static const uint32_t RasterTileColorOffsets[numSamples]
+        { 0,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
+        };
+        assert(sampleNum < numSamples);
+        return RasterTileColorOffsets[sampleNum];
+    }
+
+    INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
+    {
+        static const uint32_t RasterTileDepthOffsets[numSamples]
+        { 0,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
+        };
+        assert(sampleNum < numSamples);
+        return RasterTileDepthOffsets[sampleNum];
+    }
+
+    INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
+    {
+        static const uint32_t RasterTileStencilOffsets[numSamples]
+        { 0,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
+        };
+        assert(sampleNum < numSamples);
+        return RasterTileStencilOffsets[sampleNum];
+    }
+
+    INLINE static simdscalari FullSampleMask()
+    {
+        static const simdscalari mask = _simd_set1_epi32(0xF);
+        return mask;
+    }
+    static const uint32_t numSamples = 4;
+    static const float samplePosX[4];
+    static const float samplePosY[4];
+    static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_4X;
+    static const uint32_t numCoverageSamples = 1;
 };
 
 template<>
-struct MultisampleTraits<SWR_MULTISAMPLE_8X>
+struct MultisampleTraits<SWR_MULTISAMPLE_8X, SWR_MSAA_STANDARD_PATTERN>
 {
     INLINE static __m128i vXi(uint32_t sampleNum)
     {
@@ -464,10 +698,110 @@ struct MultisampleTraits<SWR_MULTISAMPLE_8X>
     static const float samplePosX[8];
     static const float samplePosY[8];
     static const uint32_t numSamples = 8;
+    static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_8X;
+    static const uint32_t numCoverageSamples = 8;
 };
 
 template<>
-struct MultisampleTraits<SWR_MULTISAMPLE_16X>
+struct MultisampleTraits<SWR_MULTISAMPLE_8X, SWR_MSAA_CENTER_PATTERN>
+{
+    INLINE static __m128i vXi(uint32_t sampleNum)
+    {
+        return _mm_set1_epi32(0x80);
+    }
+
+    INLINE static __m128i vYi(uint32_t sampleNum)
+    {
+        return _mm_set1_epi32(0x80);
+    }
+
+    INLINE static simdscalar vX(uint32_t sampleNum)
+    {
+        return _simd_set1_ps(0.5f);
+    }
+
+    INLINE static simdscalar vY(uint32_t sampleNum)
+    {
+        return _simd_set1_ps(0.5f);
+    }
+
+    INLINE static float X(uint32_t sampleNum) {return 0.5f;};
+    INLINE static float Y(uint32_t sampleNum) {return 0.5f;};
+
+    INLINE static __m128i TileSampleOffsetsX()
+    {
+        // BR,            BL,           UR,            UL
+        return _mm_set1_epi32(0x80);
+    }
+
+    INLINE static __m128i TileSampleOffsetsY()
+    {
+        // BR,             BL,             UR,          UL
+        return _mm_set1_epi32(0x80);
+    }
+
+    INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
+    {
+        static const uint32_t RasterTileColorOffsets[numSamples]
+        { 0,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
+        };
+        assert(sampleNum < numSamples);
+        return RasterTileColorOffsets[sampleNum];
+    }
+
+    INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
+    {
+        static const uint32_t RasterTileDepthOffsets[numSamples]
+        { 0,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
+        };
+        assert(sampleNum < numSamples);
+        return RasterTileDepthOffsets[sampleNum];
+    }
+
+    INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
+    {
+        static const uint32_t RasterTileStencilOffsets[numSamples]
+        { 0,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
+        };
+        assert(sampleNum < numSamples);
+        return RasterTileStencilOffsets[sampleNum];
+    }
+
+    INLINE static simdscalari FullSampleMask()
+    {
+        static const simdscalari mask = _simd_set1_epi32(0xFF);
+        return mask;
+    }
+    static const uint32_t numSamples = 8;
+    static const float samplePosX[8];
+    static const float samplePosY[8];
+    static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_8X;
+    static const uint32_t numCoverageSamples = 1;
+};
+
+template<>
+struct MultisampleTraits<SWR_MULTISAMPLE_16X, SWR_MSAA_STANDARD_PATTERN>
 {
     INLINE static __m128i vXi(uint32_t sampleNum)
     {
@@ -617,4 +951,128 @@ struct MultisampleTraits<SWR_MULTISAMPLE_16X>
     static const float samplePosX[16];
     static const float samplePosY[16];
     static const uint32_t numSamples = 16;
+    static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_16X;
+    static const uint32_t numCoverageSamples = 16;
+};
+
+template<>
+struct MultisampleTraits<SWR_MULTISAMPLE_16X, SWR_MSAA_CENTER_PATTERN>
+{
+    INLINE static __m128i vXi(uint32_t sampleNum)
+    {
+        return _mm_set1_epi32(0x80);
+    }
+
+    INLINE static __m128i vYi(uint32_t sampleNum)
+    {
+        return _mm_set1_epi32(0x80);
+    }
+
+    INLINE static simdscalar vX(uint32_t sampleNum)
+    {
+        return _simd_set1_ps(0.5f);
+    }
+
+    INLINE static simdscalar vY(uint32_t sampleNum)
+    {
+        return _simd_set1_ps(0.5f);
+    }
+
+    INLINE static float X(uint32_t sampleNum) {return 0.5f;};
+    INLINE static float Y(uint32_t sampleNum) {return 0.5f;};
+
+    INLINE static __m128i TileSampleOffsetsX()
+    {
+        // BR,            BL,           UR,            UL
+        return _mm_set1_epi32(0x80);
+    }
+
+    INLINE static __m128i TileSampleOffsetsY()
+    {
+        // BR,             BL,             UR,          UL
+        return _mm_set1_epi32(0x80);
+    }
+
+    INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
+    {
+        static const uint32_t RasterTileColorOffsets[numSamples]
+        { 0,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15,
+        };
+        assert(sampleNum < numSamples);
+        return RasterTileColorOffsets[sampleNum];
+    }
+
+    INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
+    {
+        static const uint32_t RasterTileDepthOffsets[numSamples]
+        { 0,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15,
+        };
+        assert(sampleNum < numSamples);
+        return RasterTileDepthOffsets[sampleNum];
+    }
+
+    INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
+    {
+        static const uint32_t RasterTileStencilOffsets[numSamples]
+        { 0,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14,
+          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15,
+        };
+        assert(sampleNum < numSamples);
+        return RasterTileStencilOffsets[sampleNum];
+    }
+
+    INLINE static simdscalari FullSampleMask()
+    {
+        static const simdscalari mask = _simd_set1_epi32(0xFFFF);
+        return mask;
+    }
+    static const uint32_t numSamples = 16;
+    static const float samplePosX[16];
+    static const float samplePosY[16];
+    static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_16X;
+    static const uint32_t numCoverageSamples = 1;
 };