BackendPixelRate should be easier to read/maintain now hopefully.
Small perf bump by moving some of the pfn's to inline functions
without template params.
Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
" |-> ",
" |-> ",
" |-> ",
- " |-> "
+ " |-> ",
+ " |-> ",
+ " |-> ",
};
// compute percent of total cycles used by this bucket
pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1;
}
}
+
// templated backend function tables
extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
-extern PFN_BACKEND_FUNC gBackendSingleSample[2][2];
-extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
-extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2];
-extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS + 1][SWR_MULTISAMPLE_TYPE_MAX];
-extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2];
-extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2];
+extern PFN_BACKEND_FUNC gBackendSingleSample[2][2][2];
+extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2][2];
+extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
void SetupPipeline(DRAW_CONTEXT *pDC)
{
DRAW_STATE* pState = pDC->pState;
if (psState.pfnPixelShader == nullptr)
{
backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
- // always need to generate I & J per sample for Z interpolation
- backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[1];
}
else
{
const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount) ? 1 : 0;
const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
+ const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesSourceDepth && !psState.usesUAV)) ? 1 : 0;
// currently only support 'normal' input coverage
SWR_ASSERT(psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL ||
{
// always need to generate I & J per sample for Z interpolation
barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
- backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount];
- backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
+ backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount][canEarlyZ];
}
else
{
// always need to generate I & J per pixel for Z interpolation
barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
- backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid];
- backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][SWR_MULTISAMPLE_1X];
+ backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid][canEarlyZ];
}
break;
case SWR_SHADING_RATE_SAMPLE:
SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
// always need to generate I & J per sample for Z interpolation
barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
- backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid];
- backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
+ backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid][canEarlyZ];
break;
default:
SWR_ASSERT(0 && "Invalid shading rate");
break;
}
-
- // setup pointer to function that generates necessary barycentrics required by the PS
- bool bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_PIXEL_MASK) > 0 ? 1 : 0;
- backendFuncs.pfnCalcPixelBarycentrics = gPixelBarycentricTable[bBarycentrics];
-
- bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_SAMPLE_MASK) > 0 ? 1 : 0;
- backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[bBarycentrics];
}
PFN_PROCESS_PRIMS pfnBinner;
#include <smmintrin.h>
-#include "rdtsc_core.h"
#include "backend.h"
#include "depthstencil.h"
#include "tilemgr.h"
return _simd_movemask_ps(vClipMask);
}
-template<bool bGenerateBarycentrics>
-INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
-{
- if(bGenerateBarycentrics)
- {
- // evaluate I,J
- psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
- psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
- psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
- psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
-
- // interpolate 1/w
- psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
- }
-}
-
-template<bool bGenerateBarycentrics>
-INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
-{
- if(bGenerateBarycentrics)
- {
- // evaluate I,J
- psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
- psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
- psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
- psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
-
- // interpolate 1/w
- psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
- }
-}
-
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Centroid behaves exactly as follows :
-// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
-// have a sample location there).
-// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
-// coverage with the SampleMask Rasterizer State.
-// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
-// evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the
-// SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-template<typename T>
-INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const coverageMask, const uint32_t sampleMask,
- const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
-{
- uint32_t inputMask[KNOB_SIMD_WIDTH];
- generateInputCoverage<T>(coverageMask, inputMask, sampleMask);
-
- // Case (2) - partially covered pixel
-
- // scan for first covered sample per pixel in the 4x2 span
- unsigned long sampleNum[KNOB_SIMD_WIDTH];
- (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
- (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
- (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
- (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
- (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
- (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
- (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
- (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
-
- // look up and set the sample offsets from UL pixel corner for first covered sample
- __m256 vXSample = _mm256_set_ps(T::MultisampleT::X(sampleNum[7]),
- T::MultisampleT::X(sampleNum[6]),
- T::MultisampleT::X(sampleNum[5]),
- T::MultisampleT::X(sampleNum[4]),
- T::MultisampleT::X(sampleNum[3]),
- T::MultisampleT::X(sampleNum[2]),
- T::MultisampleT::X(sampleNum[1]),
- T::MultisampleT::X(sampleNum[0]));
-
- __m256 vYSample = _mm256_set_ps(T::MultisampleT::Y(sampleNum[7]),
- T::MultisampleT::Y(sampleNum[6]),
- T::MultisampleT::Y(sampleNum[5]),
- T::MultisampleT::Y(sampleNum[4]),
- T::MultisampleT::Y(sampleNum[3]),
- T::MultisampleT::Y(sampleNum[2]),
- T::MultisampleT::Y(sampleNum[1]),
- T::MultisampleT::Y(sampleNum[0]));
- // add sample offset to UL pixel corner
- vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
- vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
-
- // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
- static const __m256i vFullyCoveredMask = T::MultisampleT::FullSampleMask();
- __m256i vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
- __m256i vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
-
- static const __m256i vZero = _simd_setzero_si();
- const __m256i vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
- __m256i vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
- __m256i vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
- __m256i vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
-
- __m256i vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
-
- // set the centroid position based on results from above
- psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
- psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
-
- // Case (3a) No samples covered and partial sample mask
- __m256i vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
- // sample mask should never be all 0's for this case, but handle it anyways
- unsigned long firstCoveredSampleMaskSample = 0;
- (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
-
- __m256i vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
-
- vXSample = _simd_set1_ps(T::MultisampleT::X(firstCoveredSampleMaskSample));
- vYSample = _simd_set1_ps(T::MultisampleT::Y(firstCoveredSampleMaskSample));
-
- // blend in case 3a pixel locations
- psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
- psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
-}
-
-template<typename T>
-INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
- const uint64_t *const coverageMask, const uint32_t sampleMask,
- const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
-{
- if(T::bIsStandardPattern)
- {
- ///@ todo: don't need to generate input coverage 2x if input coverage and centroid
- CalcCentroidPos<T>(psContext, coverageMask, sampleMask, vXSamplePosUL, vYSamplePosUL);
- }
- else
- {
- static const __m256 pixelCenter = _simd_set1_ps(0.5f);
- psContext.vX.centroid = _simd_add_ps(vXSamplePosUL, pixelCenter);
- psContext.vY.centroid = _simd_add_ps(vYSamplePosUL, pixelCenter);
- }
- // evaluate I,J
- psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
- psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
- psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
- psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
-
- // interpolate 1/w
- psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
-}
-
-template<uint32_t NumRT, uint32_t sampleCountT>
-void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
- const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask)
-{
- // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
- static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
- uint32_t rasterTileColorOffset = MultisampleTraits<sampleCount>::RasterTileColorOffset(sample);
- simdvector blendOut;
-
- for(uint32_t rt = 0; rt < NumRT; ++rt)
- {
- uint8_t *pColorSample;
- if(sampleCount == SWR_MULTISAMPLE_1X)
- {
- pColorSample = pColorBase[rt];
- }
- else
- {
- pColorSample = pColorBase[rt] + rasterTileColorOffset;
- }
-
- const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
- // pfnBlendFunc may not update all channels. Initialize with PS output.
- /// TODO: move this into the blend JIT.
- blendOut = psContext.shaded[rt];
-
- // Blend outputs and update coverage mask for alpha test
- if(pfnBlendFunc[rt] != nullptr)
- {
- pfnBlendFunc[rt](
- pBlendState,
- psContext.shaded[rt],
- psContext.shaded[1],
- sample,
- pColorSample,
- blendOut,
- &psContext.oMask,
- (simdscalari*)&coverageMask);
- }
-
- // final write mask
- simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
-
- ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
- static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
-
- const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
-
- // store with color mask
- if(!pRTBlend->writeDisableRed)
- {
- _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
- }
- if(!pRTBlend->writeDisableGreen)
- {
- _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
- }
- if(!pRTBlend->writeDisableBlue)
- {
- _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
- }
- if(!pRTBlend->writeDisableAlpha)
- {
- _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
- }
- }
-}
-
template<typename T>
void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
{
+ RDTSC_START(BESingleSampleBackend);
RDTSC_START(BESetup);
SWR_CONTEXT *pContext = pDC->pContext;
const SWR_RASTSTATE& rastState = state.rastState;
const SWR_PS_STATE *pPSState = &state.psState;
const SWR_BLEND_STATE *pBlendState = &state.blendState;
- const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
uint64_t coverageMask = work.coverageMask[0];
// broadcast scalars
for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
- if(T::bInputCoverage)
- {
- generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
- }
-
if(coverageMask & MASK)
{
- RDTSC_START(BEBarycentric);
psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
// pixel center
psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
- backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
+ if(T::bInputCoverage)
+ {
+ generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
+ }
+
+ RDTSC_START(BEBarycentric);
+ CalcPixelBarycentrics(coeffs, psContext);
if(T::bCentroidPos)
{
// interpolate and quantize z
psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-
RDTSC_STOP(BEBarycentric, 0, 0);
simdmask clipCoverageMask = coverageMask & MASK;
-
// interpolate user clip distance if available
if(rastState.clipDistanceMask)
{
simdscalar stencilPassMask = vCoverageMask;
// Early-Z?
- if(CanEarlyZ(pPSState))
+ if(T::bCanEarlyZ)
{
RDTSC_START(BEEarlyDepthTest);
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
vCoverageMask = _simd_castsi_ps(psContext.activeMask);
// late-Z
- if(!CanEarlyZ(pPSState))
+ if(!T::bCanEarlyZ)
{
RDTSC_START(BELateDepthTest);
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
// output merger
RDTSC_START(BEOutputMerger);
- backendFuncs.pfnOutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc,
- vCoverageMask, depthPassMask);
+ OutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets);
// do final depth write after all pixel kills
if (!pPSState->forceEarlyZ)
RDTSC_STOP(BEEndTile, 0, 0);
}
}
+ RDTSC_STOP(BESingleSampleBackend, 0, 0);
}
template<typename T>
void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
{
+ RDTSC_START(BESampleRateBackend);
RDTSC_START(BESetup);
SWR_CONTEXT *pContext = pDC->pContext;
const SWR_RASTSTATE& rastState = state.rastState;
const SWR_PS_STATE *pPSState = &state.psState;
const SWR_BLEND_STATE *pBlendState = &state.blendState;
- const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
// broadcast scalars
BarycentricCoeffs coeffs;
psContext.recipDet = work.recipDet;
psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
- const uint32_t numSamples = T::MultisampleT::numSamples;
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
RDTSC_START(BEBarycentric);
- backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
+ CalcPixelBarycentrics(coeffs, psContext);
RDTSC_STOP(BEBarycentric, 0, 0);
if(T::bInputCoverage)
RDTSC_STOP(BEBarycentric, 0, 0);
}
- for(uint32_t sample = 0; sample < numSamples; sample++)
+ for(uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
{
- if (work.coverageMask[sample] & MASK)
+ simdmask coverageMask = work.coverageMask[sample] & MASK;
+ if (coverageMask)
{
RDTSC_START(BEBarycentric);
-
// calculate per sample positions
psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
-
- simdmask coverageMask = work.coverageMask[sample] & MASK;
- simdscalar vCoverageMask = vMask(coverageMask);
- backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
+ CalcSampleBarycentrics(coeffs, psContext);
// interpolate and quantize z
psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-
RDTSC_STOP(BEBarycentric, 0, 0);
// interpolate user clip distance if available
coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
psContext.vI.sample, psContext.vJ.sample);
}
-
+
+ simdscalar vCoverageMask = vMask(coverageMask);
simdscalar depthPassMask = vCoverageMask;
simdscalar stencilPassMask = vCoverageMask;
// offset depth/stencil buffers current sample
- uint8_t *pDepthSample = pDepthBase + T::MultisampleT::RasterTileDepthOffset(sample);
- uint8_t *pStencilSample = pStencilBase + T::MultisampleT::RasterTileStencilOffset(sample);
+ uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
+ uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
// Early-Z?
- if (CanEarlyZ(pPSState))
+ if (T::bCanEarlyZ)
{
RDTSC_START(BEEarlyDepthTest);
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
vCoverageMask = _simd_castsi_ps(psContext.activeMask);
// late-Z
- if (!CanEarlyZ(pPSState))
+ if (!T::bCanEarlyZ)
{
RDTSC_START(BELateDepthTest);
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
// output merger
RDTSC_START(BEOutputMerger);
- backendFuncs.pfnOutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc,
- vCoverageMask, depthPassMask);
+ OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets);
// do final depth write after all pixel kills
if (!pPSState->forceEarlyZ)
RDTSC_STOP(BEEndTile, 0, 0);
}
}
+ RDTSC_STOP(BESampleRateBackend, 0, 0);
}
template<typename T>
void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
{
+ RDTSC_START(BEPixelRateBackend);
RDTSC_START(BESetup);
SWR_CONTEXT *pContext = pDC->pContext;
const SWR_RASTSTATE& rastState = state.rastState;
const SWR_PS_STATE *pPSState = &state.psState;
const SWR_BLEND_STATE *pBlendState = &state.blendState;
- const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
// broadcast scalars
BarycentricCoeffs coeffs;
psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
psContext.sampleIndex = 0;
-
- uint32_t numOMSamples;
- // RT has to be single sample if we're in forcedMSAA mode
- if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
- {
- numOMSamples = 1;
- }
- // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
- else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
- {
- numOMSamples = GetNumSamples(pBlendState->sampleCount);
- }
- // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
- else
- {
- numOMSamples = T::MultisampleT::numSamples;
- }
+ PixelRateZTestLoop<T> PixelRateZTest(pDC, work, coeffs, state, pDepthBase, pStencilBase, rastState.clipDistanceMask);
+
for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
- simdscalar vZ[T::MultisampleT::numSamples]{ 0 };
+ if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
+
psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
// set pixel center positions
psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
+ RDTSC_START(BEBarycentric);
+ CalcPixelBarycentrics(coeffs, psContext);
+ RDTSC_STOP(BEBarycentric, 0, 0);
+
if (T::bInputCoverage)
{
generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
RDTSC_STOP(BEBarycentric, 0, 0);
}
- // if oDepth written to, or there is a potential to discard any samples, we need to
- // run the PS early, then interp or broadcast Z and test
- if(pPSState->writesODepth || pPSState->killsPixel)
+ simdscalar activeLanes;
+ if(T::bForcedSampleCount)
{
- RDTSC_START(BEBarycentric);
- backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
-
- // interpolate and quantize z
- psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
- psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
- RDTSC_STOP(BEBarycentric, 0, 0);
+ // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
+ const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si()));
+ activeLanes = _simd_and_ps(vMask(work.anyCoveredSamples & MASK), vSampleMask);
+ }
- // execute pixel shader
- RDTSC_START(BEPixelShader);
- state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
- RDTSC_STOP(BEPixelShader, 0, 0);
+ // Early-Z?
+ if(T::bCanEarlyZ && !T::bForcedSampleCount)
+ {
+ activeLanes = _simd_setzero_ps();
+ uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
+ UPDATE_STAT(DepthPassCount, depthPassCount);
}
- else
+ // if we can't do early z, set the active mask to any samples covered in the current simd
+ else if(!T::bCanEarlyZ && !T::bForcedSampleCount)
{
- psContext.activeMask = _simd_set1_epi32(-1);
+ activeLanes = vMask(work.anyCoveredSamples & MASK);
}
- // need to declare enough space for all samples
- simdscalar vCoverageMask[T::MultisampleT::numSamples];
- simdscalar depthPassMask[T::MultisampleT::numSamples];
- simdscalar stencilPassMask[T::MultisampleT::numSamples];
- simdscalar anyDepthSamplePassed = _simd_setzero_ps();
- simdscalar anyStencilSamplePassed = _simd_setzero_ps();
- for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
+ // if we have no covered samples that passed depth at this point, go to next tile
+ if(!_simd_movemask_ps(activeLanes))
{
- vCoverageMask[sample] = vMask(work.coverageMask[sample] & MASK);
-
- // pull mask back out for any discards and and with coverage
- vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_castsi_ps(psContext.activeMask));
-
- if (!_simd_movemask_ps(vCoverageMask[sample]))
- {
- vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
- continue;
- }
-
- if(T::bForcedSampleCount)
- {
- // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
- const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si()));
- anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, _simd_and_ps(vCoverageMask[sample], vSampleMask));
- continue;
- }
-
- depthPassMask[sample] = vCoverageMask[sample];
-
- // if oDepth isn't written to, we need to interpolate Z for each sample
- // if clip distances are enabled, we need to interpolate for each sample
- if(!pPSState->writesODepth || rastState.clipDistanceMask)
- {
- RDTSC_START(BEBarycentric);
- if(T::bIsStandardPattern)
- {
- // calculate per sample positions
- psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
- psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
- }
- else
- {
- psContext.vX.sample = psContext.vX.center;
- psContext.vY.sample = psContext.vY.center;
- }
-
- // calc I & J per sample
- backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
-
- // interpolate and quantize z
- if (!pPSState->writesODepth)
- {
- vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
- vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
- }
-
- ///@todo: perspective correct vs non-perspective correct clipping?
- // interpolate clip distances
- if (rastState.clipDistanceMask)
- {
- uint8_t clipMask = ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
- psContext.vI.sample, psContext.vJ.sample);
- vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
- }
- RDTSC_STOP(BEBarycentric, 0, 0);
- }
- // else 'broadcast' and test psContext.vZ written from the PS each sample
- else
- {
- vZ[sample] = psContext.vZ;
- }
-
- // offset depth/stencil buffers current sample
- uint8_t *pDepthSample = pDepthBase + T::MultisampleT::RasterTileDepthOffset(sample);
- uint8_t * pStencilSample = pStencilBase + T::MultisampleT::RasterTileStencilOffset(sample);
-
- // ZTest for this sample
- RDTSC_START(BEEarlyDepthTest);
- stencilPassMask[sample] = vCoverageMask[sample];
- depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing,
- vZ[sample], pDepthSample, vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]);
- RDTSC_STOP(BEEarlyDepthTest, 0, 0);
-
- anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
- anyStencilSamplePassed = _simd_or_ps(anyStencilSamplePassed, stencilPassMask[sample]);
- uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
- uint32_t statCount = _mm_popcnt_u32(statMask);
- UPDATE_STAT(DepthPassCount, statCount);
+ goto Endtile;
}
- // if we didn't have to execute the PS early, and at least 1 sample passed the depth test, run the PS
- if(!pPSState->writesODepth && !pPSState->killsPixel && _simd_movemask_ps(anyDepthSamplePassed))
+ if(pPSState->usesSourceDepth)
{
RDTSC_START(BEBarycentric);
- backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
// interpolate and quantize z
psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
RDTSC_STOP(BEBarycentric, 0, 0);
+ }
- // execute pixel shader
- RDTSC_START(BEPixelShader);
- state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
- RDTSC_STOP(BEPixelShader, 0, 0);
+ // pixels that are currently active
+ psContext.activeMask = _simd_castps_si(activeLanes);
+ psContext.oMask = T::MultisampleT::FullSampleMask();
+
+ // execute pixel shader
+ RDTSC_START(BEPixelShader);
+ state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+ UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
+ RDTSC_STOP(BEPixelShader, 0, 0);
+
+ // update active lanes to remove any discarded or oMask'd pixels
+ activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
+ if(!_simd_movemask_ps(activeLanes))
+ {
+ goto Endtile;
}
- ///@todo: make sure this works for kill pixel
- else if(!_simd_movemask_ps(anyStencilSamplePassed))
+
+ // late-Z
+ if(!T::bCanEarlyZ && !T::bForcedSampleCount)
+ {
+ uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
+ UPDATE_STAT(DepthPassCount, depthPassCount);
+ }
+
+ // if we have no covered samples that passed depth at this point, skip OM and go to next tile
+ if(!_simd_movemask_ps(activeLanes))
{
goto Endtile;
}
+ // output merger
// loop over all samples, broadcasting the results of the PS to all passing pixels
- for(uint32_t sample = 0; sample < numOMSamples; sample++)
+ for(uint32_t sample = 0; sample < GetNumOMSamples<T>(pBlendState->sampleCount); sample++)
{
- uint8_t *pDepthSample = pDepthBase + T::MultisampleT::RasterTileDepthOffset(sample);
- uint8_t * pStencilSample = pStencilBase + T::MultisampleT::RasterTileStencilOffset(sample);
-
- // output merger
RDTSC_START(BEOutputMerger);
-
- // skip if none of the pixels for this sample passed
- simdscalar coverageMaskSample;
- simdscalar depthMaskSample;
- simdscalar stencilMaskSample;
- simdscalar vInterpolatedZ;
-
- // forcedSampleCount outputs to any pixels with covered samples not masked off by SampleMask
- // depth test is disabled, so just set the z val to 0.
+ // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
+ uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0;
+ simdscalar coverageMask, depthMask;
if(T::bForcedSampleCount)
{
- coverageMaskSample = depthMaskSample = anyDepthSamplePassed;
- vInterpolatedZ = _simd_setzero_ps();
- }
- else if(T::bIsStandardPattern)
- {
- if(!_simd_movemask_ps(depthPassMask[sample]))
- {
- depthPassMask[sample] = _simd_setzero_ps();
- DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample], pDepthSample, depthPassMask[sample],
- vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
- continue;
- }
- coverageMaskSample = vCoverageMask[sample];
- depthMaskSample = depthPassMask[sample];
- stencilMaskSample = stencilPassMask[sample];
- vInterpolatedZ = vZ[sample];
+ coverageMask = depthMask = activeLanes;
}
else
{
- // center pattern only needs to use a single depth test as all samples are at the same position
- if(!_simd_movemask_ps(depthPassMask[0]))
+ coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
+ depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
+ if(!_simd_movemask_ps(depthMask))
{
- depthPassMask[0] = _simd_setzero_ps();
- DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[0], pDepthSample, depthPassMask[0],
- vCoverageMask[0], pStencilSample, stencilPassMask[0]);
+ // stencil should already have been written in early/lateZ tests
+ RDTSC_STOP(BEOutputMerger, 0, 0);
continue;
}
- coverageMaskSample = (vCoverageMask[0]);
- depthMaskSample = depthPassMask[0];
- stencilMaskSample = stencilPassMask[0];
- vInterpolatedZ = vZ[0];
}
+
+ // broadcast the results of the PS to all passing pixels
+ OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, coverageMask, depthMask, pPSState->numRenderTargets);
- // output merger
- RDTSC_START(BEOutputMerger);
- backendFuncs.pfnOutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc,
- coverageMaskSample, depthMaskSample);
+ if(!pPSState->forceEarlyZ && !T::bForcedSampleCount)
+ {
+ uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
+ uint8_t * pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
- DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vInterpolatedZ, pDepthSample, depthMaskSample,
- coverageMaskSample, pStencilSample, stencilMaskSample);
- RDTSC_STOP(BEOutputMerger, 0, 0);
+ DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
+ pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
+ }
+ RDTSC_STOP(BEOutputMerger, 0, 0);
}
-
Endtile:
RDTSC_START(BEEndTile);
for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
}
+ work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
RDTSC_STOP(BEEndTile, 0, 0);
}
}
+ RDTSC_STOP(BEPixelRateBackend, 0, 0);
}
// optimized backend flow with NULL PS
template<uint32_t sampleCountT>
void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
{
+ RDTSC_START(BENullBackend);
///@todo: handle center multisample pattern
typedef SwrBackendTraits<sampleCountT, SWR_MSAA_STANDARD_PATTERN> T;
RDTSC_START(BESetup);
SWR_CONTEXT *pContext = pDC->pContext;
const API_STATE& state = GetApiState(pDC);
- const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
// broadcast scalars
psContext.vX.sample = _simd_add_ps(vXSamplePosUL, T::MultisampleT::vX(sample));
psContext.vY.sample = _simd_add_ps(vYSamplePosUL, T::MultisampleT::vY(sample));
- backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
+ CalcSampleBarycentrics(coeffs, psContext);
// interpolate and quantize z
psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
simdscalar stencilPassMask = vCoverageMask;
// offset depth/stencil buffers current sample
- uint8_t *pDepthSample = pDepthBase + T::MultisampleT::RasterTileDepthOffset(sample);
- uint8_t *pStencilSample = pStencilBase + T::MultisampleT::RasterTileStencilOffset(sample);
+ uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
+ uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
RDTSC_START(BEEarlyDepthTest);
simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
}
}
+ RDTSC_STOP(BENullBackend, 0, 0);
}
void InitClearTilesTable()
}
PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
-PFN_BACKEND_FUNC gBackendSingleSample[2][2] = {};
-PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2] = {};
-PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2] = {};
-PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX] = {};
-PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2] = {};
-PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2] = {};
-
-// Recursive template used to auto-nest conditionals. Converts dynamic enum function
-// arguments to static template arguments.
-template <uint32_t... ArgsT>
-struct OMChooser
-{
- // Last Arg Terminator
- static PFN_OUTPUT_MERGER GetFunc(SWR_MULTISAMPLE_COUNT tArg)
- {
- switch(tArg)
- {
- case SWR_MULTISAMPLE_1X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_1X>; break;
- case SWR_MULTISAMPLE_2X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_2X>; break;
- case SWR_MULTISAMPLE_4X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_4X>; break;
- case SWR_MULTISAMPLE_8X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_8X>; break;
- case SWR_MULTISAMPLE_16X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_16X>; break;
- default:
- SWR_ASSERT(0 && "Invalid sample count\n");
- return nullptr;
- break;
- }
- }
-
- // Recursively parse args
- template <typename... TArgsT>
- static PFN_OUTPUT_MERGER GetFunc(uint32_t tArg, TArgsT... remainingArgs)
- {
- switch(tArg)
- {
- case 0: return OMChooser<ArgsT..., 0>::GetFunc(remainingArgs...); break;
- case 1: return OMChooser<ArgsT..., 1>::GetFunc(remainingArgs...); break;
- case 2: return OMChooser<ArgsT..., 2>::GetFunc(remainingArgs...); break;
- case 3: return OMChooser<ArgsT..., 3>::GetFunc(remainingArgs...); break;
- case 4: return OMChooser<ArgsT..., 4>::GetFunc(remainingArgs...); break;
- case 5: return OMChooser<ArgsT..., 5>::GetFunc(remainingArgs...); break;
- case 6: return OMChooser<ArgsT..., 6>::GetFunc(remainingArgs...); break;
- case 7: return OMChooser<ArgsT..., 7>::GetFunc(remainingArgs...); break;
- case 8: return OMChooser<ArgsT..., 8>::GetFunc(remainingArgs...); break;
- default:
- SWR_ASSERT(0 && "Invalid RT index\n");
- return nullptr;
- break;
- }
- }
-};
+PFN_BACKEND_FUNC gBackendSingleSample[2] // input coverage
+ [2] // centroid
+ [2] // canEarlyZ
+ = {};
+PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX]
+ [SWR_MSAA_SAMPLE_PATTERN_MAX]
+ [SWR_INPUT_COVERAGE_MAX]
+ [2] // centroid
+ [2] // forcedSampleCount
+ [2] // canEarlyZ
+ = {};
+PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX]
+ [2] // centroid
+ [2] // canEarlyZ
+ = {};
// Recursive template used to auto-nest conditionals. Converts dynamic enum function
// arguments to static template arguments.
}
};
-template <uint32_t numRenderTargets, SWR_MULTISAMPLE_COUNT numSampleRates>
-void InitBackendOMFuncTable(PFN_OUTPUT_MERGER (&table)[numRenderTargets][numSampleRates])
+void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[2][2][2])
{
- for(uint32_t rtNum = SWR_ATTACHMENT_COLOR0; rtNum < numRenderTargets; rtNum++)
+ for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < SWR_INPUT_COVERAGE_MAX; inputCoverage++)
{
- for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
+ for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
{
- table[rtNum][sampleCount] =
- OMChooser<>::GetFunc((SWR_RENDERTARGET_ATTACHMENT)rtNum, (SWR_MULTISAMPLE_COUNT)sampleCount);
+ for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
+ {
+ table[inputCoverage][isCentroid][canEarlyZ] =
+ BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL),
+ (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
+ }
}
}
}
-template <SWR_MULTISAMPLE_COUNT numSampleRates>
-void InitBackendBarycentricsTables(PFN_CALC_PIXEL_BARYCENTRICS (&pixelTable)[2],
- PFN_CALC_SAMPLE_BARYCENTRICS (&sampleTable)[2])
+void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX]
+ [2][2][2])
{
- pixelTable[0] = CalcPixelBarycentrics<0>;
- pixelTable[1] = CalcPixelBarycentrics<1>;
-
- sampleTable[0] = CalcSampleBarycentrics<0>;
- sampleTable[1] = CalcSampleBarycentrics<1>;
-}
-
-void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[2][2])
-{
- gBackendSingleSample[0][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, false, false, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
- gBackendSingleSample[0][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, false, true, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
- gBackendSingleSample[1][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, true, false, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
- gBackendSingleSample[1][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, true, true, false, false,(SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
-}
-
-template <SWR_MULTISAMPLE_COUNT numSampleRates, SWR_MSAA_SAMPLE_PATTERN numSamplePatterns, SWR_INPUT_COVERAGE numCoverageModes>
-void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numSamplePatterns][numCoverageModes][2][2])
-{
- for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
+ for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_MAX; sampleCount++)
{
- for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < numSamplePatterns; samplePattern++)
+ for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < SWR_MSAA_SAMPLE_PATTERN_MAX; samplePattern++)
{
- for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < numCoverageModes; inputCoverage++)
+ for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < SWR_INPUT_COVERAGE_MAX; inputCoverage++)
{
for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
{
- table[sampleCount][samplePattern][inputCoverage][isCentroid][0] =
- BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), (isCentroid > 0),
- false, false, SWR_BACKEND_MSAA_PIXEL_RATE);
- table[sampleCount][samplePattern][inputCoverage][isCentroid][1] =
- BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), (isCentroid > 0),
- true, false, SWR_BACKEND_MSAA_PIXEL_RATE);
+ for(uint32_t forcedSampleCount = 0; forcedSampleCount < 2; forcedSampleCount++)
+ {
+ for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
+ {
+ table[sampleCount][samplePattern][inputCoverage][isCentroid][forcedSampleCount][canEarlyZ] =
+ BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL),
+ (isCentroid > 0), (forcedSampleCount > 0), (canEarlyZ > 0), SWR_BACKEND_MSAA_PIXEL_RATE);
+ }
+ }
}
}
}
}
}
-template <uint32_t numSampleRates, uint32_t numCoverageModes>
-void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numCoverageModes][2])
+void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2][2])
{
- for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
+ for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_MAX; sampleCount++)
{
- for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < numCoverageModes; inputCoverage++)
+ for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < SWR_INPUT_COVERAGE_MAX; inputCoverage++)
{
- table[sampleCount][inputCoverage][0] =
- BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), false, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
- table[sampleCount][inputCoverage][1] =
- BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), true, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
+ for(uint32_t centroid = 0; centroid < 2; centroid++)
+ {
+ for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
+ {
+ table[sampleCount][inputCoverage][centroid][canEarlyZ] =
+ BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL),
+ (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
+ }
+ }
}
}
}
void InitBackendFuncTables()
{
- InitBackendSampleFuncTable(gBackendSingleSample);
- InitBackendPixelFuncTable<(SWR_MULTISAMPLE_COUNT)SWR_MULTISAMPLE_TYPE_MAX, SWR_MSAA_SAMPLE_PATTERN_MAX, SWR_INPUT_COVERAGE_MAX>(gBackendPixelRateTable);
- InitBackendSampleFuncTable<SWR_MULTISAMPLE_TYPE_MAX, SWR_INPUT_COVERAGE_MAX>(gBackendSampleRateTable);
- InitBackendOMFuncTable<SWR_NUM_RENDERTARGETS+1, SWR_MULTISAMPLE_TYPE_MAX>(gBackendOutputMergerTable);
- InitBackendBarycentricsTables<(SWR_MULTISAMPLE_COUNT)(SWR_MULTISAMPLE_TYPE_MAX)>(gPixelBarycentricTable, gSampleBarycentricTable);
+ InitBackendSingleFuncTable(gBackendSingleSample);
+ InitBackendPixelFuncTable(gBackendPixelRateTable);
+ InitBackendSampleFuncTable(gBackendSampleRateTable);
gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ;
#include "common/os.h"
#include "core/context.h"
#include "core/multisample.h"
+#include "rdtsc_core.h"
void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer);
void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ);
void InitBackendFuncTables();
void InitCPSFuncTables();
+void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext);
enum SWR_BACKEND_FUNCS
{
#define MASK 0xff
#endif
+INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
+{
+ static const uint32_t RasterTileColorOffsets[16]
+ { 0,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15,
+ };
+ assert(sampleNum < 16);
+ return RasterTileColorOffsets[sampleNum];
+}
+
+INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
+{
+ static const uint32_t RasterTileDepthOffsets[16]
+ { 0,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15,
+ };
+ assert(sampleNum < 16);
+ return RasterTileDepthOffsets[sampleNum];
+}
+
+INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
+{
+ static const uint32_t RasterTileStencilOffsets[16]
+ { 0,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15,
+ };
+ assert(sampleNum < 16);
+ return RasterTileStencilOffsets[sampleNum];
+}
+
template<typename T>
INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
{
inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Centroid behaves exactly as follows :
+// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
+// have a sample location there).
+// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
+// coverage with the SampleMask Rasterizer State.
+// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
+// evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the
+// SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+template<typename T>
+INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const coverageMask, const uint32_t sampleMask,
+ const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
+{
+ uint32_t inputMask[KNOB_SIMD_WIDTH];
+ generateInputCoverage<T>(coverageMask, inputMask, sampleMask);
+
+ // Case (2) - partially covered pixel
+
+ // scan for first covered sample per pixel in the 4x2 span
+ unsigned long sampleNum[KNOB_SIMD_WIDTH];
+ (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
+ (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
+ (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
+ (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
+ (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
+ (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
+ (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
+ (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
+
+ // look up and set the sample offsets from UL pixel corner for first covered sample
+ __m256 vXSample = _mm256_set_ps(T::MultisampleT::X(sampleNum[7]),
+ T::MultisampleT::X(sampleNum[6]),
+ T::MultisampleT::X(sampleNum[5]),
+ T::MultisampleT::X(sampleNum[4]),
+ T::MultisampleT::X(sampleNum[3]),
+ T::MultisampleT::X(sampleNum[2]),
+ T::MultisampleT::X(sampleNum[1]),
+ T::MultisampleT::X(sampleNum[0]));
+
+ __m256 vYSample = _mm256_set_ps(T::MultisampleT::Y(sampleNum[7]),
+ T::MultisampleT::Y(sampleNum[6]),
+ T::MultisampleT::Y(sampleNum[5]),
+ T::MultisampleT::Y(sampleNum[4]),
+ T::MultisampleT::Y(sampleNum[3]),
+ T::MultisampleT::Y(sampleNum[2]),
+ T::MultisampleT::Y(sampleNum[1]),
+ T::MultisampleT::Y(sampleNum[0]));
+ // add sample offset to UL pixel corner
+ vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
+ vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
+
+ // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
+ static const __m256i vFullyCoveredMask = T::MultisampleT::FullSampleMask();
+ __m256i vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
+ __m256i vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
+
+ static const __m256i vZero = _simd_setzero_si();
+ const __m256i vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
+ __m256i vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
+ __m256i vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
+ __m256i vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
+
+ __m256i vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
+
+ // set the centroid position based on results from above
+ psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
+ psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
+
+ // Case (3a) No samples covered and partial sample mask
+ __m256i vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
+ // sample mask should never be all 0's for this case, but handle it anyways
+ unsigned long firstCoveredSampleMaskSample = 0;
+ (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
+
+ __m256i vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
+
+ vXSample = _simd_set1_ps(T::MultisampleT::X(firstCoveredSampleMaskSample));
+ vYSample = _simd_set1_ps(T::MultisampleT::Y(firstCoveredSampleMaskSample));
+
+ // blend in case 3a pixel locations
+ psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
+ psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
+}
+
+template<typename T>
+INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
+ const uint64_t *const coverageMask, const uint32_t sampleMask,
+ const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
+{
+ if(T::bIsStandardPattern)
+ {
+ ///@ todo: don't need to generate input coverage 2x if input coverage and centroid
+ CalcCentroidPos<T>(psContext, coverageMask, sampleMask, vXSamplePosUL, vYSamplePosUL);
+ }
+ else
+ {
+ static const __m256 pixelCenter = _simd_set1_ps(0.5f);
+ psContext.vX.centroid = _simd_add_ps(vXSamplePosUL, pixelCenter);
+ psContext.vY.centroid = _simd_add_ps(vYSamplePosUL, pixelCenter);
+ }
+ // evaluate I,J
+ psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
+ psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
+ psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
+ psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
+
+ // interpolate 1/w
+ psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
+}
+
+template<typename T>
+INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
+{
+ // RT has to be single sample if we're in forcedMSAA mode
+ if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
+ {
+ return 1;
+ }
+ // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
+ else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
+ {
+ return GetNumSamples(blendSampleCount);
+ }
+ // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
+ else
+ {
+ return T::MultisampleT::numSamples;
+ }
+}
+
+template<typename T>
+struct PixelRateZTestLoop
+{
+ PixelRateZTestLoop(DRAW_CONTEXT *DC, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState,
+ uint8_t*& depthBase, uint8_t*& stencilBase, const uint8_t ClipDistanceMask) :
+ work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
+ clipDistanceMask(ClipDistanceMask), pDepthBase(depthBase), pStencilBase(stencilBase) {};
+
+ INLINE
+ uint32_t operator()(simdscalar& anyDepthSamplePassed, SWR_PS_CONTEXT& psContext,
+ const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0)
+ {
+ uint32_t statCount = 0;
+ for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
+ {
+ const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample];
+ vCoverageMask[sample] = vMask(pCoverageMask[currentSimdIn8x8] & MASK);
+
+ if(!_simd_movemask_ps(vCoverageMask[sample]))
+ {
+ vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
+ continue;
+ }
+
+ RDTSC_START(BEBarycentric);
+ // calculate per sample positions
+ psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
+ psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
+
+ // calc I & J per sample
+ CalcSampleBarycentrics(coeffs, psContext);
+
+ if(psState.writesODepth)
+ {
+ // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
+ vZ[sample] = psContext.vZ;
+ }
+ else
+ {
+ vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+ vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
+ }
+ RDTSC_STOP(BEBarycentric, 0, 0);
+
+ ///@todo: perspective correct vs non-perspective correct clipping?
+ // if clip distances are enabled, we need to interpolate for each sample
+ if(clipDistanceMask)
+ {
+ uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer,
+ psContext.vI.sample, psContext.vJ.sample);
+ vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
+ }
+
+ // offset depth/stencil buffers current sample
+ uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
+ uint8_t * pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
+
+ // ZTest for this sample
+ RDTSC_START(BEDepthBucket);
+ depthPassMask[sample] = vCoverageMask[sample];
+ stencilPassMask[sample] = vCoverageMask[sample];
+ depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, vZ[sample], pDepthSample,
+ vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]);
+ RDTSC_STOP(BEDepthBucket, 0, 0);
+
+ // early-exit if no pixels passed depth or earlyZ is forced on
+ if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
+ {
+ DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
+ pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
+
+ if(!_simd_movemask_ps(depthPassMask[sample]))
+ {
+ continue;
+ }
+ }
+ anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
+ uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
+ statCount += _mm_popcnt_u32(statMask);
+ }
+ // return number of samples that passed depth and coverage
+ return statCount;
+ }
+
+ // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
+ simdscalar vZ[T::MultisampleT::numCoverageSamples];
+ simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples];
+ simdscalar depthPassMask[T::MultisampleT::numCoverageSamples];
+ simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples];
+
+private:
+ // functor inputs
+ const SWR_TRIANGLE_DESC& work;
+ const BarycentricCoeffs& coeffs;
+ const API_STATE& state;
+ const SWR_PS_STATE& psState;
+ const uint8_t clipDistanceMask;
+ uint8_t*& pDepthBase;
+ uint8_t*& pStencilBase;
+};
+
+INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
+{
+ // evaluate I,J
+ psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
+ psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
+ psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
+ psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
+
+ // interpolate 1/w
+ psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
+}
+
+INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
+{
+ // evaluate I,J
+ psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
+ psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
+ psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
+ psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
+
+ // interpolate 1/w
+ psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
+}
+
+INLINE void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
+ const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT)
+{
+ // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
+ const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
+ simdvector blendOut;
+
+ for(uint32_t rt = 0; rt < NumRT; ++rt)
+ {
+ uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset;
+
+ const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
+ // pfnBlendFunc may not update all channels. Initialize with PS output.
+ /// TODO: move this into the blend JIT.
+ blendOut = psContext.shaded[rt];
+
+ // Blend outputs and update coverage mask for alpha test
+ if(pfnBlendFunc[rt] != nullptr)
+ {
+ pfnBlendFunc[rt](
+ pBlendState,
+ psContext.shaded[rt],
+ psContext.shaded[1],
+ sample,
+ pColorSample,
+ blendOut,
+ &psContext.oMask,
+ (simdscalari*)&coverageMask);
+ }
+
+ // final write mask
+ simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
+
+ ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
+ static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
+
+ const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
+
+ // store with color mask
+ if(!pRTBlend->writeDisableRed)
+ {
+ _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
+ }
+ if(!pRTBlend->writeDisableGreen)
+ {
+ _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
+ }
+ if(!pRTBlend->writeDisableBlue)
+ {
+ _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
+ }
+ if(!pRTBlend->writeDisableAlpha)
+ {
+ _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
+ }
+ }
+}
+
template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t samplePattern = SWR_MSAA_STANDARD_PATTERN,
- uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t odepth = 0>
+ uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0>
struct SwrBackendTraits
{
static const bool bIsStandardPattern = (samplePattern == SWR_MSAA_STANDARD_PATTERN);
static const bool bInputCoverage = (coverage == 1);
static const bool bCentroidPos = (centroid == 1);
static const bool bForcedSampleCount = (forced == 1);
- static const bool bWritesODepth = (odepth == 1);
+ static const bool bCanEarlyZ = (canEarlyZ == 1);
typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, (bIsStandardPattern) ? SWR_MSAA_STANDARD_PATTERN : SWR_MSAA_CENTER_PATTERN> MultisampleT;
-};
\ No newline at end of file
+};
struct BACKEND_FUNCS
{
PFN_BACKEND_FUNC pfnBackend;
- PFN_CALC_PIXEL_BARYCENTRICS pfnCalcPixelBarycentrics;
- PFN_CALC_SAMPLE_BARYCENTRICS pfnCalcSampleBarycentrics;
- PFN_CALC_CENTROID_BARYCENTRICS pfnCalcCentroidBarycentrics;
- PFN_OUTPUT_MERGER pfnOutputMerger;
};
-
// Draw State
struct DRAW_STATE
{
INLINE static float Y(uint32_t sampleNum) = delete;
INLINE static __m128i TileSampleOffsetsX() = delete;
INLINE static __m128i TileSampleOffsetsY() = delete;
- INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) = delete;
- INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) = delete;
- INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) = delete;
INLINE static simdscalari FullSampleMask() = delete;
static const uint32_t numSamples = 0;
return tileSampleOffsetY;
}
- INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
- {
- return 0;
- }
-
- INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
- {
- return 0;
- }
-
- INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
- {
- return 0;
- }
-
INLINE static simdscalari FullSampleMask(){return _simd_set1_epi32(0x1);};
static const uint32_t samplePosXi {0x80};
return _mm_set1_epi32(0x80);
}
- INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
- {
- return 0;
- }
-
- INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
- {
- return 0;
- }
-
- INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
- {
- return 0;
- }
-
INLINE static simdscalari FullSampleMask(){return _simd_set1_epi32(0x1);};
static const uint32_t numSamples = 1;
return tileSampleOffsetY;
}
- INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileColorOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8)
- };
- assert(sampleNum < numSamples);
- return RasterTileColorOffsets[sampleNum];
- }
-
- INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileDepthOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8)
- };
- assert(sampleNum < numSamples);
- return RasterTileDepthOffsets[sampleNum];
- }
-
- INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileStencilOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8)
- };
- assert(sampleNum < numSamples);
- return RasterTileStencilOffsets[sampleNum];
- }
-
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask =_simd_set1_epi32(0x3);
return _mm_set1_epi32(0x80);
}
- INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileColorOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8)
- };
- assert(sampleNum < numSamples);
- return RasterTileColorOffsets[sampleNum];
- }
-
- INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileDepthOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8)
- };
- assert(sampleNum < numSamples);
- return RasterTileDepthOffsets[sampleNum];
- }
-
- INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileStencilOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8)
- };
- assert(sampleNum < numSamples);
- return RasterTileStencilOffsets[sampleNum];
- }
-
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask =_simd_set1_epi32(0x3);
return tileSampleOffsetY;
}
- INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileColorOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
- };
- assert(sampleNum < numSamples);
- return RasterTileColorOffsets[sampleNum];
- }
-
- INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileDepthOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
- };
- assert(sampleNum < numSamples);
- return RasterTileDepthOffsets[sampleNum];
- }
-
- INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileStencilOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
- };
- assert(sampleNum < numSamples);
- return RasterTileStencilOffsets[sampleNum];
- }
-
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0xF);
return _mm_set1_epi32(0x80);
}
- INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileColorOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
- };
- assert(sampleNum < numSamples);
- return RasterTileColorOffsets[sampleNum];
- }
-
- INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileDepthOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
- };
- assert(sampleNum < numSamples);
- return RasterTileDepthOffsets[sampleNum];
- }
-
- INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileStencilOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
- };
- assert(sampleNum < numSamples);
- return RasterTileStencilOffsets[sampleNum];
- }
-
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0xF);
return tileSampleOffsetY;
}
- INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileColorOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
- };
- assert(sampleNum < numSamples);
- return RasterTileColorOffsets[sampleNum];
- }
-
- INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileDepthOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
- };
- assert(sampleNum < numSamples);
- return RasterTileDepthOffsets[sampleNum];
- }
-
- INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileStencilOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
- };
- assert(sampleNum < numSamples);
- return RasterTileStencilOffsets[sampleNum];
- }
-
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0xFF);
return _mm_set1_epi32(0x80);
}
- INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileColorOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
- };
- assert(sampleNum < numSamples);
- return RasterTileColorOffsets[sampleNum];
- }
-
- INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileDepthOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
- };
- assert(sampleNum < numSamples);
- return RasterTileDepthOffsets[sampleNum];
- }
-
- INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileStencilOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
- };
- assert(sampleNum < numSamples);
- return RasterTileStencilOffsets[sampleNum];
- }
-
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0xFF);
return tileSampleOffsetY;
}
- INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileColorOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15,
- };
- assert(sampleNum < numSamples);
- return RasterTileColorOffsets[sampleNum];
- }
-
- INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileDepthOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15,
- };
- assert(sampleNum < numSamples);
- return RasterTileDepthOffsets[sampleNum];
- }
-
- INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileStencilOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15,
- };
- assert(sampleNum < numSamples);
- return RasterTileStencilOffsets[sampleNum];
- }
-
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0xFFFF);
// BR, BL, UR, UL
return _mm_set1_epi32(0x80);
}
-
- INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileColorOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15,
- };
- assert(sampleNum < numSamples);
- return RasterTileColorOffsets[sampleNum];
- }
-
- INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileDepthOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15,
- };
- assert(sampleNum < numSamples);
- return RasterTileDepthOffsets[sampleNum];
- }
-
- INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
- {
- static const uint32_t RasterTileStencilOffsets[numSamples]
- { 0,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14,
- (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15,
- };
- assert(sampleNum < numSamples);
- return RasterTileStencilOffsets[sampleNum];
- }
-
+
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0xFFFF);
{ "BEBarycentric", "", false, 0xffffffff },
{ "BEEarlyDepthTest", "", false, 0xffffffff },
{ "BEPixelShader", "", false, 0xffffffff },
+ { "BESingleSampleBackend", "", false, 0xffffffff },
+ { "BEPixelRateBackend", "", false, 0xffffffff },
+ { "BESampleRateBackend", "", false, 0xffffffff },
+ { "BENullBackend", "", false, 0xffffffff },
{ "BELateDepthTest", "", false, 0xffffffff },
{ "BEOutputMerger", "", false, 0xffffffff },
{ "BEStoreTiles", "", true, 0xff00cccc },
BEBarycentric,
BEEarlyDepthTest,
BEPixelShader,
+ BESingleSampleBackend,
+ BEPixelRateBackend,
+ BESampleRateBackend,
+ BENullBackend,
BELateDepthTest,
BEOutputMerger,
BEStoreTiles,