From a646ffdacff1d8895c129b556fccc77d79f6c4a3 Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Thu, 21 Apr 2016 14:24:33 -0600 Subject: [PATCH] swr: [rasterizer core] more backend refactoring BackendPixelRate should be easier to read/maintain now hopefully. Small perf bump by moving some of the pfn's to inline functions without template params. Reviewed-by: Bruce Cherniak --- .../swr/rasterizer/common/rdtsc_buckets.cpp | 4 +- .../drivers/swr/rasterizer/core/api.cpp | 29 +- .../drivers/swr/rasterizer/core/backend.cpp | 689 ++++-------------- .../drivers/swr/rasterizer/core/backend.h | 394 +++++++++- .../drivers/swr/rasterizer/core/context.h | 5 - .../drivers/swr/rasterizer/core/multisample.h | 407 +---------- .../swr/rasterizer/core/rdtsc_core.cpp | 4 + .../drivers/swr/rasterizer/core/rdtsc_core.h | 4 + 8 files changed, 573 insertions(+), 963 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp index c6768b4c566..eb038b1d74d 100644 --- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp +++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp @@ -80,7 +80,9 @@ void BucketManager::PrintBucket(FILE* f, UINT level, uint64_t threadCycles, uint " |-> ", " |-> ", " |-> ", - " |-> " + " |-> ", + " |-> ", + " |-> ", }; // compute percent of total cycles used by this bucket diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index 3b02d197111..e3127923b6f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -755,14 +755,12 @@ void SetupMacroTileScissors(DRAW_CONTEXT *pDC) pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1; } } + // templated backend function tables extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX]; -extern PFN_BACKEND_FUNC gBackendSingleSample[2][2]; -extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2]; -extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2]; -extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS + 1][SWR_MULTISAMPLE_TYPE_MAX]; -extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2]; -extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2]; +extern PFN_BACKEND_FUNC gBackendSingleSample[2][2][2]; +extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2][2]; +extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2][2]; void SetupPipeline(DRAW_CONTEXT *pDC) { DRAW_STATE* pState = pDC->pState; @@ -775,13 +773,12 @@ void SetupPipeline(DRAW_CONTEXT *pDC) if (psState.pfnPixelShader == nullptr) { backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount]; - // always need to generate I & J per sample for Z interpolation - backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[1]; } else { const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount) ? 1 : 0; const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0; + const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesSourceDepth && !psState.usesUAV)) ? 1 : 0; // currently only support 'normal' input coverage SWR_ASSERT(psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL || @@ -797,35 +794,25 @@ void SetupPipeline(DRAW_CONTEXT *pDC) { // always need to generate I & J per sample for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); - backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount]; - backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount]; + backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount][canEarlyZ]; } else { // always need to generate I & J per pixel for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK); - backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid]; - backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][SWR_MULTISAMPLE_1X]; + backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid][canEarlyZ]; } break; case SWR_SHADING_RATE_SAMPLE: SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN); // always need to generate I & J per sample for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); - backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid]; - backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount]; + backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid][canEarlyZ]; break; default: SWR_ASSERT(0 && "Invalid shading rate"); break; } - - // setup pointer to function that generates necessary barycentrics required by the PS - bool bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_PIXEL_MASK) > 0 ? 1 : 0; - backendFuncs.pfnCalcPixelBarycentrics = gPixelBarycentricTable[bBarycentrics]; - - bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_SAMPLE_MASK) > 0 ? 1 : 0; - backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[bBarycentrics]; } PFN_PROCESS_PRIMS pfnBinner; diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp index 310a7edcde1..1d923ead6f1 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp @@ -29,7 +29,6 @@ #include -#include "rdtsc_core.h" #include "backend.h" #include "depthstencil.h" #include "tilemgr.h" @@ -459,221 +458,10 @@ simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscala return _simd_movemask_ps(vClipMask); } -template -INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext) -{ - if(bGenerateBarycentrics) - { - // evaluate I,J - psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center); - psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center); - psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet); - psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet); - - // interpolate 1/w - psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center); - } -} - -template -INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext) -{ - if(bGenerateBarycentrics) - { - // evaluate I,J - psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample); - psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample); - psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet); - psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet); - - // interpolate 1/w - psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample); - } -} - - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Centroid behaves exactly as follows : -// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to -// have a sample location there). -// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the -// coverage with the SampleMask Rasterizer State. -// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is -// evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the -// SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point. -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -template -INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const coverageMask, const uint32_t sampleMask, - const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL) -{ - uint32_t inputMask[KNOB_SIMD_WIDTH]; - generateInputCoverage(coverageMask, inputMask, sampleMask); - - // Case (2) - partially covered pixel - - // scan for first covered sample per pixel in the 4x2 span - unsigned long sampleNum[KNOB_SIMD_WIDTH]; - (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0); - (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0); - (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0); - (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0); - (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0); - (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0); - (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0); - (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0); - - // look up and set the sample offsets from UL pixel corner for first covered sample - __m256 vXSample = _mm256_set_ps(T::MultisampleT::X(sampleNum[7]), - T::MultisampleT::X(sampleNum[6]), - T::MultisampleT::X(sampleNum[5]), - T::MultisampleT::X(sampleNum[4]), - T::MultisampleT::X(sampleNum[3]), - T::MultisampleT::X(sampleNum[2]), - T::MultisampleT::X(sampleNum[1]), - T::MultisampleT::X(sampleNum[0])); - - __m256 vYSample = _mm256_set_ps(T::MultisampleT::Y(sampleNum[7]), - T::MultisampleT::Y(sampleNum[6]), - T::MultisampleT::Y(sampleNum[5]), - T::MultisampleT::Y(sampleNum[4]), - T::MultisampleT::Y(sampleNum[3]), - T::MultisampleT::Y(sampleNum[2]), - T::MultisampleT::Y(sampleNum[1]), - T::MultisampleT::Y(sampleNum[0])); - // add sample offset to UL pixel corner - vXSample = _simd_add_ps(vXSamplePosUL, vXSample); - vYSample = _simd_add_ps(vYSamplePosUL, vYSample); - - // Case (1) and case (3b) - All samples covered or not covered with full SampleMask - static const __m256i vFullyCoveredMask = T::MultisampleT::FullSampleMask(); - __m256i vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]); - __m256i vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask); - - static const __m256i vZero = _simd_setzero_si(); - const __m256i vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask); - __m256i vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero); - __m256i vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask); - __m256i vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask); - - __m256i vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b); - - // set the centroid position based on results from above - psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter)); - psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter)); - - // Case (3a) No samples covered and partial sample mask - __m256i vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask); - // sample mask should never be all 0's for this case, but handle it anyways - unsigned long firstCoveredSampleMaskSample = 0; - (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0); - - __m256i vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples); - - vXSample = _simd_set1_ps(T::MultisampleT::X(firstCoveredSampleMaskSample)); - vYSample = _simd_set1_ps(T::MultisampleT::Y(firstCoveredSampleMaskSample)); - - // blend in case 3a pixel locations - psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a)); - psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a)); -} - -template -INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext, - const uint64_t *const coverageMask, const uint32_t sampleMask, - const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL) -{ - if(T::bIsStandardPattern) - { - ///@ todo: don't need to generate input coverage 2x if input coverage and centroid - CalcCentroidPos(psContext, coverageMask, sampleMask, vXSamplePosUL, vYSamplePosUL); - } - else - { - static const __m256 pixelCenter = _simd_set1_ps(0.5f); - psContext.vX.centroid = _simd_add_ps(vXSamplePosUL, pixelCenter); - psContext.vY.centroid = _simd_add_ps(vYSamplePosUL, pixelCenter); - } - // evaluate I,J - psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid); - psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid); - psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet); - psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet); - - // interpolate 1/w - psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid); -} - -template -void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState, - const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask) -{ - // type safety guaranteed from template instantiation in BEChooser<>::GetFunc - static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT; - uint32_t rasterTileColorOffset = MultisampleTraits::RasterTileColorOffset(sample); - simdvector blendOut; - - for(uint32_t rt = 0; rt < NumRT; ++rt) - { - uint8_t *pColorSample; - if(sampleCount == SWR_MULTISAMPLE_1X) - { - pColorSample = pColorBase[rt]; - } - else - { - pColorSample = pColorBase[rt] + rasterTileColorOffset; - } - - const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; - // pfnBlendFunc may not update all channels. Initialize with PS output. - /// TODO: move this into the blend JIT. - blendOut = psContext.shaded[rt]; - - // Blend outputs and update coverage mask for alpha test - if(pfnBlendFunc[rt] != nullptr) - { - pfnBlendFunc[rt]( - pBlendState, - psContext.shaded[rt], - psContext.shaded[1], - sample, - pColorSample, - blendOut, - &psContext.oMask, - (simdscalari*)&coverageMask); - } - - // final write mask - simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask)); - - ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT. - static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format"); - - const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float); - - // store with color mask - if(!pRTBlend->writeDisableRed) - { - _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x); - } - if(!pRTBlend->writeDisableGreen) - { - _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y); - } - if(!pRTBlend->writeDisableBlue) - { - _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z); - } - if(!pRTBlend->writeDisableAlpha) - { - _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w); - } - } -} - template void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) { + RDTSC_START(BESingleSampleBackend); RDTSC_START(BESetup); SWR_CONTEXT *pContext = pDC->pContext; @@ -681,7 +469,6 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 const SWR_RASTSTATE& rastState = state.rastState; const SWR_PS_STATE *pPSState = &state.psState; const SWR_BLEND_STATE *pBlendState = &state.blendState; - const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; uint64_t coverageMask = work.coverageMask[0]; // broadcast scalars @@ -736,19 +523,19 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { - if(T::bInputCoverage) - { - generateInputCoverage(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask); - } - if(coverageMask & MASK) { - RDTSC_START(BEBarycentric); psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx)); // pixel center psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx)); - backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext); + if(T::bInputCoverage) + { + generateInputCoverage(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask); + } + + RDTSC_START(BEBarycentric); + CalcPixelBarycentrics(coeffs, psContext); if(T::bCentroidPos) { @@ -763,11 +550,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 // interpolate and quantize z psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); - RDTSC_STOP(BEBarycentric, 0, 0); simdmask clipCoverageMask = coverageMask & MASK; - // interpolate user clip distance if available if(rastState.clipDistanceMask) { @@ -780,7 +565,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 simdscalar stencilPassMask = vCoverageMask; // Early-Z? - if(CanEarlyZ(pPSState)) + if(T::bCanEarlyZ) { RDTSC_START(BEEarlyDepthTest); depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, @@ -812,7 +597,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 vCoverageMask = _simd_castsi_ps(psContext.activeMask); // late-Z - if(!CanEarlyZ(pPSState)) + if(!T::bCanEarlyZ) { RDTSC_START(BELateDepthTest); depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, @@ -834,8 +619,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 // output merger RDTSC_START(BEOutputMerger); - backendFuncs.pfnOutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc, - vCoverageMask, depthPassMask); + OutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets); // do final depth write after all pixel kills if (!pPSState->forceEarlyZ) @@ -859,11 +643,13 @@ Endtile: RDTSC_STOP(BEEndTile, 0, 0); } } + RDTSC_STOP(BESingleSampleBackend, 0, 0); } template void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) { + RDTSC_START(BESampleRateBackend); RDTSC_START(BESetup); SWR_CONTEXT *pContext = pDC->pContext; @@ -871,7 +657,6 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ const SWR_RASTSTATE& rastState = state.rastState; const SWR_PS_STATE *pPSState = &state.psState; const SWR_BLEND_STATE *pBlendState = &state.blendState; - const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; // broadcast scalars BarycentricCoeffs coeffs; @@ -915,7 +700,6 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ psContext.recipDet = work.recipDet; psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX; psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY; - const uint32_t numSamples = T::MultisampleT::numSamples; for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { @@ -931,7 +715,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx)); RDTSC_START(BEBarycentric); - backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext); + CalcPixelBarycentrics(coeffs, psContext); RDTSC_STOP(BEBarycentric, 0, 0); if(T::bInputCoverage) @@ -947,25 +731,21 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ RDTSC_STOP(BEBarycentric, 0, 0); } - for(uint32_t sample = 0; sample < numSamples; sample++) + for(uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++) { - if (work.coverageMask[sample] & MASK) + simdmask coverageMask = work.coverageMask[sample] & MASK; + if (coverageMask) { RDTSC_START(BEBarycentric); - // calculate per sample positions psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample)); psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample)); - - simdmask coverageMask = work.coverageMask[sample] & MASK; - simdscalar vCoverageMask = vMask(coverageMask); - backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext); + CalcSampleBarycentrics(coeffs, psContext); // interpolate and quantize z psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); - RDTSC_STOP(BEBarycentric, 0, 0); // interpolate user clip distance if available @@ -974,16 +754,17 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample); } - + + simdscalar vCoverageMask = vMask(coverageMask); simdscalar depthPassMask = vCoverageMask; simdscalar stencilPassMask = vCoverageMask; // offset depth/stencil buffers current sample - uint8_t *pDepthSample = pDepthBase + T::MultisampleT::RasterTileDepthOffset(sample); - uint8_t *pStencilSample = pStencilBase + T::MultisampleT::RasterTileStencilOffset(sample); + uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample); + uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample); // Early-Z? - if (CanEarlyZ(pPSState)) + if (T::bCanEarlyZ) { RDTSC_START(BEEarlyDepthTest); depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, @@ -1016,7 +797,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ vCoverageMask = _simd_castsi_ps(psContext.activeMask); // late-Z - if (!CanEarlyZ(pPSState)) + if (!T::bCanEarlyZ) { RDTSC_START(BELateDepthTest); depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, @@ -1040,8 +821,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ // output merger RDTSC_START(BEOutputMerger); - backendFuncs.pfnOutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, - vCoverageMask, depthPassMask); + OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets); // do final depth write after all pixel kills if (!pPSState->forceEarlyZ) @@ -1064,11 +844,13 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ RDTSC_STOP(BEEndTile, 0, 0); } } + RDTSC_STOP(BESampleRateBackend, 0, 0); } template void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) { + RDTSC_START(BEPixelRateBackend); RDTSC_START(BESetup); SWR_CONTEXT *pContext = pDC->pContext; @@ -1076,7 +858,6 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t const SWR_RASTSTATE& rastState = state.rastState; const SWR_PS_STATE *pPSState = &state.psState; const SWR_BLEND_STATE *pBlendState = &state.blendState; - const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; // broadcast scalars BarycentricCoeffs coeffs; @@ -1120,35 +901,25 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX; psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY; psContext.sampleIndex = 0; - - uint32_t numOMSamples; - // RT has to be single sample if we're in forcedMSAA mode - if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X)) - { - numOMSamples = 1; - } - // unless we're forced to single sample, in which case we run the OM at the sample count of the RT - else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X)) - { - numOMSamples = GetNumSamples(pBlendState->sampleCount); - } - // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count - else - { - numOMSamples = T::MultisampleT::numSamples; - } + PixelRateZTestLoop PixelRateZTest(pDC, work, coeffs, state, pDepthBase, pStencilBase, rastState.clipDistanceMask); + for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy)); psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy)); for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { - simdscalar vZ[T::MultisampleT::numSamples]{ 0 }; + if(!(work.anyCoveredSamples & MASK)) {goto Endtile;}; + psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx)); // set pixel center positions psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx)); + RDTSC_START(BEBarycentric); + CalcPixelBarycentrics(coeffs, psContext); + RDTSC_STOP(BEBarycentric, 0, 0); + if (T::bInputCoverage) { generateInputCoverage(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask); @@ -1162,201 +933,109 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t RDTSC_STOP(BEBarycentric, 0, 0); } - // if oDepth written to, or there is a potential to discard any samples, we need to - // run the PS early, then interp or broadcast Z and test - if(pPSState->writesODepth || pPSState->killsPixel) + simdscalar activeLanes; + if(T::bForcedSampleCount) { - RDTSC_START(BEBarycentric); - backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext); - - // interpolate and quantize z - psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); - psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); - RDTSC_STOP(BEBarycentric, 0, 0); + // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set + const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si())); + activeLanes = _simd_and_ps(vMask(work.anyCoveredSamples & MASK), vSampleMask); + } - // execute pixel shader - RDTSC_START(BEPixelShader); - state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); - RDTSC_STOP(BEPixelShader, 0, 0); + // Early-Z? + if(T::bCanEarlyZ && !T::bForcedSampleCount) + { + activeLanes = _simd_setzero_ps(); + uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest); + UPDATE_STAT(DepthPassCount, depthPassCount); } - else + // if we can't do early z, set the active mask to any samples covered in the current simd + else if(!T::bCanEarlyZ && !T::bForcedSampleCount) { - psContext.activeMask = _simd_set1_epi32(-1); + activeLanes = vMask(work.anyCoveredSamples & MASK); } - // need to declare enough space for all samples - simdscalar vCoverageMask[T::MultisampleT::numSamples]; - simdscalar depthPassMask[T::MultisampleT::numSamples]; - simdscalar stencilPassMask[T::MultisampleT::numSamples]; - simdscalar anyDepthSamplePassed = _simd_setzero_ps(); - simdscalar anyStencilSamplePassed = _simd_setzero_ps(); - for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++) + // if we have no covered samples that passed depth at this point, go to next tile + if(!_simd_movemask_ps(activeLanes)) { - vCoverageMask[sample] = vMask(work.coverageMask[sample] & MASK); - - // pull mask back out for any discards and and with coverage - vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_castsi_ps(psContext.activeMask)); - - if (!_simd_movemask_ps(vCoverageMask[sample])) - { - vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps(); - continue; - } - - if(T::bForcedSampleCount) - { - // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set - const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si())); - anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, _simd_and_ps(vCoverageMask[sample], vSampleMask)); - continue; - } - - depthPassMask[sample] = vCoverageMask[sample]; - - // if oDepth isn't written to, we need to interpolate Z for each sample - // if clip distances are enabled, we need to interpolate for each sample - if(!pPSState->writesODepth || rastState.clipDistanceMask) - { - RDTSC_START(BEBarycentric); - if(T::bIsStandardPattern) - { - // calculate per sample positions - psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample)); - psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample)); - } - else - { - psContext.vX.sample = psContext.vX.center; - psContext.vY.sample = psContext.vY.center; - } - - // calc I & J per sample - backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext); - - // interpolate and quantize z - if (!pPSState->writesODepth) - { - vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); - vZ[sample] = state.pfnQuantizeDepth(vZ[sample]); - } - - ///@todo: perspective correct vs non-perspective correct clipping? - // interpolate clip distances - if (rastState.clipDistanceMask) - { - uint8_t clipMask = ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer, - psContext.vI.sample, psContext.vJ.sample); - vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask)); - } - RDTSC_STOP(BEBarycentric, 0, 0); - } - // else 'broadcast' and test psContext.vZ written from the PS each sample - else - { - vZ[sample] = psContext.vZ; - } - - // offset depth/stencil buffers current sample - uint8_t *pDepthSample = pDepthBase + T::MultisampleT::RasterTileDepthOffset(sample); - uint8_t * pStencilSample = pStencilBase + T::MultisampleT::RasterTileStencilOffset(sample); - - // ZTest for this sample - RDTSC_START(BEEarlyDepthTest); - stencilPassMask[sample] = vCoverageMask[sample]; - depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, - vZ[sample], pDepthSample, vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]); - RDTSC_STOP(BEEarlyDepthTest, 0, 0); - - anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]); - anyStencilSamplePassed = _simd_or_ps(anyStencilSamplePassed, stencilPassMask[sample]); - uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]); - uint32_t statCount = _mm_popcnt_u32(statMask); - UPDATE_STAT(DepthPassCount, statCount); + goto Endtile; } - // if we didn't have to execute the PS early, and at least 1 sample passed the depth test, run the PS - if(!pPSState->writesODepth && !pPSState->killsPixel && _simd_movemask_ps(anyDepthSamplePassed)) + if(pPSState->usesSourceDepth) { RDTSC_START(BEBarycentric); - backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext); // interpolate and quantize z psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); RDTSC_STOP(BEBarycentric, 0, 0); + } - // execute pixel shader - RDTSC_START(BEPixelShader); - state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); - RDTSC_STOP(BEPixelShader, 0, 0); + // pixels that are currently active + psContext.activeMask = _simd_castps_si(activeLanes); + psContext.oMask = T::MultisampleT::FullSampleMask(); + + // execute pixel shader + RDTSC_START(BEPixelShader); + state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); + UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes))); + RDTSC_STOP(BEPixelShader, 0, 0); + + // update active lanes to remove any discarded or oMask'd pixels + activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si()))); + if(!_simd_movemask_ps(activeLanes)) + { + goto Endtile; } - ///@todo: make sure this works for kill pixel - else if(!_simd_movemask_ps(anyStencilSamplePassed)) + + // late-Z + if(!T::bCanEarlyZ && !T::bForcedSampleCount) + { + uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest); + UPDATE_STAT(DepthPassCount, depthPassCount); + } + + // if we have no covered samples that passed depth at this point, skip OM and go to next tile + if(!_simd_movemask_ps(activeLanes)) { goto Endtile; } + // output merger // loop over all samples, broadcasting the results of the PS to all passing pixels - for(uint32_t sample = 0; sample < numOMSamples; sample++) + for(uint32_t sample = 0; sample < GetNumOMSamples(pBlendState->sampleCount); sample++) { - uint8_t *pDepthSample = pDepthBase + T::MultisampleT::RasterTileDepthOffset(sample); - uint8_t * pStencilSample = pStencilBase + T::MultisampleT::RasterTileStencilOffset(sample); - - // output merger RDTSC_START(BEOutputMerger); - - // skip if none of the pixels for this sample passed - simdscalar coverageMaskSample; - simdscalar depthMaskSample; - simdscalar stencilMaskSample; - simdscalar vInterpolatedZ; - - // forcedSampleCount outputs to any pixels with covered samples not masked off by SampleMask - // depth test is disabled, so just set the z val to 0. + // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples + uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0; + simdscalar coverageMask, depthMask; if(T::bForcedSampleCount) { - coverageMaskSample = depthMaskSample = anyDepthSamplePassed; - vInterpolatedZ = _simd_setzero_ps(); - } - else if(T::bIsStandardPattern) - { - if(!_simd_movemask_ps(depthPassMask[sample])) - { - depthPassMask[sample] = _simd_setzero_ps(); - DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample], pDepthSample, depthPassMask[sample], - vCoverageMask[sample], pStencilSample, stencilPassMask[sample]); - continue; - } - coverageMaskSample = vCoverageMask[sample]; - depthMaskSample = depthPassMask[sample]; - stencilMaskSample = stencilPassMask[sample]; - vInterpolatedZ = vZ[sample]; + coverageMask = depthMask = activeLanes; } else { - // center pattern only needs to use a single depth test as all samples are at the same position - if(!_simd_movemask_ps(depthPassMask[0])) + coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum]; + depthMask = PixelRateZTest.depthPassMask[coverageSampleNum]; + if(!_simd_movemask_ps(depthMask)) { - depthPassMask[0] = _simd_setzero_ps(); - DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[0], pDepthSample, depthPassMask[0], - vCoverageMask[0], pStencilSample, stencilPassMask[0]); + // stencil should already have been written in early/lateZ tests + RDTSC_STOP(BEOutputMerger, 0, 0); continue; } - coverageMaskSample = (vCoverageMask[0]); - depthMaskSample = depthPassMask[0]; - stencilMaskSample = stencilPassMask[0]; - vInterpolatedZ = vZ[0]; } + + // broadcast the results of the PS to all passing pixels + OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, coverageMask, depthMask, pPSState->numRenderTargets); - // output merger - RDTSC_START(BEOutputMerger); - backendFuncs.pfnOutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, - coverageMaskSample, depthMaskSample); + if(!pPSState->forceEarlyZ && !T::bForcedSampleCount) + { + uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample); + uint8_t * pStencilSample = pStencilBase + RasterTileStencilOffset(sample); - DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vInterpolatedZ, pDepthSample, depthMaskSample, - coverageMaskSample, pStencilSample, stencilMaskSample); - RDTSC_STOP(BEOutputMerger, 0, 0); + DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum], + pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]); + } + RDTSC_STOP(BEOutputMerger, 0, 0); } - Endtile: RDTSC_START(BEEndTile); for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++) @@ -1364,6 +1043,7 @@ Endtile: work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); } + work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; @@ -1374,18 +1054,19 @@ Endtile: RDTSC_STOP(BEEndTile, 0, 0); } } + RDTSC_STOP(BEPixelRateBackend, 0, 0); } // optimized backend flow with NULL PS template void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) { + RDTSC_START(BENullBackend); ///@todo: handle center multisample pattern typedef SwrBackendTraits T; RDTSC_START(BESetup); SWR_CONTEXT *pContext = pDC->pContext; const API_STATE& state = GetApiState(pDC); - const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; const SWR_RASTSTATE& rastState = pDC->pState->state.rastState; // broadcast scalars @@ -1433,7 +1114,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, psContext.vX.sample = _simd_add_ps(vXSamplePosUL, T::MultisampleT::vX(sample)); psContext.vY.sample = _simd_add_ps(vYSamplePosUL, T::MultisampleT::vY(sample)); - backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext); + CalcSampleBarycentrics(coeffs, psContext); // interpolate and quantize z psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); @@ -1452,8 +1133,8 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, simdscalar stencilPassMask = vCoverageMask; // offset depth/stencil buffers current sample - uint8_t *pDepthSample = pDepthBase + T::MultisampleT::RasterTileDepthOffset(sample); - uint8_t *pStencilSample = pStencilBase + T::MultisampleT::RasterTileStencilOffset(sample); + uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample); + uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample); RDTSC_START(BEEarlyDepthTest); simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, @@ -1472,6 +1153,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; } } + RDTSC_STOP(BENullBackend, 0, 0); } void InitClearTilesTable() @@ -1486,57 +1168,21 @@ void InitClearTilesTable() } PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX]; -PFN_BACKEND_FUNC gBackendSingleSample[2][2] = {}; -PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2] = {}; -PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2] = {}; -PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX] = {}; -PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2] = {}; -PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2] = {}; - -// Recursive template used to auto-nest conditionals. Converts dynamic enum function -// arguments to static template arguments. -template -struct OMChooser -{ - // Last Arg Terminator - static PFN_OUTPUT_MERGER GetFunc(SWR_MULTISAMPLE_COUNT tArg) - { - switch(tArg) - { - case SWR_MULTISAMPLE_1X: return OutputMerger; break; - case SWR_MULTISAMPLE_2X: return OutputMerger; break; - case SWR_MULTISAMPLE_4X: return OutputMerger; break; - case SWR_MULTISAMPLE_8X: return OutputMerger; break; - case SWR_MULTISAMPLE_16X: return OutputMerger; break; - default: - SWR_ASSERT(0 && "Invalid sample count\n"); - return nullptr; - break; - } - } - - // Recursively parse args - template - static PFN_OUTPUT_MERGER GetFunc(uint32_t tArg, TArgsT... remainingArgs) - { - switch(tArg) - { - case 0: return OMChooser::GetFunc(remainingArgs...); break; - case 1: return OMChooser::GetFunc(remainingArgs...); break; - case 2: return OMChooser::GetFunc(remainingArgs...); break; - case 3: return OMChooser::GetFunc(remainingArgs...); break; - case 4: return OMChooser::GetFunc(remainingArgs...); break; - case 5: return OMChooser::GetFunc(remainingArgs...); break; - case 6: return OMChooser::GetFunc(remainingArgs...); break; - case 7: return OMChooser::GetFunc(remainingArgs...); break; - case 8: return OMChooser::GetFunc(remainingArgs...); break; - default: - SWR_ASSERT(0 && "Invalid RT index\n"); - return nullptr; - break; - } - } -}; +PFN_BACKEND_FUNC gBackendSingleSample[2] // input coverage + [2] // centroid + [2] // canEarlyZ + = {}; +PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX] + [SWR_MSAA_SAMPLE_PATTERN_MAX] + [SWR_INPUT_COVERAGE_MAX] + [2] // centroid + [2] // forcedSampleCount + [2] // canEarlyZ + = {}; +PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX] + [2] // centroid + [2] // canEarlyZ + = {}; // Recursive template used to auto-nest conditionals. Converts dynamic enum function // arguments to static template arguments. @@ -1604,83 +1250,72 @@ struct BEChooser } }; -template -void InitBackendOMFuncTable(PFN_OUTPUT_MERGER (&table)[numRenderTargets][numSampleRates]) +void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[2][2][2]) { - for(uint32_t rtNum = SWR_ATTACHMENT_COLOR0; rtNum < numRenderTargets; rtNum++) + for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < SWR_INPUT_COVERAGE_MAX; inputCoverage++) { - for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++) + for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++) { - table[rtNum][sampleCount] = - OMChooser<>::GetFunc((SWR_RENDERTARGET_ATTACHMENT)rtNum, (SWR_MULTISAMPLE_COUNT)sampleCount); + for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) + { + table[inputCoverage][isCentroid][canEarlyZ] = + BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), + (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE); + } } } } -template -void InitBackendBarycentricsTables(PFN_CALC_PIXEL_BARYCENTRICS (&pixelTable)[2], - PFN_CALC_SAMPLE_BARYCENTRICS (&sampleTable)[2]) +void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX] + [2][2][2]) { - pixelTable[0] = CalcPixelBarycentrics<0>; - pixelTable[1] = CalcPixelBarycentrics<1>; - - sampleTable[0] = CalcSampleBarycentrics<0>; - sampleTable[1] = CalcSampleBarycentrics<1>; -} - -void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[2][2]) -{ - gBackendSingleSample[0][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, false, false, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE); - gBackendSingleSample[0][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, false, true, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE); - gBackendSingleSample[1][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, true, false, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE); - gBackendSingleSample[1][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, true, true, false, false,(SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE); -} - -template -void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numSamplePatterns][numCoverageModes][2][2]) -{ - for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++) + for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_MAX; sampleCount++) { - for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < numSamplePatterns; samplePattern++) + for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < SWR_MSAA_SAMPLE_PATTERN_MAX; samplePattern++) { - for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < numCoverageModes; inputCoverage++) + for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < SWR_INPUT_COVERAGE_MAX; inputCoverage++) { for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++) { - table[sampleCount][samplePattern][inputCoverage][isCentroid][0] = - BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), (isCentroid > 0), - false, false, SWR_BACKEND_MSAA_PIXEL_RATE); - table[sampleCount][samplePattern][inputCoverage][isCentroid][1] = - BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), (isCentroid > 0), - true, false, SWR_BACKEND_MSAA_PIXEL_RATE); + for(uint32_t forcedSampleCount = 0; forcedSampleCount < 2; forcedSampleCount++) + { + for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) + { + table[sampleCount][samplePattern][inputCoverage][isCentroid][forcedSampleCount][canEarlyZ] = + BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), + (isCentroid > 0), (forcedSampleCount > 0), (canEarlyZ > 0), SWR_BACKEND_MSAA_PIXEL_RATE); + } + } } } } } } -template -void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numCoverageModes][2]) +void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2][2]) { - for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++) + for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_MAX; sampleCount++) { - for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < numCoverageModes; inputCoverage++) + for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < SWR_INPUT_COVERAGE_MAX; inputCoverage++) { - table[sampleCount][inputCoverage][0] = - BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), false, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE); - table[sampleCount][inputCoverage][1] = - BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), true, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE); + for(uint32_t centroid = 0; centroid < 2; centroid++) + { + for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) + { + table[sampleCount][inputCoverage][centroid][canEarlyZ] = + BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), + (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE); + } + } } } } void InitBackendFuncTables() { - InitBackendSampleFuncTable(gBackendSingleSample); - InitBackendPixelFuncTable<(SWR_MULTISAMPLE_COUNT)SWR_MULTISAMPLE_TYPE_MAX, SWR_MSAA_SAMPLE_PATTERN_MAX, SWR_INPUT_COVERAGE_MAX>(gBackendPixelRateTable); - InitBackendSampleFuncTable(gBackendSampleRateTable); - InitBackendOMFuncTable(gBackendOutputMergerTable); - InitBackendBarycentricsTables<(SWR_MULTISAMPLE_COUNT)(SWR_MULTISAMPLE_TYPE_MAX)>(gPixelBarycentricTable, gSampleBarycentricTable); + InitBackendSingleFuncTable(gBackendSingleSample); + InitBackendPixelFuncTable(gBackendPixelRateTable); + InitBackendSampleFuncTable(gBackendSampleRateTable); gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ; gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ; diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h index 022e60a9413..24ba69ec87a 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend.h @@ -31,6 +31,7 @@ #include "common/os.h" #include "core/context.h" #include "core/multisample.h" +#include "rdtsc_core.h" void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer); void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); @@ -43,6 +44,7 @@ void InitClearTilesTable(); simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ); void InitBackendFuncTables(); void InitCPSFuncTables(); +void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext); enum SWR_BACKEND_FUNCS { @@ -60,6 +62,78 @@ extern const __m256 vULOffsetsY; #define MASK 0xff #endif +INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) +{ + static const uint32_t RasterTileColorOffsets[16] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, + }; + assert(sampleNum < 16); + return RasterTileColorOffsets[sampleNum]; +} + +INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) +{ + static const uint32_t RasterTileDepthOffsets[16] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, + }; + assert(sampleNum < 16); + return RasterTileDepthOffsets[sampleNum]; +} + +INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) +{ + static const uint32_t RasterTileStencilOffsets[16] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, + }; + assert(sampleNum < 16); + return RasterTileStencilOffsets[sampleNum]; +} + template INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask) { @@ -209,14 +283,328 @@ INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &in inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0])); } +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Centroid behaves exactly as follows : +// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to +// have a sample location there). +// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the +// coverage with the SampleMask Rasterizer State. +// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is +// evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the +// SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point. +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +template +INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const coverageMask, const uint32_t sampleMask, + const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL) +{ + uint32_t inputMask[KNOB_SIMD_WIDTH]; + generateInputCoverage(coverageMask, inputMask, sampleMask); + + // Case (2) - partially covered pixel + + // scan for first covered sample per pixel in the 4x2 span + unsigned long sampleNum[KNOB_SIMD_WIDTH]; + (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0); + (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0); + (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0); + (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0); + (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0); + (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0); + (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0); + (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0); + + // look up and set the sample offsets from UL pixel corner for first covered sample + __m256 vXSample = _mm256_set_ps(T::MultisampleT::X(sampleNum[7]), + T::MultisampleT::X(sampleNum[6]), + T::MultisampleT::X(sampleNum[5]), + T::MultisampleT::X(sampleNum[4]), + T::MultisampleT::X(sampleNum[3]), + T::MultisampleT::X(sampleNum[2]), + T::MultisampleT::X(sampleNum[1]), + T::MultisampleT::X(sampleNum[0])); + + __m256 vYSample = _mm256_set_ps(T::MultisampleT::Y(sampleNum[7]), + T::MultisampleT::Y(sampleNum[6]), + T::MultisampleT::Y(sampleNum[5]), + T::MultisampleT::Y(sampleNum[4]), + T::MultisampleT::Y(sampleNum[3]), + T::MultisampleT::Y(sampleNum[2]), + T::MultisampleT::Y(sampleNum[1]), + T::MultisampleT::Y(sampleNum[0])); + // add sample offset to UL pixel corner + vXSample = _simd_add_ps(vXSamplePosUL, vXSample); + vYSample = _simd_add_ps(vYSamplePosUL, vYSample); + + // Case (1) and case (3b) - All samples covered or not covered with full SampleMask + static const __m256i vFullyCoveredMask = T::MultisampleT::FullSampleMask(); + __m256i vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]); + __m256i vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask); + + static const __m256i vZero = _simd_setzero_si(); + const __m256i vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask); + __m256i vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero); + __m256i vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask); + __m256i vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask); + + __m256i vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b); + + // set the centroid position based on results from above + psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter)); + psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter)); + + // Case (3a) No samples covered and partial sample mask + __m256i vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask); + // sample mask should never be all 0's for this case, but handle it anyways + unsigned long firstCoveredSampleMaskSample = 0; + (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0); + + __m256i vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples); + + vXSample = _simd_set1_ps(T::MultisampleT::X(firstCoveredSampleMaskSample)); + vYSample = _simd_set1_ps(T::MultisampleT::Y(firstCoveredSampleMaskSample)); + + // blend in case 3a pixel locations + psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a)); + psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a)); +} + +template +INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext, + const uint64_t *const coverageMask, const uint32_t sampleMask, + const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL) +{ + if(T::bIsStandardPattern) + { + ///@ todo: don't need to generate input coverage 2x if input coverage and centroid + CalcCentroidPos(psContext, coverageMask, sampleMask, vXSamplePosUL, vYSamplePosUL); + } + else + { + static const __m256 pixelCenter = _simd_set1_ps(0.5f); + psContext.vX.centroid = _simd_add_ps(vXSamplePosUL, pixelCenter); + psContext.vY.centroid = _simd_add_ps(vYSamplePosUL, pixelCenter); + } + // evaluate I,J + psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid); + psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid); + psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet); + psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet); + + // interpolate 1/w + psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid); +} + +template +INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount) +{ + // RT has to be single sample if we're in forcedMSAA mode + if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X)) + { + return 1; + } + // unless we're forced to single sample, in which case we run the OM at the sample count of the RT + else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X)) + { + return GetNumSamples(blendSampleCount); + } + // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count + else + { + return T::MultisampleT::numSamples; + } +} + +template +struct PixelRateZTestLoop +{ + PixelRateZTestLoop(DRAW_CONTEXT *DC, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState, + uint8_t*& depthBase, uint8_t*& stencilBase, const uint8_t ClipDistanceMask) : + work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState), + clipDistanceMask(ClipDistanceMask), pDepthBase(depthBase), pStencilBase(stencilBase) {}; + + INLINE + uint32_t operator()(simdscalar& anyDepthSamplePassed, SWR_PS_CONTEXT& psContext, + const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0) + { + uint32_t statCount = 0; + for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++) + { + const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample]; + vCoverageMask[sample] = vMask(pCoverageMask[currentSimdIn8x8] & MASK); + + if(!_simd_movemask_ps(vCoverageMask[sample])) + { + vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps(); + continue; + } + + RDTSC_START(BEBarycentric); + // calculate per sample positions + psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample)); + psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample)); + + // calc I & J per sample + CalcSampleBarycentrics(coeffs, psContext); + + if(psState.writesODepth) + { + // broadcast and test oDepth(psContext.vZ) written from the PS for each sample + vZ[sample] = psContext.vZ; + } + else + { + vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); + vZ[sample] = state.pfnQuantizeDepth(vZ[sample]); + } + RDTSC_STOP(BEBarycentric, 0, 0); + + ///@todo: perspective correct vs non-perspective correct clipping? + // if clip distances are enabled, we need to interpolate for each sample + if(clipDistanceMask) + { + uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, + psContext.vI.sample, psContext.vJ.sample); + vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask)); + } + + // offset depth/stencil buffers current sample + uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample); + uint8_t * pStencilSample = pStencilBase + RasterTileStencilOffset(sample); + + // ZTest for this sample + RDTSC_START(BEDepthBucket); + depthPassMask[sample] = vCoverageMask[sample]; + stencilPassMask[sample] = vCoverageMask[sample]; + depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, vZ[sample], pDepthSample, + vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]); + RDTSC_STOP(BEDepthBucket, 0, 0); + + // early-exit if no pixels passed depth or earlyZ is forced on + if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample])) + { + DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample], + pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]); + + if(!_simd_movemask_ps(depthPassMask[sample])) + { + continue; + } + } + anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]); + uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]); + statCount += _mm_popcnt_u32(statMask); + } + // return number of samples that passed depth and coverage + return statCount; + } + + // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite + simdscalar vZ[T::MultisampleT::numCoverageSamples]; + simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples]; + simdscalar depthPassMask[T::MultisampleT::numCoverageSamples]; + simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples]; + +private: + // functor inputs + const SWR_TRIANGLE_DESC& work; + const BarycentricCoeffs& coeffs; + const API_STATE& state; + const SWR_PS_STATE& psState; + const uint8_t clipDistanceMask; + uint8_t*& pDepthBase; + uint8_t*& pStencilBase; +}; + +INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext) +{ + // evaluate I,J + psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center); + psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center); + psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet); + psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet); + + // interpolate 1/w + psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center); +} + +INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext) +{ + // evaluate I,J + psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample); + psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample); + psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet); + psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet); + + // interpolate 1/w + psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample); +} + +INLINE void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState, + const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT) +{ + // type safety guaranteed from template instantiation in BEChooser<>::GetFunc + const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample); + simdvector blendOut; + + for(uint32_t rt = 0; rt < NumRT; ++rt) + { + uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset; + + const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; + // pfnBlendFunc may not update all channels. Initialize with PS output. + /// TODO: move this into the blend JIT. + blendOut = psContext.shaded[rt]; + + // Blend outputs and update coverage mask for alpha test + if(pfnBlendFunc[rt] != nullptr) + { + pfnBlendFunc[rt]( + pBlendState, + psContext.shaded[rt], + psContext.shaded[1], + sample, + pColorSample, + blendOut, + &psContext.oMask, + (simdscalari*)&coverageMask); + } + + // final write mask + simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask)); + + ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT. + static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format"); + + const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float); + + // store with color mask + if(!pRTBlend->writeDisableRed) + { + _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x); + } + if(!pRTBlend->writeDisableGreen) + { + _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y); + } + if(!pRTBlend->writeDisableBlue) + { + _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z); + } + if(!pRTBlend->writeDisableAlpha) + { + _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w); + } + } +} + template + uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0> struct SwrBackendTraits { static const bool bIsStandardPattern = (samplePattern == SWR_MSAA_STANDARD_PATTERN); static const bool bInputCoverage = (coverage == 1); static const bool bCentroidPos = (centroid == 1); static const bool bForcedSampleCount = (forced == 1); - static const bool bWritesODepth = (odepth == 1); + static const bool bCanEarlyZ = (canEarlyZ == 1); typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, (bIsStandardPattern) ? SWR_MSAA_STANDARD_PATTERN : SWR_MSAA_CENTER_PATTERN> MultisampleT; -}; \ No newline at end of file +}; diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index 540c690556a..03e583796de 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -357,13 +357,8 @@ typedef void(*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_C struct BACKEND_FUNCS { PFN_BACKEND_FUNC pfnBackend; - PFN_CALC_PIXEL_BARYCENTRICS pfnCalcPixelBarycentrics; - PFN_CALC_SAMPLE_BARYCENTRICS pfnCalcSampleBarycentrics; - PFN_CALC_CENTROID_BARYCENTRICS pfnCalcCentroidBarycentrics; - PFN_OUTPUT_MERGER pfnOutputMerger; }; - // Draw State struct DRAW_STATE { diff --git a/src/gallium/drivers/swr/rasterizer/core/multisample.h b/src/gallium/drivers/swr/rasterizer/core/multisample.h index c5096ed31c7..7213a386da3 100644 --- a/src/gallium/drivers/swr/rasterizer/core/multisample.h +++ b/src/gallium/drivers/swr/rasterizer/core/multisample.h @@ -65,9 +65,6 @@ struct MultisampleTraits INLINE static float Y(uint32_t sampleNum) = delete; INLINE static __m128i TileSampleOffsetsX() = delete; INLINE static __m128i TileSampleOffsetsY() = delete; - INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) = delete; - INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) = delete; - INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) = delete; INLINE static simdscalari FullSampleMask() = delete; static const uint32_t numSamples = 0; @@ -121,21 +118,6 @@ struct MultisampleTraits return tileSampleOffsetY; } - INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) - { - return 0; - } - - INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) - { - return 0; - } - - INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) - { - return 0; - } - INLINE static simdscalari FullSampleMask(){return _simd_set1_epi32(0x1);}; static const uint32_t samplePosXi {0x80}; @@ -185,21 +167,6 @@ struct MultisampleTraits return _mm_set1_epi32(0x80); } - INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) - { - return 0; - } - - INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) - { - return 0; - } - - INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) - { - return 0; - } - INLINE static simdscalari FullSampleMask(){return _simd_set1_epi32(0x1);}; static const uint32_t numSamples = 1; @@ -261,36 +228,6 @@ struct MultisampleTraits return tileSampleOffsetY; } - INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileColorOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) - }; - assert(sampleNum < numSamples); - return RasterTileColorOffsets[sampleNum]; - } - - INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileDepthOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) - }; - assert(sampleNum < numSamples); - return RasterTileDepthOffsets[sampleNum]; - } - - INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileStencilOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) - }; - assert(sampleNum < numSamples); - return RasterTileStencilOffsets[sampleNum]; - } - INLINE static simdscalari FullSampleMask() { static const simdscalari mask =_simd_set1_epi32(0x3); @@ -344,36 +281,6 @@ struct MultisampleTraits return _mm_set1_epi32(0x80); } - INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileColorOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) - }; - assert(sampleNum < numSamples); - return RasterTileColorOffsets[sampleNum]; - } - - INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileDepthOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) - }; - assert(sampleNum < numSamples); - return RasterTileDepthOffsets[sampleNum]; - } - - INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileStencilOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) - }; - assert(sampleNum < numSamples); - return RasterTileStencilOffsets[sampleNum]; - } - INLINE static simdscalari FullSampleMask() { static const simdscalari mask =_simd_set1_epi32(0x3); @@ -442,42 +349,6 @@ struct MultisampleTraits return tileSampleOffsetY; } - INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileColorOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, - }; - assert(sampleNum < numSamples); - return RasterTileColorOffsets[sampleNum]; - } - - INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileDepthOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, - }; - assert(sampleNum < numSamples); - return RasterTileDepthOffsets[sampleNum]; - } - - INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileStencilOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, - }; - assert(sampleNum < numSamples); - return RasterTileStencilOffsets[sampleNum]; - } - INLINE static simdscalari FullSampleMask() { static const simdscalari mask = _simd_set1_epi32(0xF); @@ -531,42 +402,6 @@ struct MultisampleTraits return _mm_set1_epi32(0x80); } - INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileColorOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, - }; - assert(sampleNum < numSamples); - return RasterTileColorOffsets[sampleNum]; - } - - INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileDepthOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, - }; - assert(sampleNum < numSamples); - return RasterTileDepthOffsets[sampleNum]; - } - - INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileStencilOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, - }; - assert(sampleNum < numSamples); - return RasterTileStencilOffsets[sampleNum]; - } - INLINE static simdscalari FullSampleMask() { static const simdscalari mask = _simd_set1_epi32(0xF); @@ -639,54 +474,6 @@ struct MultisampleTraits return tileSampleOffsetY; } - INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileColorOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, - }; - assert(sampleNum < numSamples); - return RasterTileColorOffsets[sampleNum]; - } - - INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileDepthOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, - }; - assert(sampleNum < numSamples); - return RasterTileDepthOffsets[sampleNum]; - } - - INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileStencilOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, - }; - assert(sampleNum < numSamples); - return RasterTileStencilOffsets[sampleNum]; - } - INLINE static simdscalari FullSampleMask() { static const simdscalari mask = _simd_set1_epi32(0xFF); @@ -740,54 +527,6 @@ struct MultisampleTraits return _mm_set1_epi32(0x80); } - INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileColorOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, - }; - assert(sampleNum < numSamples); - return RasterTileColorOffsets[sampleNum]; - } - - INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileDepthOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, - }; - assert(sampleNum < numSamples); - return RasterTileDepthOffsets[sampleNum]; - } - - INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileStencilOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, - }; - assert(sampleNum < numSamples); - return RasterTileStencilOffsets[sampleNum]; - } - INLINE static simdscalari FullSampleMask() { static const simdscalari mask = _simd_set1_epi32(0xFF); @@ -868,78 +607,6 @@ struct MultisampleTraits return tileSampleOffsetY; } - INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileColorOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, - }; - assert(sampleNum < numSamples); - return RasterTileColorOffsets[sampleNum]; - } - - INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileDepthOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, - }; - assert(sampleNum < numSamples); - return RasterTileDepthOffsets[sampleNum]; - } - - INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileStencilOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, - }; - assert(sampleNum < numSamples); - return RasterTileStencilOffsets[sampleNum]; - } - INLINE static simdscalari FullSampleMask() { static const simdscalari mask = _simd_set1_epi32(0xFFFF); @@ -992,79 +659,7 @@ struct MultisampleTraits // BR, BL, UR, UL return _mm_set1_epi32(0x80); } - - INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileColorOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, - }; - assert(sampleNum < numSamples); - return RasterTileColorOffsets[sampleNum]; - } - - INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileDepthOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, - }; - assert(sampleNum < numSamples); - return RasterTileDepthOffsets[sampleNum]; - } - - INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) - { - static const uint32_t RasterTileStencilOffsets[numSamples] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, - }; - assert(sampleNum < numSamples); - return RasterTileStencilOffsets[sampleNum]; - } - + INLINE static simdscalari FullSampleMask() { static const simdscalari mask = _simd_set1_epi32(0xFFFF); diff --git a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp index 4b6b536075b..df8bad32023 100644 --- a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp @@ -77,6 +77,10 @@ BUCKET_DESC gCoreBuckets[] = { { "BEBarycentric", "", false, 0xffffffff }, { "BEEarlyDepthTest", "", false, 0xffffffff }, { "BEPixelShader", "", false, 0xffffffff }, + { "BESingleSampleBackend", "", false, 0xffffffff }, + { "BEPixelRateBackend", "", false, 0xffffffff }, + { "BESampleRateBackend", "", false, 0xffffffff }, + { "BENullBackend", "", false, 0xffffffff }, { "BELateDepthTest", "", false, 0xffffffff }, { "BEOutputMerger", "", false, 0xffffffff }, { "BEStoreTiles", "", true, 0xff00cccc }, diff --git a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h index 5fcc40bf8ee..e1dde61b386 100644 --- a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h +++ b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h @@ -82,6 +82,10 @@ enum CORE_BUCKETS BEBarycentric, BEEarlyDepthTest, BEPixelShader, + BESingleSampleBackend, + BEPixelRateBackend, + BESampleRateBackend, + BENullBackend, BELateDepthTest, BEOutputMerger, BEStoreTiles, -- 2.30.2