From d2759c1eb3b77e9d86c52f2f8e6471a8f339228d Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Wed, 15 Feb 2017 13:45:16 -0800 Subject: [PATCH] swr: [rasterizer core/scripts] Autogen backend initialization function(s) Autogen functions that instantiates different BackendPixelRate templates. Functions get split into separate files after reaching a user defined threshold (currently 512 per file) to speed up compilation. This change will enable the addition of more template flags in the pixel back end. Reviewed-by: Bruce Cherniak --- src/gallium/drivers/swr/.gitignore | 1 + src/gallium/drivers/swr/Makefile.am | 22 +- src/gallium/drivers/swr/SConscript | 13 + .../drivers/swr/rasterizer/core/backend.cpp | 226 +----------------- .../drivers/swr/rasterizer/core/backend.h | 199 +++++++++++++++ .../swr/rasterizer/scripts/gen_backends.py | 125 ++++++++++ .../scripts/templates/backend_template.cpp | 38 +++ 7 files changed, 398 insertions(+), 226 deletions(-) create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/gen_backends.py create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/templates/backend_template.cpp diff --git a/src/gallium/drivers/swr/.gitignore b/src/gallium/drivers/swr/.gitignore index 172f3bfbcb7..b6c5faa110a 100644 --- a/src/gallium/drivers/swr/.gitignore +++ b/src/gallium/drivers/swr/.gitignore @@ -10,3 +10,4 @@ rasterizer/jitter/builder_x86.h rasterizer/jitter/state_llvm.h rasterizer/scripts/gen_knobs.cpp rasterizer/scripts/gen_knobs.h +rasterizer/core/BackendPixelRate0.cpp diff --git a/src/gallium/drivers/swr/Makefile.am b/src/gallium/drivers/swr/Makefile.am index b22ded0a191..c67eadc05df 100644 --- a/src/gallium/drivers/swr/Makefile.am +++ b/src/gallium/drivers/swr/Makefile.am @@ -61,7 +61,8 @@ BUILT_SOURCES = \ rasterizer/archrast/gen_ar_event.h \ rasterizer/archrast/gen_ar_event.cpp \ rasterizer/archrast/gen_ar_eventhandler.h \ - rasterizer/archrast/gen_ar_eventhandlerfile.h + rasterizer/archrast/gen_ar_eventhandlerfile.h \ + 
rasterizer/core/BackendPixelRate0.cpp MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D) PYTHON_GEN = $(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS) @@ -156,6 +157,21 @@ rasterizer/archrast/gen_ar_eventhandlerfile.h: rasterizer/scripts/gen_archrast.p --output rasterizer/archrast/gen_ar_eventhandlerfile.h \ --gen_eventhandlerfile_h +# 5 SWR_MULTISAMPLE_TYPE_COUNT +# 2 SWR_MSAA_SAMPLE_PATTERN_COUNT +# 3 SWR_INPUT_COVERAGE_COUNT +# 2 centroid +# 2 forcedSampleCount +# 2 canEarlyZ +rasterizer/core/BackendPixelRate0.cpp: rasterizer/scripts/gen_backends.py rasterizer/scripts/templates/backend_template.cpp + $(MKDIR_GEN) + $(PYTHON_GEN) \ + $(srcdir)/rasterizer/scripts/gen_backends.py \ + --outdir rasterizer/core \ + --dim 5 2 3 2 2 2 \ + --split 0 \ + --cpp + COMMON_LIBADD = \ $(top_builddir)/src/gallium/auxiliary/libgallium.la \ $(top_builddir)/src/mesa/libmesagallium.la \ @@ -250,6 +266,7 @@ EXTRA_DIST = \ rasterizer/jitter/scripts/gen_llvm_ir_macros.py \ rasterizer/jitter/scripts/gen_llvm_types.py \ rasterizer/scripts/gen_archrast.py \ + rasterizer/scripts/gen_backends.py \ rasterizer/scripts/gen_knobs.py \ rasterizer/scripts/knob_defs.py \ rasterizer/scripts/mako/ast.py \ @@ -273,4 +290,5 @@ EXTRA_DIST = \ rasterizer/scripts/templates/ar_event_h.template \ rasterizer/scripts/templates/ar_event_cpp.template \ rasterizer/scripts/templates/ar_eventhandler_h.template \ - rasterizer/scripts/templates/ar_eventhandlerfile_h.template + rasterizer/scripts/templates/ar_eventhandlerfile_h.template \ + rasterizer/scripts/templates/backend_template.cpp diff --git a/src/gallium/drivers/swr/SConscript b/src/gallium/drivers/swr/SConscript index c0677afc97f..dafeb9229d7 100644 --- a/src/gallium/drivers/swr/SConscript +++ b/src/gallium/drivers/swr/SConscript @@ -132,12 +132,25 @@ env.CodeGenerate( command = python_cmd + ' $SCRIPT --proto $SOURCE --output $TARGET --gen_eventhandlerfile_h' ) +# 5 SWR_MULTISAMPLE_TYPE_COUNT +# 2 SWR_MSAA_SAMPLE_PATTERN_COUNT +# 3 SWR_INPUT_COVERAGE_COUNT +# 2 centroid +# 2 
forcedSampleCount +# 2 canEarlyZ +env.CodeGenerate( + target = 'rasterizer/core/BackendPixelRate0.cpp', + script = swrroot + 'rasterizer/scripts/gen_backends.py', + command = python_cmd + ' $SCRIPT --outdir rasterizer/core --dim 5 2 3 2 2 2 --split 0 --cpp' +) + # Auto-generated .cpp files (that need to generate object files) built_sources = [ 'rasterizer/scripts/gen_knobs.cpp', 'rasterizer/jitter/builder_gen.cpp', 'rasterizer/jitter/builder_x86.cpp', 'rasterizer/archrast/gen_ar_event.cpp', + 'rasterizer/core/BackendPixelRate0.cpp', ] source = built_sources diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp index 3f63b28e71c..032a2206d62 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp @@ -30,7 +30,6 @@ #include #include "backend.h" -#include "depthstencil.h" #include "tilemgr.h" #include "memory/tilingtraits.h" #include "core/multisample.h" @@ -862,203 +861,6 @@ Endtile: AR_END(BESampleRateBackend, 0); } - -template -void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) -{ - SWR_CONTEXT *pContext = pDC->pContext; - - AR_BEGIN(BEPixelRateBackend, pDC->drawId); - AR_BEGIN(BESetup, pDC->drawId); - - const API_STATE &state = GetApiState(pDC); - - BarycentricCoeffs coeffs; - SetupBarycentricCoeffs(&coeffs, work); - - uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer; - SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers); - - SWR_PS_CONTEXT psContext; - SetupPixelShaderContext(&psContext, work); - - AR_END(BESetup, 0); - - PixelRateZTestLoop PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask); - - psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast(y))); - psContext.vY.center 
= _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast(y))); - - const simdscalar dy = _simd_set1_ps(static_cast(SIMD_TILE_Y_DIM)); - - for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) - { - psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast(x))); - psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast(x))); - - const simdscalar dx = _simd_set1_ps(static_cast(SIMD_TILE_X_DIM)); - - for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) - { -#if USE_8x2_TILE_BACKEND - const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); - -#endif - simdscalar activeLanes; - if(!(work.anyCoveredSamples & MASK)) {goto Endtile;}; - activeLanes = vMask(work.anyCoveredSamples & MASK); - - if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) - { - const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0]; - - generateInputCoverage(pCoverageMask, psContext.inputMask, state.blendState.sampleMask); - } - - AR_BEGIN(BEBarycentric, pDC->drawId); - - CalcPixelBarycentrics(coeffs, psContext); - - CalcCentroid(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask); - - AR_END(BEBarycentric, 0); - - if(T::bForcedSampleCount) - { - // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set - const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si())); - activeLanes = _simd_and_ps(activeLanes, vSampleMask); - } - - // Early-Z? 
- if(T::bCanEarlyZ && !T::bForcedSampleCount) - { - uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest); - UPDATE_STAT_BE(DepthPassCount, depthPassCount); - AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes))); - } - - // if we have no covered samples that passed depth at this point, go to next tile - if(!_simd_movemask_ps(activeLanes)) { goto Endtile; }; - - if(state.psState.usesSourceDepth) - { - AR_BEGIN(BEBarycentric, pDC->drawId); - // interpolate and quantize z - psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); - psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); - AR_END(BEBarycentric, 0); - } - - // pixels that are currently active - psContext.activeMask = _simd_castps_si(activeLanes); - psContext.oMask = T::MultisampleT::FullSampleMask(); - - // execute pixel shader - AR_BEGIN(BEPixelShader, pDC->drawId); - state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); - UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes))); - AR_END(BEPixelShader, 0); - - // update active lanes to remove any discarded or oMask'd pixels - activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si()))); - if(!_simd_movemask_ps(activeLanes)) { goto Endtile; }; - - // late-Z - if(!T::bCanEarlyZ && !T::bForcedSampleCount) - { - uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest); - UPDATE_STAT_BE(DepthPassCount, depthPassCount); - AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes))); - } - - // if we have no covered samples that passed depth at this point, skip OM and go to next tile - if(!_simd_movemask_ps(activeLanes)) { goto Endtile; }; - - // output merger - // loop over all samples, broadcasting the results of the PS to all passing pixels - for(uint32_t sample = 0; sample < GetNumOMSamples(state.blendState.sampleCount); 
sample++) - { - AR_BEGIN(BEOutputMerger, pDC->drawId); - // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples - uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0; - simdscalar coverageMask, depthMask; - if(T::bForcedSampleCount) - { - coverageMask = depthMask = activeLanes; - } - else - { - coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum]; - depthMask = PixelRateZTest.depthPassMask[coverageSampleNum]; - if(!_simd_movemask_ps(depthMask)) - { - // stencil should already have been written in early/lateZ tests - AR_END(BEOutputMerger, 0); - continue; - } - } - - // broadcast the results of the PS to all passing pixels -#if USE_8x2_TILE_BACKEND - OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset); -#else - OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets); -#endif - - if(!state.psState.forceEarlyZ && !T::bForcedSampleCount) - { - uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); - uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); - - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum], - pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]); - } - AR_END(BEOutputMerger, 0); - } -Endtile: - AR_BEGIN(BEEndTile, pDC->drawId); - - for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++) - { - work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - } - - if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) - { - work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - } - work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * 
SIMD_TILE_X_DIM); - -#if USE_8x2_TILE_BACKEND - if (useAlternateOffset) - { - for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) - { - pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; - } - } -#else - for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) - { - pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; - } - pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; - pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; -#endif - - AR_END(BEEndTile, 0); - - psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); - psContext.vX.center = _simd_add_ps(psContext.vX.center, dx); - } - - psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); - psContext.vY.center = _simd_add_ps(psContext.vY.center, dy); - } - - AR_END(BEPixelRateBackend, 0); -} // optimized backend flow with NULL PS template void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) @@ -1302,31 +1104,6 @@ void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COU } } -void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2][2]) -{ - for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++) - { - for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < SWR_MSAA_SAMPLE_PATTERN_COUNT; samplePattern++) - { - for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++) - { - for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++) - { - for(uint32_t forcedSampleCount = 0; forcedSampleCount < 2; forcedSampleCount++) - { - for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) - { - table[sampleCount][samplePattern][inputCoverage][isCentroid][forcedSampleCount][canEarlyZ] = - BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, 
(SWR_MSAA_SAMPLE_PATTERN)samplePattern, (SWR_INPUT_COVERAGE)inputCoverage, - (isCentroid > 0), (forcedSampleCount > 0), (canEarlyZ > 0), SWR_BACKEND_MSAA_PIXEL_RATE); - } - } - } - } - } - } -} - void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]) { for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++) @@ -1346,10 +1123,11 @@ void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_C } } +void InitBackendPixelRate0(); void InitBackendFuncTables() { InitBackendSingleFuncTable(gBackendSingleSample); - InitBackendPixelFuncTable(gBackendPixelRateTable); + InitBackendPixelRate0(); InitBackendSampleFuncTable(gBackendSampleRateTable); gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ; diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h index 80ee1defdad..c3585cc930c 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend.h @@ -31,6 +31,7 @@ #include "common/os.h" #include "core/context.h" #include "core/multisample.h" +#include "depthstencil.h" #include "rdtsc_core.h" void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer); @@ -835,6 +836,204 @@ INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SW } #endif + +template +void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) +{ + SWR_CONTEXT *pContext = pDC->pContext; + + AR_BEGIN(BEPixelRateBackend, pDC->drawId); + AR_BEGIN(BESetup, pDC->drawId); + + const API_STATE &state = GetApiState(pDC); + + BarycentricCoeffs coeffs; + SetupBarycentricCoeffs(&coeffs, work); + + uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer; + 
SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers); + + SWR_PS_CONTEXT psContext; + SetupPixelShaderContext(&psContext, work); + + AR_END(BESetup, 0); + + PixelRateZTestLoop PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask); + + psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast(y))); + psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast(y))); + + const simdscalar dy = _simd_set1_ps(static_cast(SIMD_TILE_Y_DIM)); + + for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) + { + psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast(x))); + psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast(x))); + + const simdscalar dx = _simd_set1_ps(static_cast(SIMD_TILE_X_DIM)); + + for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) + { +#if USE_8x2_TILE_BACKEND + const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); + +#endif + simdscalar activeLanes; + if(!(work.anyCoveredSamples & MASK)) {goto Endtile;}; + activeLanes = vMask(work.anyCoveredSamples & MASK); + + if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) + { + const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? 
&work.innerCoverageMask : &work.coverageMask[0]; + + generateInputCoverage(pCoverageMask, psContext.inputMask, state.blendState.sampleMask); + } + + AR_BEGIN(BEBarycentric, pDC->drawId); + + CalcPixelBarycentrics(coeffs, psContext); + + CalcCentroid(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask); + + AR_END(BEBarycentric, 0); + + if(T::bForcedSampleCount) + { + // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set + const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si())); + activeLanes = _simd_and_ps(activeLanes, vSampleMask); + } + + // Early-Z? + if(T::bCanEarlyZ && !T::bForcedSampleCount) + { + uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest); + UPDATE_STAT_BE(DepthPassCount, depthPassCount); + AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes))); + } + + // if we have no covered samples that passed depth at this point, go to next tile + if(!_simd_movemask_ps(activeLanes)) { goto Endtile; }; + + if(state.psState.usesSourceDepth) + { + AR_BEGIN(BEBarycentric, pDC->drawId); + // interpolate and quantize z + psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); + psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); + AR_END(BEBarycentric, 0); + } + + // pixels that are currently active + psContext.activeMask = _simd_castps_si(activeLanes); + psContext.oMask = T::MultisampleT::FullSampleMask(); + + // execute pixel shader + AR_BEGIN(BEPixelShader, pDC->drawId); + state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); + UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes))); + AR_END(BEPixelShader, 0); + + // update active lanes to remove any discarded or oMask'd pixels + activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, 
_simd_setzero_si()))); + if(!_simd_movemask_ps(activeLanes)) { goto Endtile; }; + + // late-Z + if(!T::bCanEarlyZ && !T::bForcedSampleCount) + { + uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest); + UPDATE_STAT_BE(DepthPassCount, depthPassCount); + AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes))); + } + + // if we have no covered samples that passed depth at this point, skip OM and go to next tile + if(!_simd_movemask_ps(activeLanes)) { goto Endtile; }; + + // output merger + // loop over all samples, broadcasting the results of the PS to all passing pixels + for(uint32_t sample = 0; sample < GetNumOMSamples(state.blendState.sampleCount); sample++) + { + AR_BEGIN(BEOutputMerger, pDC->drawId); + // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples + uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0; + simdscalar coverageMask, depthMask; + if(T::bForcedSampleCount) + { + coverageMask = depthMask = activeLanes; + } + else + { + coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum]; + depthMask = PixelRateZTest.depthPassMask[coverageSampleNum]; + if(!_simd_movemask_ps(depthMask)) + { + // stencil should already have been written in early/lateZ tests + AR_END(BEOutputMerger, 0); + continue; + } + } + + // broadcast the results of the PS to all passing pixels +#if USE_8x2_TILE_BACKEND + OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset); +#else + OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets); +#endif + + if(!state.psState.forceEarlyZ && !T::bForcedSampleCount) + { + uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); + uint8_t * pStencilSample = pStencilBuffer + 
RasterTileStencilOffset(sample); + + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum], + pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]); + } + AR_END(BEOutputMerger, 0); + } +Endtile: + AR_BEGIN(BEEndTile, pDC->drawId); + + for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++) + { + work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + } + + if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) + { + work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + } + work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + +#if USE_8x2_TILE_BACKEND + if (useAlternateOffset) + { + for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) + { + pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; + } + } +#else + for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) + { + pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; + } + pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; + pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; +#endif + + AR_END(BEEndTile, 0); + + psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); + psContext.vX.center = _simd_add_ps(psContext.vX.center, dx); + } + + psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); + psContext.vY.center = _simd_add_ps(psContext.vY.center, dy); + } + + AR_END(BEPixelRateBackend, 0); +} + template struct SwrBackendTraits diff --git a/src/gallium/drivers/swr/rasterizer/scripts/gen_backends.py b/src/gallium/drivers/swr/rasterizer/scripts/gen_backends.py new file mode 100644 index 00000000000..cbbc3780a68 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/gen_backends.py @@ -0,0 +1,125 @@ +# Copyright (C) 2017 Intel Corporation. All Rights Reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. 
+ +# Python source +# Compatible with Python2.X and Python3.X + +from __future__ import print_function +import itertools +import math +import argparse +import os +import sys +from mako.template import Template +from mako.exceptions import RichTraceback + +def write_template_to_string(template_filename, **kwargs): + try: + template = Template(filename=os.path.abspath(template_filename)) + # Split + Join fixes line-endings for whatever platform you are using + return '\n'.join(template.render(**kwargs).splitlines()) + except: + traceback = RichTraceback() + for (filename, lineno, function, line) in traceback.traceback: + print("File %s, line %s, in %s" % (filename, lineno, function)) + print(line, "\n") + print("%s: %s" % (str(traceback.error.__class__.__name__), traceback.error)) + +def write_template_to_file(template_filename, output_filename, **kwargs): + output_dirname = os.path.dirname(output_filename) + if not os.path.exists(output_dirname): + os.makedirs(output_dirname) + with open(output_filename, "w") as outfile: + print(write_template_to_string(template_filename, **kwargs), file=outfile) + + +def main(args=sys.argv[1:]): + thisDir = os.path.dirname(os.path.realpath(__file__)) + parser = argparse.ArgumentParser(description="Generate files and initialization functions for all permutations of BackendPixelRate.") + parser.add_argument('--dim', help="gBackendPixelRateTable array dimensions", nargs='+', type=int, required=True) + parser.add_argument('--outdir', help="output directory", nargs='?', type=str, default=thisDir) + parser.add_argument('--split', help="how many lines of initialization per file [0=no split]", nargs='?', type=int, default=512) + parser.add_argument('--cpp', help="Generate cpp file(s)", action='store_true', default=False) + parser.add_argument('--cmake', help="Generate cmake file", action='store_true', default=False) + + + args = parser.parse_args(args) + + output_list = [] + for x in args.dim: + output_list.append(list(range(x))) + + # generate 
all permutations possible for template parameter inputs + output_combinations = list(itertools.product(*output_list)) + output_list = [] + + # for each permutation + for x in range(len(output_combinations)): + # separate each template param into its own list member + new_list = [output_combinations[x][i] for i in range(len(output_combinations[x]))] + tempStr = 'gBackendPixelRateTable' + #print each list member as an index in the multidimensional array + for i in new_list: + tempStr += '[' + str(i) + ']' + #map each entry in the permutation as its own string member, store as the template instantiation string + tempStr += " = BackendPixelRate<SwrBackendTraits<" + ','.join(map(str, output_combinations[x])) + '>>;' + #append the line of c++ code in the list of output lines + output_list.append(tempStr) + + # how many files should we split the global template initialization into? + if (args.split == 0): + numFiles = 1 + else: + numFiles = (len(output_list) + args.split - 1) // args.split + linesPerFile = (len(output_list) + numFiles - 1) // numFiles + chunkedList = [output_list[x:x+linesPerFile] for x in range(0, len(output_list), linesPerFile)] + + baseCppName = os.path.join(args.outdir, 'BackendPixelRate%s.cpp') + # generate .cpp files + if args.cpp: + templateCpp = os.path.join(thisDir, 'templates', 'backend_template.cpp') + + for fileNum in range(numFiles): + filename = baseCppName % str(fileNum) + print('Generating', filename) + write_template_to_file( + templateCpp, + baseCppName % str(fileNum), + fileNum=fileNum, + funcList=chunkedList[fileNum]) + + # generate gen_backends.cmake file + if args.cmake: + templateCmake = os.path.join(thisDir, 'templates', 'backend_template.cmake') + cmakeFile = os.path.join(args.outdir, 'gen_backends.cmake') + print('Generating', cmakeFile) + write_template_to_file( + templateCmake, + cmakeFile, + numFiles=numFiles, + baseCppName=baseCppName.replace('\\','/')) + + print("Generated %d template instantiations in %d files" % (len(output_list), numFiles)) + + return 0 + +if __name__ == '__main__': + 
sys.exit(main()) diff --git a/src/gallium/drivers/swr/rasterizer/scripts/templates/backend_template.cpp b/src/gallium/drivers/swr/rasterizer/scripts/templates/backend_template.cpp new file mode 100644 index 00000000000..f015f5f179c --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/templates/backend_template.cpp @@ -0,0 +1,38 @@ +/**************************************************************************** +* Copyright (C) 2017 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file BackendPixelRate${fileNum}.cpp +* +* @brief auto-generated file +* +* DO NOT EDIT +* +******************************************************************************/ + +#include "core/backend.h" + +void InitBackendPixelRate${fileNum}() +{ + %for func in funcList: + ${func} + %endfor +} -- 2.30.2