From: Tim Rowley Date: Mon, 26 Jun 2017 17:32:01 +0000 (-0500) Subject: swr/rast: Support render target mask instead of render target count X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=d8ebcad540e4d34a32d039779bd69f8652e0a450;p=mesa.git swr/rast: Support render target mask instead of render target count WIP to support read-only render targets. Reviewed-by: Bruce Cherniak --- diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index 087a24a95ed..49058903c12 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -957,20 +957,26 @@ void SetupPipeline(DRAW_CONTEXT *pDC) (pState->state.depthStencilState.stencilTestEnable || pState->state.depthStencilState.stencilWriteEnable)) ? true : false; - uint32_t numRTs = pState->state.psState.numRenderTargets; - pState->state.colorHottileEnable = 0; + pState->state.colorHottileEnable = pState->state.psState.renderTargetMask; + + // Disable hottile for surfaces with no writes if (psState.pfnPixelShader != nullptr) { - for (uint32_t rt = 0; rt < numRTs; ++rt) + DWORD rt; + uint32_t rtMask = pState->state.colorHottileEnable; + while (_BitScanForward(&rt, rtMask)) { - pState->state.colorHottileEnable |= - (!pState->state.blendState.renderTarget[rt].writeDisableAlpha || - !pState->state.blendState.renderTarget[rt].writeDisableRed || - !pState->state.blendState.renderTarget[rt].writeDisableGreen || - !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0; + rtMask &= ~(1 << rt); + + if (pState->state.blendState.renderTarget[rt].writeDisableAlpha && + pState->state.blendState.renderTarget[rt].writeDisableRed && + pState->state.blendState.renderTarget[rt].writeDisableGreen && + pState->state.blendState.renderTarget[rt].writeDisableBlue) + { + pState->state.colorHottileEnable &= ~(1 << rt); + } } } - // Setup depth quantization function if (pState->state.depthHottileEnable) { diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h index 2e32e4d32cb..b6a86b59ecb 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h @@ -475,16 +475,15 @@ inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]); } -inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorBufferCount, RenderOutputBuffers &renderBuffers) +inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorHotTileMask, RenderOutputBuffers &renderBuffers) { - assert(colorBufferCount <= SWR_NUM_RENDERTARGETS); - - if (pColorBuffer) + + DWORD index; + while (_BitScanForward(&index, colorHotTileMask)) { - for (uint32_t index = 0; index < colorBufferCount; index += 1) - { - pColorBuffer[index] = renderBuffers.pColor[index]; - } + assert(index < SWR_NUM_RENDERTARGETS); + colorHotTileMask &= ~(1 << index); + pColorBuffer[index] = renderBuffers.pColor[index]; } if (pDepthBuffer) @@ -712,14 +711,16 @@ static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_P // Merge Output to 4x2 SIMD Tile Format INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState, - const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT) + const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, uint32_t renderTargetMask) { // type safety guaranteed from template instantiation in BEChooser<>::GetFunc const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample); simdvector blendOut; - for(uint32_t rt = 0; rt < NumRT; ++rt) + DWORD rt = 0; + while (_BitScanForward(&rt, renderTargetMask)) { + renderTargetMask &= ~(1 << rt); uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset; const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; @@ -776,7 +777,7 @@ INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SW #if USE_8x2_TILE_BACKEND // Merge Output to 8x2 SIMD16 Tile Format INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState, - const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT, const uint32_t colorBufferEnableMask, bool useAlternateOffset) + const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, uint32_t renderTargetMask, bool useAlternateOffset) { // type safety guaranteed from template instantiation in BEChooser<>::GetFunc uint32_t rasterTileColorOffset = RasterTileColorOffset(sample); @@ -789,20 +790,27 @@ INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SW simdvector blendSrc; simdvector blendOut; - uint32_t colorBufferBit = 1; - for (uint32_t rt = 0; rt < NumRT; rt += 1, colorBufferBit <<= 1) + DWORD rt; + while (_BitScanForward(&rt, renderTargetMask)) { - simdscalar *pColorSample = reinterpret_cast(pColorBase[rt] + rasterTileColorOffset); + renderTargetMask &= ~(1 << rt); const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; - if (colorBufferBit & colorBufferEnableMask) + simdscalar* pColorSample; + bool hotTileEnable = !pRTBlend->writeDisableAlpha || !pRTBlend->writeDisableRed || !pRTBlend->writeDisableGreen || !pRTBlend->writeDisableBlue; + if (hotTileEnable) { + pColorSample = reinterpret_cast(pColorBase[rt] + rasterTileColorOffset); blendSrc[0] = pColorSample[0]; blendSrc[1] = pColorSample[2]; blendSrc[2] = pColorSample[4]; blendSrc[3] = pColorSample[6]; } + else + { + pColorSample = nullptr; + } { // pfnBlendFunc may not update all channels. Initialize with PS output. @@ -874,7 +882,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t SetupPixelShaderContext(&psContext, samplePos, work); uint8_t *pDepthBuffer, *pStencilBuffer; - SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers); + SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers); AR_END(BESetup, 0); @@ -994,9 +1002,9 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t // broadcast the results of the PS to all passing pixels #if USE_8x2_TILE_BACKEND - OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset); + OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.renderTargetMask, useAlternateOffset); #else // USE_8x2_TILE_BACKEND - OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets); + OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.renderTargetMask); #endif // USE_8x2_TILE_BACKEND if(!state.psState.forceEarlyZ && !T::bForcedSampleCount) @@ -1026,14 +1034,20 @@ Endtile: #if USE_8x2_TILE_BACKEND if (useAlternateOffset) { - for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) + DWORD rt; + uint32_t rtMask = state.colorHottileEnable; + while (_BitScanForward(&rt, rtMask)) { + rtMask &= ~(1 << rt); psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; } } #else - for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) + DWORD rt; + uint32_t rtMask = state.colorHottileEnable; + while (_BitScanForward(&rt, rtMask)) { + rtMask &= ~(1 << rt); psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; } #endif diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp index 2dca5d8bf9f..d81352aee57 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp @@ -55,7 +55,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ SetupPixelShaderContext(&psContext, samplePos, work); uint8_t *pDepthBuffer, *pStencilBuffer; - SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers); + SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers); AR_END(BESetup, 0); @@ -198,9 +198,9 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ // output merger AR_BEGIN(BEOutputMerger, pDC->drawId); #if USE_8x2_TILE_BACKEND - OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset); + OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset); #else - OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets); + OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask); #endif // do final depth write after all pixel kills @@ -227,14 +227,20 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ #if USE_8x2_TILE_BACKEND if (useAlternateOffset) { - for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) + DWORD rt; + uint32_t rtMask = state.colorHottileEnable; + while (_BitScanForward(&rt, rtMask)) { + rtMask &= ~(1 << rt); psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; } } #else - for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) + DWORD rt; + uint32_t rtMask = state.colorHottileEnable; + while (_BitScanForward(&rt, rtMask)) { + rtMask &= ~(1 << rt); psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; } #endif diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp index 8ae2cf41dfc..34875d342d5 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp @@ -55,7 +55,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 SetupPixelShaderContext(&psContext, samplePos, work); uint8_t *pDepthBuffer, *pStencilBuffer; - SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers); + SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers); AR_END(BESetup, 1); @@ -183,9 +183,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 // output merger AR_BEGIN(BEOutputMerger, pDC->drawId); #if USE_8x2_TILE_BACKEND - OutputMerger8x2(psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset); + OutputMerger8x2(psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset); #else - OutputMerger4x2(psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets); + OutputMerger4x2(psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask); #endif // do final depth write after all pixel kills @@ -209,14 +209,20 @@ Endtile: #if USE_8x2_TILE_BACKEND if (useAlternateOffset) { - for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) + DWORD rt; + uint32_t rtMask = state.colorHottileEnable; + while(_BitScanForward(&rt, rtMask)) { + rtMask &= ~(1 << rt); psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; } } #else - for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) + DWORD rt; + uint32_t rtMask = state.colorHottileEnable; + while (_BitScanForward(&rt, rtMask)) { + rtMask &= ~(1 << rt); psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; } #endif diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h index b73a99b4540..081e4dd67d7 100644 --- a/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h @@ -42,9 +42,9 @@ extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPU template void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex); template -void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers); +void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers); template -void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow); +void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow); #define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3} static const __m256d gMaskToVecpd[] = @@ -1281,7 +1281,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, { vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX)); } - StepRasterTileX(state.psState.numRenderTargets, renderBuffers); + StepRasterTileX(state.colorHottileEnable, renderBuffers); } // step to the next tile in Y @@ -1289,7 +1289,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, { vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY)); } - StepRasterTileY(state.psState.numRenderTargets, renderBuffers, currentRenderBufferRow); + StepRasterTileY(state.colorHottileEnable, renderBuffers, currentRenderBufferRow); } AR_END(BERasterizeTriangle, 1); @@ -1348,10 +1348,12 @@ void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint } template -INLINE void StepRasterTileX(uint32_t NumRT, RenderOutputBuffers &buffers) +INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers) { - for(uint32_t rt = 0; rt < NumRT; ++rt) + DWORD rt = 0; + while (_BitScanForward(&rt, colorHotTileMask)) { + colorHotTileMask &= ~(1 << rt); buffers.pColor[rt] += RT::colorRasterTileStep; } @@ -1360,10 +1362,12 @@ INLINE void StepRasterTileX(uint32_t NumRT, RenderOutputBuffers &buffers) } template -INLINE void StepRasterTileY(uint32_t NumRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow) +INLINE void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow) { - for(uint32_t rt = 0; rt < NumRT; ++rt) + DWORD rt = 0; + while (_BitScanForward(&rt, colorHotTileMask)) { + colorHotTileMask &= ~(1 << rt); startBufferRow.pColor[rt] += RT::colorRasterTileRowStep; buffers.pColor[rt] = startBufferRow.pColor[rt]; } diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index 4fbd74ddc4a..d9e92807411 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -1139,7 +1139,7 @@ struct SWR_PS_STATE uint32_t writesODepth : 1; // pixel shader writes to depth uint32_t usesSourceDepth : 1; // pixel shader reads depth uint32_t shadingRate : 2; // shading per pixel / sample / coarse pixel - uint32_t numRenderTargets : 4; // number of render target outputs in use (0-8) + uint32_t renderTargetMask : 8; // number of render target outputs in use (0-8) uint32_t posOffset : 2; // type of offset (none, sample, centroid) to add to pixel position uint32_t barycentricsMask : 3; // which type(s) of barycentric coords does the PS interpolate attributes with uint32_t usesUAV : 1; // pixel shader accesses UAV diff --git a/src/gallium/drivers/swr/swr_state.cpp b/src/gallium/drivers/swr/swr_state.cpp index 534f3c59b1c..501fdea7880 100644 --- a/src/gallium/drivers/swr/swr_state.cpp +++ b/src/gallium/drivers/swr/swr_state.cpp @@ -1461,7 +1461,7 @@ swr_update_derived(struct pipe_context *pipe, psState.writesODepth = ctx->fs->info.base.writes_z; psState.usesSourceDepth = ctx->fs->info.base.reads_z; psState.shadingRate = SWR_SHADING_RATE_PIXEL; - psState.numRenderTargets = ctx->framebuffer.nr_cbufs; + psState.renderTargetMask = (1 << ctx->framebuffer.nr_cbufs) - 1; psState.posOffset = SWR_PS_POSITION_SAMPLE_NONE; uint32_t barycentricsMask = 0; #if 0