swr/rast: Support render target mask instead of render target count
authorTim Rowley <timothy.o.rowley@intel.com>
Mon, 26 Jun 2017 17:32:01 +0000 (12:32 -0500)
committerTim Rowley <timothy.o.rowley@intel.com>
Thu, 13 Jul 2017 13:47:10 +0000 (08:47 -0500)
WIP to support read-only render targets.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
src/gallium/drivers/swr/rasterizer/core/api.cpp
src/gallium/drivers/swr/rasterizer/core/backend_impl.h
src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h
src/gallium/drivers/swr/rasterizer/core/state.h
src/gallium/drivers/swr/swr_state.cpp

index 087a24a95edf989b911aef80e1104765f70f5737..49058903c12727ab55c5460959a1e03643560957 100644 (file)
@@ -957,20 +957,26 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
                                           (pState->state.depthStencilState.stencilTestEnable  ||
                                            pState->state.depthStencilState.stencilWriteEnable)) ? true : false;
 
-    uint32_t numRTs = pState->state.psState.numRenderTargets;
-    pState->state.colorHottileEnable = 0;
+    pState->state.colorHottileEnable = pState->state.psState.renderTargetMask;
+
+    // Disable hottile for surfaces with no writes
     if (psState.pfnPixelShader != nullptr)
     {
-        for (uint32_t rt = 0; rt < numRTs; ++rt)
+        DWORD rt;
+        uint32_t rtMask = pState->state.colorHottileEnable;
+        while (_BitScanForward(&rt, rtMask))
         {
-            pState->state.colorHottileEnable |=  
-                (!pState->state.blendState.renderTarget[rt].writeDisableAlpha ||
-                 !pState->state.blendState.renderTarget[rt].writeDisableRed ||
-                 !pState->state.blendState.renderTarget[rt].writeDisableGreen ||
-                 !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0;
+            rtMask &= ~(1 << rt);
+
+            if (pState->state.blendState.renderTarget[rt].writeDisableAlpha &&
+                pState->state.blendState.renderTarget[rt].writeDisableRed &&
+                pState->state.blendState.renderTarget[rt].writeDisableGreen &&
+                pState->state.blendState.renderTarget[rt].writeDisableBlue)
+            {
+                pState->state.colorHottileEnable &= ~(1 << rt);
+            }
         }
     }
-
     // Setup depth quantization function
     if (pState->state.depthHottileEnable)
     {
index 2e32e4d32cb0299afc6c7ea88e13a4b6e84f6179..b6a86b59ecb183f2c1ba2bcf920407effd84d733 100644 (file)
@@ -475,16 +475,15 @@ inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE
     coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
 }
 
-inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorBufferCount, RenderOutputBuffers &renderBuffers)
+inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorHotTileMask, RenderOutputBuffers &renderBuffers)
 {
-    assert(colorBufferCount <= SWR_NUM_RENDERTARGETS);
-
-    if (pColorBuffer)
+    
+    DWORD index;
+    while (_BitScanForward(&index, colorHotTileMask))
     {
-        for (uint32_t index = 0; index < colorBufferCount; index += 1)
-        {
-            pColorBuffer[index] = renderBuffers.pColor[index];
-        }
+        assert(index < SWR_NUM_RENDERTARGETS);
+        colorHotTileMask &= ~(1 << index);
+        pColorBuffer[index] = renderBuffers.pColor[index];
     }
 
     if (pDepthBuffer)
@@ -712,14 +711,16 @@ static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_P
 
 // Merge Output to 4x2 SIMD Tile Format
 INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
-    const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT)
+    const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, uint32_t renderTargetMask)
 {
     // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
     const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
     simdvector blendOut;
 
-    for(uint32_t rt = 0; rt < NumRT; ++rt)
+    DWORD rt = 0;
+    while (_BitScanForward(&rt, renderTargetMask))
     {
+        renderTargetMask &= ~(1 << rt);
         uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset;
 
         const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
@@ -776,7 +777,7 @@ INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SW
 #if USE_8x2_TILE_BACKEND
 // Merge Output to 8x2 SIMD16 Tile Format
 INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
-    const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT, const uint32_t colorBufferEnableMask, bool useAlternateOffset)
+    const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, uint32_t renderTargetMask, bool useAlternateOffset)
 {
     // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
     uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
@@ -789,20 +790,27 @@ INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SW
     simdvector blendSrc;
     simdvector blendOut;
 
-    uint32_t colorBufferBit = 1;
-    for (uint32_t rt = 0; rt < NumRT; rt += 1, colorBufferBit <<= 1)
+    DWORD rt;
+    while (_BitScanForward(&rt, renderTargetMask))
     {
-        simdscalar *pColorSample = reinterpret_cast<simdscalar *>(pColorBase[rt] + rasterTileColorOffset);
+        renderTargetMask &= ~(1 << rt);
 
         const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
 
-        if (colorBufferBit & colorBufferEnableMask)
+        simdscalar* pColorSample;
+        bool hotTileEnable = !pRTBlend->writeDisableAlpha || !pRTBlend->writeDisableRed || !pRTBlend->writeDisableGreen || !pRTBlend->writeDisableBlue;
+        if (hotTileEnable)
         {
+            pColorSample = reinterpret_cast<simdscalar *>(pColorBase[rt] + rasterTileColorOffset);
             blendSrc[0] = pColorSample[0];
             blendSrc[1] = pColorSample[2];
             blendSrc[2] = pColorSample[4];
             blendSrc[3] = pColorSample[6];
         }
+        else
+        {
+            pColorSample = nullptr;
+        }
 
         {
             // pfnBlendFunc may not update all channels.  Initialize with PS output.
@@ -874,7 +882,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
     SetupPixelShaderContext<T>(&psContext, samplePos, work);
 
     uint8_t *pDepthBuffer, *pStencilBuffer;
-    SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
+    SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);
 
     AR_END(BESetup, 0);
 
@@ -994,9 +1002,9 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
                 
                 // broadcast the results of the PS to all passing pixels
 #if USE_8x2_TILE_BACKEND
-                OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
+                OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.renderTargetMask, useAlternateOffset);
 #else // USE_8x2_TILE_BACKEND
-                OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets);
+                OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.renderTargetMask);
 #endif // USE_8x2_TILE_BACKEND
 
                 if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
@@ -1026,14 +1034,20 @@ Endtile:
 #if USE_8x2_TILE_BACKEND
             if (useAlternateOffset)
             {
-                for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+                DWORD rt;
+                uint32_t rtMask = state.colorHottileEnable;
+                while (_BitScanForward(&rt, rtMask))
                 {
+                    rtMask &= ~(1 << rt);
                     psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
                 }
             }
 #else
-            for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+            DWORD rt;
+            uint32_t rtMask = state.colorHottileEnable;
+            while (_BitScanForward(&rt, rtMask))
             {
+                rtMask &= ~(1 << rt);
                 psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
             }
 #endif
index 2dca5d8bf9f1a6ca3aba48d706b71005fae5e3eb..d81352aee5753061aeb7f037c41750a742329894 100644 (file)
@@ -55,7 +55,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
     SetupPixelShaderContext<T>(&psContext, samplePos, work);
 
     uint8_t *pDepthBuffer, *pStencilBuffer;
-    SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
+    SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);
 
     AR_END(BESetup, 0);
 
@@ -198,9 +198,9 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                     // output merger
                     AR_BEGIN(BEOutputMerger, pDC->drawId);
 #if USE_8x2_TILE_BACKEND
-                    OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
+                    OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset);
 #else
-                    OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
+                    OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask);
 #endif
 
                     // do final depth write after all pixel kills
@@ -227,14 +227,20 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
 #if USE_8x2_TILE_BACKEND
             if (useAlternateOffset)
             {
-                for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+                DWORD rt;
+                uint32_t rtMask = state.colorHottileEnable;
+                while (_BitScanForward(&rt, rtMask))
                 {
+                    rtMask &= ~(1 << rt);
                     psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
                 }
             }
 #else
-            for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+            DWORD rt;
+            uint32_t rtMask = state.colorHottileEnable;
+            while (_BitScanForward(&rt, rtMask))
             {
+                rtMask &= ~(1 << rt);
                 psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
             }
 #endif
index 8ae2cf41dfc6124bd2d6a09b5d36acfd510331f9..34875d342d5a32519f91ded155620d5999982960 100644 (file)
@@ -55,7 +55,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
     SetupPixelShaderContext<T>(&psContext, samplePos, work);
 
     uint8_t *pDepthBuffer, *pStencilBuffer;
-    SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
+    SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);
 
     AR_END(BESetup, 1);
 
@@ -183,9 +183,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 // output merger
                 AR_BEGIN(BEOutputMerger, pDC->drawId);
 #if USE_8x2_TILE_BACKEND
-                OutputMerger8x2(psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
+                OutputMerger8x2(psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset);
 #else
-                OutputMerger4x2(psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
+                OutputMerger4x2(psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask);
 #endif
 
                 // do final depth write after all pixel kills
@@ -209,14 +209,20 @@ Endtile:
 #if USE_8x2_TILE_BACKEND
             if (useAlternateOffset)
             {
-                for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+                DWORD rt;
+                uint32_t rtMask = state.colorHottileEnable;
+                while(_BitScanForward(&rt, rtMask))
                 {
+                    rtMask &= ~(1 << rt);
                     psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
                 }
             }
 #else
-            for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+            DWORD rt;
+            uint32_t rtMask = state.colorHottileEnable;
+            while (_BitScanForward(&rt, rtMask))
             {
+                rtMask &= ~(1 << rt);
                 psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
             }
 #endif
index b73a99b4540c95bc2d043c9ab60d6ae4c0050234..081e4dd67d79f177f33624ce04f6b137e6853090 100644 (file)
@@ -42,9 +42,9 @@ extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPU
 template <uint32_t numSamples = 1>
 void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex);
 template <typename RT>
-void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers);
+void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers);
 template <typename RT>
-void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow);
+void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow);
 
 #define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
 static const __m256d gMaskToVecpd[] =
@@ -1281,7 +1281,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
             {
                 vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
             }
-            StepRasterTileX<RT>(state.psState.numRenderTargets, renderBuffers);
+            StepRasterTileX<RT>(state.colorHottileEnable, renderBuffers);
         }
 
         // step to the next tile in Y
@@ -1289,7 +1289,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
         {
             vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
         }
-        StepRasterTileY<RT>(state.psState.numRenderTargets, renderBuffers, currentRenderBufferRow);
+        StepRasterTileY<RT>(state.colorHottileEnable, renderBuffers, currentRenderBufferRow);
     }
 
     AR_END(BERasterizeTriangle, 1);
@@ -1348,10 +1348,12 @@ void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint
 }
 
 template <typename RT>
-INLINE void StepRasterTileX(uint32_t NumRT, RenderOutputBuffers &buffers)
+INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers)
 {
-    for(uint32_t rt = 0; rt < NumRT; ++rt)
+    DWORD rt = 0;
+    while (_BitScanForward(&rt, colorHotTileMask))
     {
+        colorHotTileMask &= ~(1 << rt);
         buffers.pColor[rt] += RT::colorRasterTileStep;
     }
     
@@ -1360,10 +1362,12 @@ INLINE void StepRasterTileX(uint32_t NumRT, RenderOutputBuffers &buffers)
 }
 
 template <typename RT>
-INLINE void StepRasterTileY(uint32_t NumRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow)
+INLINE void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow)
 {
-    for(uint32_t rt = 0; rt < NumRT; ++rt)
+    DWORD rt = 0;
+    while (_BitScanForward(&rt, colorHotTileMask))
     {
+        colorHotTileMask &= ~(1 << rt);
         startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
         buffers.pColor[rt] = startBufferRow.pColor[rt];
     }
index 4fbd74ddc4a897d148610d1f2ce6819e2876af2c..d9e92807411b7b875eae012875420e32840dbf8c 100644 (file)
@@ -1139,7 +1139,7 @@ struct SWR_PS_STATE
     uint32_t writesODepth           : 1;    // pixel shader writes to depth
     uint32_t usesSourceDepth        : 1;    // pixel shader reads depth
     uint32_t shadingRate            : 2;    // shading per pixel / sample / coarse pixel
-    uint32_t numRenderTargets       : 4;    // number of render target outputs in use (0-8)
+    uint32_t renderTargetMask       : 8;    // number of render target outputs in use (0-8)
     uint32_t posOffset              : 2;    // type of offset (none, sample, centroid) to add to pixel position
     uint32_t barycentricsMask       : 3;    // which type(s) of barycentric coords does the PS interpolate attributes with
     uint32_t usesUAV                : 1;    // pixel shader accesses UAV 
index 534f3c59b1c9dc1cf2f096b6fe853355d46eb272..501fdea7880d41d556ddcb3c24b1c17bfd8fea22 100644 (file)
@@ -1461,7 +1461,7 @@ swr_update_derived(struct pipe_context *pipe,
       psState.writesODepth = ctx->fs->info.base.writes_z;
       psState.usesSourceDepth = ctx->fs->info.base.reads_z;
       psState.shadingRate = SWR_SHADING_RATE_PIXEL;
-      psState.numRenderTargets = ctx->framebuffer.nr_cbufs;
+      psState.renderTargetMask = (1 << ctx->framebuffer.nr_cbufs) - 1;
       psState.posOffset = SWR_PS_POSITION_SAMPLE_NONE;
       uint32_t barycentricsMask = 0;
 #if 0