swr: [rasterizer core/scripts] Autogen backend initialization function(s)
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / backend.h
index 80ee1defdad93b333e190830a1d6816550acd283..c3585cc930c26deabb66214937b81e5f4030b011 100644 (file)
@@ -31,6 +31,7 @@
 #include "common/os.h"
 #include "core/context.h"
 #include "core/multisample.h"
+#include "depthstencil.h"
 #include "rdtsc_core.h"
 
 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer);
@@ -835,6 +836,204 @@ INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SW
 }
 
 #endif
+
+template<typename T>
+void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
+{
+    SWR_CONTEXT *pContext = pDC->pContext;
+
+    AR_BEGIN(BEPixelRateBackend, pDC->drawId);
+    AR_BEGIN(BESetup, pDC->drawId);
+
+    const API_STATE &state = GetApiState(pDC);
+
+    BarycentricCoeffs coeffs;
+    SetupBarycentricCoeffs(&coeffs, work);
+
+    uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
+    SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
+
+    SWR_PS_CONTEXT psContext;
+    SetupPixelShaderContext<T>(&psContext, work);
+
+    AR_END(BESetup, 0);
+
+    PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask);
+
+    psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
+    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
+
+    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
+
+    for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
+    {
+        psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
+        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
+
+        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
+
+        for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
+        {
+#if USE_8x2_TILE_BACKEND
+            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
+
+#endif
+            simdscalar activeLanes;
+            if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
+            activeLanes = vMask(work.anyCoveredSamples & MASK);
+
+            if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
+            {
+                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
+
+                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
+            }
+
+            AR_BEGIN(BEBarycentric, pDC->drawId);
+
+            CalcPixelBarycentrics(coeffs, psContext);
+
+            CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
+
+            AR_END(BEBarycentric, 0);
+
+            if(T::bForcedSampleCount)
+            {
+                // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
+                const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
+                activeLanes = _simd_and_ps(activeLanes, vSampleMask);
+            }
+
+            // Early-Z?
+            if(T::bCanEarlyZ && !T::bForcedSampleCount)
+            {
+                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
+                UPDATE_STAT_BE(DepthPassCount, depthPassCount);
+                AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
+            }
+
+            // if we have no covered samples that passed depth at this point, go to next tile
+            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
+
+            if(state.psState.usesSourceDepth)
+            {
+                AR_BEGIN(BEBarycentric, pDC->drawId);
+                // interpolate and quantize z
+                psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
+                AR_END(BEBarycentric, 0);
+            }
+
+            // pixels that are currently active
+            psContext.activeMask = _simd_castps_si(activeLanes);
+            psContext.oMask = T::MultisampleT::FullSampleMask();
+
+            // execute pixel shader
+            AR_BEGIN(BEPixelShader, pDC->drawId);
+            state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+            UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
+            AR_END(BEPixelShader, 0);
+
+            // update active lanes to remove any discarded or oMask'd pixels
+            activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
+            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
+
+            // late-Z
+            if(!T::bCanEarlyZ && !T::bForcedSampleCount)
+            {
+                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
+                UPDATE_STAT_BE(DepthPassCount, depthPassCount);
+                AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
+            }
+
+            // if we have no covered samples that passed depth at this point, skip OM and go to next tile
+            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
+
+            // output merger
+            // loop over all samples, broadcasting the results of the PS to all passing pixels
+            for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
+            {
+                AR_BEGIN(BEOutputMerger, pDC->drawId);
+                // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
+                uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0;
+                simdscalar coverageMask, depthMask;
+                if(T::bForcedSampleCount)
+                {
+                    coverageMask = depthMask = activeLanes;
+                }
+                else
+                {
+                    coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
+                    depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
+                    if(!_simd_movemask_ps(depthMask))
+                    {
+                        // stencil should already have been written in early/lateZ tests
+                        AR_END(BEOutputMerger, 0);
+                        continue;
+                    }
+                }
+                
+                // broadcast the results of the PS to all passing pixels
+#if USE_8x2_TILE_BACKEND
+                OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
+#else
+                OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets);
+#endif
+
+                if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
+                {
+                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
+                    uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
+
+                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
+                                      pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
+                }
+                AR_END(BEOutputMerger, 0);
+            }
+Endtile:
+            AR_BEGIN(BEEndTile, pDC->drawId);
+
+            for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
+            {
+                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+            }
+
+            if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
+            {
+                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+            }
+            work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+
+#if USE_8x2_TILE_BACKEND
+            if (useAlternateOffset)
+            {
+                for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+                {
+                    pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+                }
+            }
+#else
+            for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+            {
+                pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+            }
+            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
+            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+#endif
+
+            AR_END(BEEndTile, 0);
+
+            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
+            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
+        }
+
+        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
+        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
+    }
+
+    AR_END(BEPixelRateBackend, 0);
+}
+
 template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t samplePattern = SWR_MSAA_STANDARD_PATTERN,
          uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0>
 struct SwrBackendTraits