swr: [rasterizer core/scripts] Autogen backend initialization function(s)
authorTim Rowley <timothy.o.rowley@intel.com>
Wed, 15 Feb 2017 21:45:16 +0000 (13:45 -0800)
committerTim Rowley <timothy.o.rowley@intel.com>
Mon, 20 Mar 2017 23:04:53 +0000 (18:04 -0500)
Autogen functions that instantiates different BackendPixelRate templates.
Functions get split into separate files after reaching a user defined
threshold (currently 512 per file) to speed up compilation.

This change will enable the addition of more template flags in the pixel
back end.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
src/gallium/drivers/swr/.gitignore
src/gallium/drivers/swr/Makefile.am
src/gallium/drivers/swr/SConscript
src/gallium/drivers/swr/rasterizer/core/backend.cpp
src/gallium/drivers/swr/rasterizer/core/backend.h
src/gallium/drivers/swr/rasterizer/scripts/gen_backends.py [new file with mode: 0644]
src/gallium/drivers/swr/rasterizer/scripts/templates/backend_template.cpp [new file with mode: 0644]

index 172f3bfbcb72745d80b06b9b58abb072b2257b34..b6c5faa110ad9f95ae70b5c0b1760c144b14f719 100644 (file)
@@ -10,3 +10,4 @@ rasterizer/jitter/builder_x86.h
 rasterizer/jitter/state_llvm.h
 rasterizer/scripts/gen_knobs.cpp
 rasterizer/scripts/gen_knobs.h
+rasterizer/core/BackendPixelRate0.cpp
index b22ded0a191d317afda5720b82bface296b9678e..c67eadc05df2295d8a3f3eec26e6b2f597dab4c1 100644 (file)
@@ -61,7 +61,8 @@ BUILT_SOURCES = \
        rasterizer/archrast/gen_ar_event.h \
        rasterizer/archrast/gen_ar_event.cpp \
        rasterizer/archrast/gen_ar_eventhandler.h \
-       rasterizer/archrast/gen_ar_eventhandlerfile.h
+       rasterizer/archrast/gen_ar_eventhandlerfile.h \
+       rasterizer/core/BackendPixelRate0.cpp
 
 MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
 PYTHON_GEN = $(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS)
@@ -156,6 +157,21 @@ rasterizer/archrast/gen_ar_eventhandlerfile.h: rasterizer/scripts/gen_archrast.p
                --output rasterizer/archrast/gen_ar_eventhandlerfile.h \
                --gen_eventhandlerfile_h
 
+# 5 SWR_MULTISAMPLE_TYPE_COUNT
+# 2 SWR_MSAA_SAMPLE_PATTERN_COUNT
+# 3 SWR_INPUT_COVERAGE_COUNT
+# 2 centroid
+# 2 forcedSampleCount
+# 2 canEarlyZ
+rasterizer/core/BackendPixelRate0.cpp: rasterizer/scripts/gen_backends.py rasterizer/scripts/templates/backend_template.cpp
+       $(MKDIR_GEN)
+       $(PYTHON_GEN) \
+               $(srcdir)/rasterizer/scripts/gen_backends.py \
+               --outdir rasterizer/core \
+               --dim 5 2 3 2 2 2 \
+               --split 0 \
+               --cpp
+
 COMMON_LIBADD = \
        $(top_builddir)/src/gallium/auxiliary/libgallium.la \
        $(top_builddir)/src/mesa/libmesagallium.la \
@@ -250,6 +266,7 @@ EXTRA_DIST = \
        rasterizer/jitter/scripts/gen_llvm_ir_macros.py \
        rasterizer/jitter/scripts/gen_llvm_types.py \
        rasterizer/scripts/gen_archrast.py \
+       rasterizer/scripts/gen_backends.py \
        rasterizer/scripts/gen_knobs.py \
        rasterizer/scripts/knob_defs.py \
        rasterizer/scripts/mako/ast.py \
@@ -273,4 +290,5 @@ EXTRA_DIST = \
        rasterizer/scripts/templates/ar_event_h.template \
        rasterizer/scripts/templates/ar_event_cpp.template \
        rasterizer/scripts/templates/ar_eventhandler_h.template \
-       rasterizer/scripts/templates/ar_eventhandlerfile_h.template
+       rasterizer/scripts/templates/ar_eventhandlerfile_h.template \
+       rasterizer/scripts/templates/backend_template.cpp
index c0677afc97fda3b0fced3df3b63ce8967c9faeba..dafeb9229d7a79c097dc3fd68af37cd53c740662 100644 (file)
@@ -132,12 +132,25 @@ env.CodeGenerate(
     command = python_cmd + ' $SCRIPT --proto $SOURCE --output $TARGET --gen_eventhandlerfile_h'
 )
 
+# 5 SWR_MULTISAMPLE_TYPE_COUNT
+# 2 SWR_MSAA_SAMPLE_PATTERN_COUNT
+# 3 SWR_INPUT_COVERAGE_COUNT
+# 2 centroid
+# 2 forcedSampleCount
+# 2 canEarlyZ
+env.CodeGenerate(
+    target = 'rasterizer/core/BackendPixelRate0.cpp',
+    script = swrroot + 'rasterizer/scripts/gen_backends.py',
+    command = python_cmd + ' $SCRIPT --output rasterizer/core --dim 5 2 3 2 2 2 --split 0 --cpp'
+)
+
 # Auto-generated .cpp files (that need to generate object files)
 built_sources = [
     'rasterizer/scripts/gen_knobs.cpp',
     'rasterizer/jitter/builder_gen.cpp',
     'rasterizer/jitter/builder_x86.cpp',
     'rasterizer/archrast/gen_ar_event.cpp',
+    'rasterizer/core/BackendPixelRate0.cpp',
     ]
 
 source = built_sources
index 3f63b28e71cf481f9f22896cf3270cea1f4de23f..032a2206d62d1f7d0113c8b99531e7cdd8582c99 100644 (file)
@@ -30,7 +30,6 @@
 #include <smmintrin.h>
 
 #include "backend.h"
-#include "depthstencil.h"
 #include "tilemgr.h"
 #include "memory/tilingtraits.h"
 #include "core/multisample.h"
@@ -862,203 +861,6 @@ Endtile:
 
     AR_END(BESampleRateBackend, 0);
 }
-
-template<typename T>
-void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
-{
-    SWR_CONTEXT *pContext = pDC->pContext;
-
-    AR_BEGIN(BEPixelRateBackend, pDC->drawId);
-    AR_BEGIN(BESetup, pDC->drawId);
-
-    const API_STATE &state = GetApiState(pDC);
-
-    BarycentricCoeffs coeffs;
-    SetupBarycentricCoeffs(&coeffs, work);
-
-    uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
-    SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
-
-    SWR_PS_CONTEXT psContext;
-    SetupPixelShaderContext<T>(&psContext, work);
-
-    AR_END(BESetup, 0);
-
-    PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask);
-
-    psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
-    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
-
-    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
-
-    for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
-    {
-        psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
-        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
-
-        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
-
-        for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
-        {
-#if USE_8x2_TILE_BACKEND
-            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
-
-#endif
-            simdscalar activeLanes;
-            if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
-            activeLanes = vMask(work.anyCoveredSamples & MASK);
-
-            if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
-            {
-                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
-
-                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
-            }
-
-            AR_BEGIN(BEBarycentric, pDC->drawId);
-
-            CalcPixelBarycentrics(coeffs, psContext);
-
-            CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
-
-            AR_END(BEBarycentric, 0);
-
-            if(T::bForcedSampleCount)
-            {
-                // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
-                const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
-                activeLanes = _simd_and_ps(activeLanes, vSampleMask);
-            }
-
-            // Early-Z?
-            if(T::bCanEarlyZ && !T::bForcedSampleCount)
-            {
-                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
-                UPDATE_STAT_BE(DepthPassCount, depthPassCount);
-                AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
-            }
-
-            // if we have no covered samples that passed depth at this point, go to next tile
-            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
-
-            if(state.psState.usesSourceDepth)
-            {
-                AR_BEGIN(BEBarycentric, pDC->drawId);
-                // interpolate and quantize z
-                psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
-                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-                AR_END(BEBarycentric, 0);
-            }
-
-            // pixels that are currently active
-            psContext.activeMask = _simd_castps_si(activeLanes);
-            psContext.oMask = T::MultisampleT::FullSampleMask();
-
-            // execute pixel shader
-            AR_BEGIN(BEPixelShader, pDC->drawId);
-            state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
-            UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
-            AR_END(BEPixelShader, 0);
-
-            // update active lanes to remove any discarded or oMask'd pixels
-            activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
-            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
-
-            // late-Z
-            if(!T::bCanEarlyZ && !T::bForcedSampleCount)
-            {
-                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
-                UPDATE_STAT_BE(DepthPassCount, depthPassCount);
-                AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
-            }
-
-            // if we have no covered samples that passed depth at this point, skip OM and go to next tile
-            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
-
-            // output merger
-            // loop over all samples, broadcasting the results of the PS to all passing pixels
-            for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
-            {
-                AR_BEGIN(BEOutputMerger, pDC->drawId);
-                // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
-                uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0;
-                simdscalar coverageMask, depthMask;
-                if(T::bForcedSampleCount)
-                {
-                    coverageMask = depthMask = activeLanes;
-                }
-                else
-                {
-                    coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
-                    depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
-                    if(!_simd_movemask_ps(depthMask))
-                    {
-                        // stencil should already have been written in early/lateZ tests
-                        AR_END(BEOutputMerger, 0);
-                        continue;
-                    }
-                }
-                
-                // broadcast the results of the PS to all passing pixels
-#if USE_8x2_TILE_BACKEND
-                OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
-#else
-                OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets);
-#endif
-
-                if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
-                {
-                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
-                    uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
-
-                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
-                                      pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
-                }
-                AR_END(BEOutputMerger, 0);
-            }
-Endtile:
-            AR_BEGIN(BEEndTile, pDC->drawId);
-
-            for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
-            {
-                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            }
-
-            if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
-            {
-                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            }
-            work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-
-#if USE_8x2_TILE_BACKEND
-            if (useAlternateOffset)
-            {
-                for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
-                {
-                    pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
-                }
-            }
-#else
-            for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
-            {
-                pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
-            }
-            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
-            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
-#endif
-
-            AR_END(BEEndTile, 0);
-
-            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
-            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
-        }
-
-        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
-        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
-    }
-
-    AR_END(BEPixelRateBackend, 0);
-}
 // optimized backend flow with NULL PS
 template<uint32_t sampleCountT>
 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
@@ -1302,31 +1104,6 @@ void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COU
     }
 }
 
-void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2][2])
-{
-    for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
-    {
-        for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < SWR_MSAA_SAMPLE_PATTERN_COUNT; samplePattern++)
-        {
-            for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
-            {
-                for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
-                {
-                    for(uint32_t forcedSampleCount = 0; forcedSampleCount < 2; forcedSampleCount++)
-                    {
-                        for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
-                        {
-                            table[sampleCount][samplePattern][inputCoverage][isCentroid][forcedSampleCount][canEarlyZ] =
-                                BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (SWR_INPUT_COVERAGE)inputCoverage, 
-                                                        (isCentroid > 0), (forcedSampleCount > 0), (canEarlyZ > 0), SWR_BACKEND_MSAA_PIXEL_RATE);
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
 {
     for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
@@ -1346,10 +1123,11 @@ void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_C
     }
 }
 
+void InitBackendPixelRate0();
 void InitBackendFuncTables()
 {    
     InitBackendSingleFuncTable(gBackendSingleSample);
-    InitBackendPixelFuncTable(gBackendPixelRateTable);
+    InitBackendPixelRate0();
     InitBackendSampleFuncTable(gBackendSampleRateTable);
 
     gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
index 80ee1defdad93b333e190830a1d6816550acd283..c3585cc930c26deabb66214937b81e5f4030b011 100644 (file)
@@ -31,6 +31,7 @@
 #include "common/os.h"
 #include "core/context.h"
 #include "core/multisample.h"
+#include "depthstencil.h"
 #include "rdtsc_core.h"
 
 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer);
@@ -835,6 +836,204 @@ INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SW
 }
 
 #endif
+
+template<typename T>
+void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
+{
+    SWR_CONTEXT *pContext = pDC->pContext;
+
+    AR_BEGIN(BEPixelRateBackend, pDC->drawId);
+    AR_BEGIN(BESetup, pDC->drawId);
+
+    const API_STATE &state = GetApiState(pDC);
+
+    BarycentricCoeffs coeffs;
+    SetupBarycentricCoeffs(&coeffs, work);
+
+    uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
+    SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
+
+    SWR_PS_CONTEXT psContext;
+    SetupPixelShaderContext<T>(&psContext, work);
+
+    AR_END(BESetup, 0);
+
+    PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask);
+
+    psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
+    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
+
+    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
+
+    for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
+    {
+        psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
+        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
+
+        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
+
+        for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
+        {
+#if USE_8x2_TILE_BACKEND
+            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
+
+#endif
+            simdscalar activeLanes;
+            if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
+            activeLanes = vMask(work.anyCoveredSamples & MASK);
+
+            if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
+            {
+                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
+
+                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
+            }
+
+            AR_BEGIN(BEBarycentric, pDC->drawId);
+
+            CalcPixelBarycentrics(coeffs, psContext);
+
+            CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask);
+
+            AR_END(BEBarycentric, 0);
+
+            if(T::bForcedSampleCount)
+            {
+                // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
+                const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
+                activeLanes = _simd_and_ps(activeLanes, vSampleMask);
+            }
+
+            // Early-Z?
+            if(T::bCanEarlyZ && !T::bForcedSampleCount)
+            {
+                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
+                UPDATE_STAT_BE(DepthPassCount, depthPassCount);
+                AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
+            }
+
+            // if we have no covered samples that passed depth at this point, go to next tile
+            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
+
+            if(state.psState.usesSourceDepth)
+            {
+                AR_BEGIN(BEBarycentric, pDC->drawId);
+                // interpolate and quantize z
+                psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
+                AR_END(BEBarycentric, 0);
+            }
+
+            // pixels that are currently active
+            psContext.activeMask = _simd_castps_si(activeLanes);
+            psContext.oMask = T::MultisampleT::FullSampleMask();
+
+            // execute pixel shader
+            AR_BEGIN(BEPixelShader, pDC->drawId);
+            state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+            UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
+            AR_END(BEPixelShader, 0);
+
+            // update active lanes to remove any discarded or oMask'd pixels
+            activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
+            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
+
+            // late-Z
+            if(!T::bCanEarlyZ && !T::bForcedSampleCount)
+            {
+                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
+                UPDATE_STAT_BE(DepthPassCount, depthPassCount);
+                AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
+            }
+
+            // if we have no covered samples that passed depth at this point, skip OM and go to next tile
+            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
+
+            // output merger
+            // loop over all samples, broadcasting the results of the PS to all passing pixels
+            for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
+            {
+                AR_BEGIN(BEOutputMerger, pDC->drawId);
+                // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
+                uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0;
+                simdscalar coverageMask, depthMask;
+                if(T::bForcedSampleCount)
+                {
+                    coverageMask = depthMask = activeLanes;
+                }
+                else
+                {
+                    coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
+                    depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
+                    if(!_simd_movemask_ps(depthMask))
+                    {
+                        // stencil should already have been written in early/lateZ tests
+                        AR_END(BEOutputMerger, 0);
+                        continue;
+                    }
+                }
+                
+                // broadcast the results of the PS to all passing pixels
+#if USE_8x2_TILE_BACKEND
+                OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
+#else
+                OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets);
+#endif
+
+                if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
+                {
+                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
+                    uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
+
+                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
+                                      pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
+                }
+                AR_END(BEOutputMerger, 0);
+            }
+Endtile:
+            AR_BEGIN(BEEndTile, pDC->drawId);
+
+            for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
+            {
+                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+            }
+
+            if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
+            {
+                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+            }
+            work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+
+#if USE_8x2_TILE_BACKEND
+            if (useAlternateOffset)
+            {
+                for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+                {
+                    pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+                }
+            }
+#else
+            for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+            {
+                pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+            }
+            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
+            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+#endif
+
+            AR_END(BEEndTile, 0);
+
+            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
+            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
+        }
+
+        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
+        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
+    }
+
+    AR_END(BEPixelRateBackend, 0);
+}
+
 template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t samplePattern = SWR_MSAA_STANDARD_PATTERN,
          uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0>
 struct SwrBackendTraits
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/gen_backends.py b/src/gallium/drivers/swr/rasterizer/scripts/gen_backends.py
new file mode 100644 (file)
index 0000000..cbbc378
--- /dev/null
@@ -0,0 +1,125 @@
+# Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+# Python source
+# Compatible with Python2.X and Python3.X
+
+from __future__ import print_function
+import itertools
+import math
+import argparse
+import os
+import sys
+from mako.template import Template
+from mako.exceptions import RichTraceback
+
+def write_template_to_string(template_filename, **kwargs):
+    try:
+        template = Template(filename=os.path.abspath(template_filename))
+        # Split + Join fixes line-endings for whatever platform you are using
+        return '\n'.join(template.render(**kwargs).splitlines())
+    except:
+        traceback = RichTraceback()
+        for (filename, lineno, function, line) in traceback.traceback:
+            print("File %s, line %s, in %s" % (filename, lineno, function))
+            print(line, "\n")
+        print("%s: %s" % (str(traceback.error.__class__.__name__), traceback.error))
+
+def write_template_to_file(template_filename, output_filename, **kwargs):
+    output_dirname = os.path.dirname(output_filename)
+    if not os.path.exists(output_dirname):
+        os.makedirs(output_dirname)
+    with open(output_filename, "w") as outfile:
+        print(write_template_to_string(template_filename, **kwargs), file=outfile)
+
+
+def main(args=sys.argv[1:]):
+    thisDir = os.path.dirname(os.path.realpath(__file__))
+    parser = argparse.ArgumentParser("Generate files and initialization functions for all permutuations of BackendPixelRate.")
+    parser.add_argument('--dim', help="gBackendPixelRateTable array dimensions", nargs='+', type=int, required=True)
+    parser.add_argument('--outdir', help="output directory", nargs='?', type=str, default=thisDir)
+    parser.add_argument('--split', help="how many lines of initialization per file [0=no split]", nargs='?', type=int, default='512')
+    parser.add_argument('--cpp', help="Generate cpp file(s)", action='store_true', default=False)
+    parser.add_argument('--cmake', help="Generate cmake file", action='store_true', default=False)
+
+
+    args = parser.parse_args(args);
+
+    output_list = []
+    for x in args.dim:
+        output_list.append(list(range(x)))
+
+    # generate all permutations possible for template paremeter inputs
+    output_combinations = list(itertools.product(*output_list))
+    output_list = []
+
+    # for each permutation
+    for x in range(len(output_combinations)):
+        # separate each template peram into its own list member
+        new_list = [output_combinations[x][i] for i in range(len(output_combinations[x]))]
+        tempStr = 'gBackendPixelRateTable'
+        #print each list member as an index in the multidimensional array
+        for i in new_list:
+            tempStr += '[' + str(i) + ']'
+        #map each entry in the permuation as its own string member, store as the template instantiation string
+        tempStr += " = BackendPixelRate<SwrBackendTraits<" + ','.join(map(str, output_combinations[x])) + '>>;'
+        #append the line of c++ code in the list of output lines
+        output_list.append(tempStr)
+
+    # how many files should we split the global template initialization into?
+    if (args.split == 0):
+        numFiles = 1
+    else:
+        numFiles = (len(output_list) + args.split - 1) // args.split
+    linesPerFile = (len(output_list) + numFiles - 1) // numFiles
+    chunkedList = [output_list[x:x+linesPerFile] for x in range(0, len(output_list), linesPerFile)]
+
+    # generate .cpp files
+    if args.cpp:
+        baseCppName = os.path.join(args.outdir, 'BackendPixelRate%s.cpp')
+        templateCpp = os.path.join(thisDir, 'templates', 'backend_template.cpp')
+
+        for fileNum in range(numFiles):
+            filename = baseCppName % str(fileNum)
+            print('Generating', filename)
+            write_template_to_file(
+                templateCpp,
+                baseCppName % str(fileNum),
+                fileNum=fileNum,
+                funcList=chunkedList[fileNum])
+
+    # generate gen_backend.cmake file
+    if args.cmake:
+        templateCmake = os.path.join(thisDir, 'templates', 'backend_template.cmake')
+        cmakeFile = os.path.join(args.outdir, 'gen_backends.cmake')
+        print('Generating', cmakeFile)
+        write_template_to_file(
+            templateCmake,
+            cmakeFile,
+            numFiles=numFiles,
+            baseCppName=baseCppName.replace('\\','/'))
+
+    print("Generated %d template instantiations in %d files" % (len(output_list), numFiles))
+
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/templates/backend_template.cpp b/src/gallium/drivers/swr/rasterizer/scripts/templates/backend_template.cpp
new file mode 100644 (file)
index 0000000..f015f5f
--- /dev/null
@@ -0,0 +1,38 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+* 
+* @file BackendPixelRate${fileNum}.cpp
+* 
+* @brief auto-generated file
+* 
+* DO NOT EDIT
+* 
+******************************************************************************/
+
+#include "core/backend.h"
+
+void InitBackendPixelRate${fileNum}()
+{
+    %for func in funcList:
+    ${func}
+    %endfor
+}