From: Tim Rowley Date: Mon, 26 Jun 2017 18:00:27 +0000 (-0500) Subject: swr/rast: Split backend.cpp to improve compile time X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=cae53b24d7a739647193711e9a16c7face7ec72a;p=mesa.git swr/rast: Split backend.cpp to improve compile time Hardcode split to four files currently. Decreases swr build time on a quad-core by ~10%. Reviewed-by: Bruce Cherniak --- diff --git a/src/gallium/drivers/swr/Makefile.am b/src/gallium/drivers/swr/Makefile.am index 6650abda5ae..578f15909b6 100644 --- a/src/gallium/drivers/swr/Makefile.am +++ b/src/gallium/drivers/swr/Makefile.am @@ -34,6 +34,7 @@ COMMON_CXXFLAGS = \ $(LLVM_CXXFLAGS) \ $(SWR_CXX11_CXXFLAGS) \ -I$(builddir)/rasterizer/codegen \ + -I$(builddir)/rasterizer/core \ -I$(builddir)/rasterizer/jitter \ -I$(builddir)/rasterizer/archrast \ -I$(srcdir)/rasterizer \ @@ -62,7 +63,11 @@ BUILT_SOURCES = \ rasterizer/archrast/gen_ar_event.cpp \ rasterizer/archrast/gen_ar_eventhandler.hpp \ rasterizer/archrast/gen_ar_eventhandlerfile.hpp \ - rasterizer/core/gen_BackendPixelRate0.cpp + rasterizer/core/backends/gen_BackendPixelRate0.cpp \ + rasterizer/core/backends/gen_BackendPixelRate1.cpp \ + rasterizer/core/backends/gen_BackendPixelRate2.cpp \ + rasterizer/core/backends/gen_BackendPixelRate3.cpp \ + rasterizer/core/backends/gen_BackendPixelRate.hpp MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D) PYTHON_GEN = $(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS) @@ -140,20 +145,33 @@ rasterizer/archrast/gen_ar_eventhandlerfile.hpp: rasterizer/codegen/gen_archrast --output rasterizer/archrast/gen_ar_eventhandlerfile.hpp \ --gen_eventhandlerfile_h +rasterizer/core/backends/gen_BackendPixelRate0.cpp \ +rasterizer/core/backends/gen_BackendPixelRate1.cpp \ +rasterizer/core/backends/gen_BackendPixelRate2.cpp \ +rasterizer/core/backends/gen_BackendPixelRate3.cpp \ +rasterizer/core/backends/gen_BackendPixelRate.hpp: \ +backend.intermediate + # 5 SWR_MULTISAMPLE_TYPE_COUNT # 2 SWR_MSAA_SAMPLE_PATTERN_COUNT # 3 SWR_INPUT_COVERAGE_COUNT # 2 centroid # 2 forcedSampleCount # 2 canEarlyZ -rasterizer/core/gen_BackendPixelRate0.cpp: rasterizer/codegen/gen_backends.py rasterizer/codegen/templates/gen_backend.cpp + +# use intermediate rule to tell make that all files can be +# generated in one invocation of gen_backends.py (prevents +# parallel make race condition) +.INTERMEDIATE: backend.intermediate +backend.intermediate: rasterizer/codegen/gen_backends.py rasterizer/codegen/templates/gen_backend.cpp rasterizer/codegen/templates/gen_header_init.hpp $(MKDIR_GEN) $(PYTHON_GEN) \ $(srcdir)/rasterizer/codegen/gen_backends.py \ - --outdir rasterizer/core \ + --outdir rasterizer/core/backends \ --dim 5 2 3 2 2 2 \ - --split 0 \ - --cpp + --numfiles 4 \ + --cpp \ + --hpp COMMON_LIBADD = \ $(top_builddir)/src/gallium/auxiliary/libgallium.la \ @@ -227,5 +245,6 @@ EXTRA_DIST = \ rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp \ rasterizer/codegen/templates/gen_backend.cpp \ rasterizer/codegen/templates/gen_builder.hpp \ + rasterizer/codegen/templates/gen_header_init.hpp \ rasterizer/codegen/templates/gen_knobs.cpp \ rasterizer/codegen/templates/gen_llvm.hpp diff --git a/src/gallium/drivers/swr/Makefile.sources b/src/gallium/drivers/swr/Makefile.sources index a1172b72cad..d9894c26015 100644 --- a/src/gallium/drivers/swr/Makefile.sources +++ b/src/gallium/drivers/swr/Makefile.sources @@ -73,7 +73,11 @@ CORE_CXX_SOURCES := \ rasterizer/core/api.h \ rasterizer/core/arena.h \ rasterizer/core/backend.cpp \ + rasterizer/core/backend_clear.cpp \ + rasterizer/core/backend_sample.cpp \ + rasterizer/core/backend_singlesample.cpp \ rasterizer/core/backend.h \ + rasterizer/core/backend_impl.h \ rasterizer/core/binner.cpp \ rasterizer/core/binner.h \ rasterizer/core/blend.h \ diff --git a/src/gallium/drivers/swr/SConscript b/src/gallium/drivers/swr/SConscript index cdb85e2cad4..0f3cd6c8aa3 100644 --- a/src/gallium/drivers/swr/SConscript +++ b/src/gallium/drivers/swr/SConscript @@ -140,12 +140,22 @@ Depends('rasterizer/jitter/gen_state_llvm.h', # 2 centroid # 2 forcedSampleCount # 2 canEarlyZ +backendPixelRateFileCount = 4 +backendPixelRateFilePat = "rasterizer/core/backends/gen_BackendPixelRate%s.cpp" +backendPixelRateFiles = map(lambda x: backendPixelRateFilePat % x, + range(0, backendPixelRateFileCount)) env.CodeGenerate( - target = 'rasterizer/core/gen_BackendPixelRate0.cpp', + target = 'rasterizer/core/backends/gen_BackendPixelRate.hpp', script = swrroot + 'rasterizer/codegen/gen_backends.py', source = '', - command = python_cmd + ' $SCRIPT --outdir ' + bldroot + '/rasterizer/core --dim 5 2 3 2 2 2 --split 0 --cpp' -) + command = python_cmd + ' $SCRIPT --outdir ' + bldroot + '/rasterizer/core/backends --dim 5 2 3 2 2 2 --numfiles ' + str(backendPixelRateFileCount) + ' --cpp --hpp' + ) +Depends(backendPixelRateFiles, + ['rasterizer/core/backends/gen_BackendPixelRate.hpp', + 'rasterizer/archrast/gen_ar_event.hpp', + 'rasterizer/codegen/gen_knobs.h'] + ) + Depends('rasterizer/jitter/gen_state_llvm.h', swrroot + 'rasterizer/codegen/templates/gen_backend.cpp') @@ -153,9 +163,10 @@ Depends('rasterizer/jitter/gen_state_llvm.h', built_sources = [ 'rasterizer/codegen/gen_knobs.cpp', 'rasterizer/archrast/gen_ar_event.cpp', - 'rasterizer/core/gen_BackendPixelRate0.cpp', ] +built_sources += backendPixelRateFiles + source = built_sources source += env.ParseSourceList(swrroot + 'Makefile.sources', [ 'CXX_SOURCES', diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py index f65f7648c41..3f0790c8dae 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py +++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py @@ -35,7 +35,9 @@ def main(args=sys.argv[1:]): parser.add_argument('--dim', help='gBackendPixelRateTable array dimensions', nargs='+', type=int, required=True) parser.add_argument('--outdir', help='output directory', nargs='?', type=str, default=thisDir) parser.add_argument('--split', help='how many lines of initialization per file [0=no split]', nargs='?', type=int, default='512') + parser.add_argument('--numfiles', help='how many output files to generate', nargs='?', type=int, default='0') parser.add_argument('--cpp', help='Generate cpp file(s)', action='store_true', default=False) + parser.add_argument('--hpp', help='Generate hpp file', action='store_true', default=False) parser.add_argument('--cmake', help='Generate cmake file', action='store_true', default=False) args = parser.parse_args(args); @@ -43,11 +45,14 @@ def main(args=sys.argv[1:]): class backendStrs : def __init__(self) : self.outFileName = 'gen_BackendPixelRate%s.cpp' + self.outHeaderName = 'gen_BackendPixelRate.hpp' self.functionTableName = 'gBackendPixelRateTable' self.funcInstanceHeader = ' = BackendPixelRate #include "backend.h" +#include "backend_impl.h" #include "tilemgr.h" #include "memory/tilingtraits.h" #include "core/multisample.h" +#include "backends/gen_BackendPixelRate.hpp" #include -typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, uint32_t, DWORD[4], const SWR_RECT& rect); -static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS]; - ////////////////////////////////////////////////////////////////////////// /// @brief Process compute work. @@ -103,238 +102,6 @@ void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi SWR_ASSERT(x == 0 && y == 0); } -template -void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value) -{ - auto lambda = [&](int32_t comp) - { - FormatTraits::storeSOA(comp, pTileBuffer, value.v[comp]); - - pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits::GetBPC(comp) / 8); - }; - - const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM); - - for (uint32_t i = 0; i < numIter; ++i) - { - UnrollerL<0, FormatTraits::numComps, 1>::step(lambda); - } -} - -#if USE_8x2_TILE_BACKEND -template -void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value) -{ - auto lambda = [&](int32_t comp) - { - FormatTraits::storeSOA(comp, pTileBuffer, value.v[comp]); - - pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits::GetBPC(comp) / 8); - }; - - const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM); - - for (uint32_t i = 0; i < numIter; ++i) - { - UnrollerL<0, FormatTraits::numComps, 1>::step(lambda); - } -} - -#endif -template -INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, uint32_t renderTargetArrayIndex, DWORD clear[4], const SWR_RECT& rect) -{ - // convert clear color to hottile format - // clear color is in RGBA float/uint32 -#if USE_8x2_TILE_BACKEND - simd16vector vClear; - for (uint32_t comp = 0; comp < FormatTraits::numComps; ++comp) - { - simd16scalar vComp; - vComp = _simd16_load1_ps((const float*)&clear[comp]); - if (FormatTraits::isNormalized(comp)) - { - vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits::fromFloat(comp))); - vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp)); - } - vComp = FormatTraits::pack(comp, vComp); - vClear.v[FormatTraits::swizzle(comp)] = vComp; - } - -#else - simdvector vClear; - for (uint32_t comp = 0; comp < FormatTraits::numComps; ++comp) - { - simdscalar vComp; - vComp = _simd_load1_ps((const float*)&clear[comp]); - if (FormatTraits::isNormalized(comp)) - { - vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits::fromFloat(comp))); - vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp)); - } - vComp = FormatTraits::pack(comp, vComp); - vClear.v[FormatTraits::swizzle(comp)] = vComp; - } - -#endif - uint32_t tileX, tileY; - MacroTileMgr::getTileIndices(macroTile, tileX, tileY); - - // Init to full macrotile - SWR_RECT clearTile = - { - KNOB_MACROTILE_X_DIM * int32_t(tileX), - KNOB_MACROTILE_Y_DIM * int32_t(tileY), - KNOB_MACROTILE_X_DIM * int32_t(tileX + 1), - KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1), - }; - - // intersect with clear rect - clearTile &= rect; - - // translate to local hottile origin - clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, -int32_t(tileY) * KNOB_MACROTILE_Y_DIM); - - // Make maximums inclusive (needed for convert to raster tiles) - clearTile.xmax -= 1; - clearTile.ymax -= 1; - - // convert to raster tiles - clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT); - clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT); - clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT); - clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT); - - const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount); - // compute steps between raster tile samples / raster tiles / macro tile rows - const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8; - const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8)) * numSamples; - const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep; - const uint32_t pitch = (FormatTraits::bpp * KNOB_MACROTILE_X_DIM / 8); - - HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples, renderTargetArrayIndex); - uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits::bpp > >(pitch, clearTile.xmin, clearTile.ymin)) * numSamples; - uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits::bpp > >(pitch, x, y)) * numSamples; - - // loop over all raster tiles in the current hot tile - for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y) - { - uint8_t* pRasterTile = pRasterTileRow; - for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x) - { - for( int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++) - { - ClearRasterTile(pRasterTile, vClear); - pRasterTile += rasterTileSampleStep; - } - } - pRasterTileRow += macroTileRowStep; - } - - pHotTile->state = HOTTILE_DIRTY; -} - - -void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) -{ - SWR_CONTEXT *pContext = pDC->pContext; - - if (KNOB_FAST_CLEAR) - { - CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData; - SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount; - uint32_t numSamples = GetNumSamples(sampleCount); - - SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason. - - AR_BEGIN(BEClear, pDC->drawId); - - if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR) - { - unsigned long rt = 0; - uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR; - while (_BitScanForward(&rt, mask)) - { - mask &= ~(1 << rt); - - HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)rt, true, numSamples, pClear->renderTargetArrayIndex); - - // All we want to do here is to mark the hot tile as being in a "needs clear" state. - pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]); - pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]); - pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]); - pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]); - pHotTile->state = HOTTILE_CLEAR; - } - } - - if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT) - { - HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples, pClear->renderTargetArrayIndex); - pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth; - pHotTile->state = HOTTILE_CLEAR; - } - - if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT) - { - HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples, pClear->renderTargetArrayIndex); - - pHotTile->clearData[0] = pClear->clearStencil; - pHotTile->state = HOTTILE_CLEAR; - } - - AR_END(BEClear, 1); - } - else - { - // Legacy clear - CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData; - AR_BEGIN(BEClear, pDC->drawId); - - if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR) - { - DWORD clearData[4]; - clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]); - clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]); - clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]); - clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]); - - PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT]; - SWR_ASSERT(pfnClearTiles != nullptr); - - unsigned long rt = 0; - uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR; - while (_BitScanForward(&rt, mask)) - { - mask &= ~(1 << rt); - - pfnClearTiles(pDC, (SWR_RENDERTARGET_ATTACHMENT)rt, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect); - } - } - - if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT) - { - DWORD clearData[4]; - clearData[0] = *(DWORD*)&pClear->clearDepth; - PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT]; - SWR_ASSERT(pfnClearTiles != nullptr); - - pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect); - } - - if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT) - { - DWORD clearData[4]; - clearData[0] = pClear->clearStencil; - PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT]; - - pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect); - } - - AR_END(BEClear, 1); - } -} - void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc, SWR_RENDERTARGET_ATTACHMENT attachment) { @@ -368,7 +135,7 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile // clear if clear is pending (i.e., not rendered to), then mark as dirty for store. if (pHotTile->state == HOTTILE_CLEAR) { - PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat]; + PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat]; SWR_ASSERT(pfnClearTiles != nullptr); pfnClearTiles(pDC, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect); @@ -429,457 +196,6 @@ void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint3 } } -#if KNOB_SIMD_WIDTH == 8 -const simdscalar vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5}; -const simdscalar vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5}; -const simdscalar vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0}; -const simdscalar vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}; -#else -#error Unsupported vector width -#endif - -simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ) -{ - simdscalar vClipMask = _simd_setzero_ps(); - uint32_t numClipDistance = _mm_popcnt_u32(clipMask); - - for (uint32_t i = 0; i < numClipDistance; ++i) - { - // pull triangle clip distance values from clip buffer - simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++); - simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++); - simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++); - - // interpolate - simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ); - - // clip if interpolated clip distance is < 0 || NAN - simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ); - - vClipMask = _simd_or_ps(vClipMask, vCull); - } - - return _simd_movemask_ps(vClipMask); -} - -template -void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) -{ - SWR_CONTEXT *pContext = pDC->pContext; - - AR_BEGIN(BESingleSampleBackend, pDC->drawId); - AR_BEGIN(BESetup, pDC->drawId); - - const API_STATE &state = GetApiState(pDC); - - BarycentricCoeffs coeffs; - SetupBarycentricCoeffs(&coeffs, work); - - uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer; - SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers); - - SWR_PS_CONTEXT psContext; - const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions; - SetupPixelShaderContext(&psContext, samplePos, work); - - AR_END(BESetup, 1); - - psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast(y))); - psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast(y))); - - const simdscalar dy = _simd_set1_ps(static_cast(SIMD_TILE_Y_DIM)); - - for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) - { - psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast(x))); - psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast(x))); - - const simdscalar dx = _simd_set1_ps(static_cast(SIMD_TILE_X_DIM)); - - for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) - { -#if USE_8x2_TILE_BACKEND - const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); - -#endif - simdmask coverageMask = work.coverageMask[0] & MASK; - - if (coverageMask) - { - if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable) - { - static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); - - const simdscalar z = _simd_load_ps(reinterpret_cast(pDepthBuffer)); - - const float minz = state.depthBoundsState.depthBoundsTestMinValue; - const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; - - coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz); - } - - if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) - { - const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0]; - - generateInputCoverage(pCoverageMask, psContext.inputMask, state.blendState.sampleMask); - } - - AR_BEGIN(BEBarycentric, pDC->drawId); - - CalcPixelBarycentrics(coeffs, psContext); - - CalcCentroid(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask); - - // interpolate and quantize z - psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); - psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); - - AR_END(BEBarycentric, 1); - - // interpolate user clip distance if available - if (state.rastState.clipDistanceMask) - { - coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center); - } - - simdscalar vCoverageMask = vMask(coverageMask); - simdscalar depthPassMask = vCoverageMask; - simdscalar stencilPassMask = vCoverageMask; - - // Early-Z? - if (T::bCanEarlyZ) - { - AR_BEGIN(BEEarlyDepthTest, pDC->drawId); - depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, - psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask); - AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask))); - AR_END(BEEarlyDepthTest, 0); - - // early-exit if no pixels passed depth or earlyZ is forced on - if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask)) - { - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, - pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask); - - if (!_simd_movemask_ps(depthPassMask)) - { - goto Endtile; - } - } - } - - psContext.sampleIndex = 0; - psContext.activeMask = _simd_castps_si(vCoverageMask); - - // execute pixel shader - AR_BEGIN(BEPixelShader, pDC->drawId); - UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask))); - state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); - AR_END(BEPixelShader, 0); - - vCoverageMask = _simd_castsi_ps(psContext.activeMask); - - // late-Z - if (!T::bCanEarlyZ) - { - AR_BEGIN(BELateDepthTest, pDC->drawId); - depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, - psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask); - AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask))); - AR_END(BELateDepthTest, 0); - - if (!_simd_movemask_ps(depthPassMask)) - { - // need to call depth/stencil write for stencil write - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, - pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask); - goto Endtile; - } - } else { - // for early z, consolidate discards from shader - // into depthPassMask - depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask); - } - - uint32_t statMask = _simd_movemask_ps(depthPassMask); - uint32_t statCount = _mm_popcnt_u32(statMask); - UPDATE_STAT_BE(DepthPassCount, statCount); - - // output merger - AR_BEGIN(BEOutputMerger, pDC->drawId); -#if USE_8x2_TILE_BACKEND - OutputMerger8x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset); -#else - OutputMerger4x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets); -#endif - - // do final depth write after all pixel kills - if (!state.psState.forceEarlyZ) - { - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, - pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask); - } - AR_END(BEOutputMerger, 0); - } - -Endtile: - AR_BEGIN(BEEndTile, pDC->drawId); - - work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) - { - work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - } - -#if USE_8x2_TILE_BACKEND - if (useAlternateOffset) - { - for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) - { - pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; - } - } -#else - for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) - { - pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; - } -#endif - pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; - pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; - - AR_END(BEEndTile, 0); - - psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); - psContext.vX.center = _simd_add_ps(psContext.vX.center, dx); - } - - psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); - psContext.vY.center = _simd_add_ps(psContext.vY.center, dy); - } - - AR_END(BESingleSampleBackend, 0); -} - -template -void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) -{ - SWR_CONTEXT *pContext = pDC->pContext; - - AR_BEGIN(BESampleRateBackend, pDC->drawId); - AR_BEGIN(BESetup, pDC->drawId); - - const API_STATE &state = GetApiState(pDC); - - BarycentricCoeffs coeffs; - SetupBarycentricCoeffs(&coeffs, work); - - uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer; - SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers); - - SWR_PS_CONTEXT psContext; - const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions; - SetupPixelShaderContext(&psContext, samplePos, work); - - AR_END(BESetup, 0); - - psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast(y))); - psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast(y))); - - const simdscalar dy = _simd_set1_ps(static_cast(SIMD_TILE_Y_DIM)); - - for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) - { - psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast(x))); - psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast(x))); - - const simdscalar dx = _simd_set1_ps(static_cast(SIMD_TILE_X_DIM)); - - for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) - { -#if USE_8x2_TILE_BACKEND - const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); - -#endif - if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) - { - const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0]; - - generateInputCoverage(pCoverageMask, psContext.inputMask, state.blendState.sampleMask); - } - - AR_BEGIN(BEBarycentric, pDC->drawId); - - CalcPixelBarycentrics(coeffs, psContext); - - CalcCentroid(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask); - - AR_END(BEBarycentric, 0); - - for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++) - { - simdmask coverageMask = work.coverageMask[sample] & MASK; - - if (coverageMask) - { - // offset depth/stencil buffers current sample - uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); - uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); - - if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable) - { - static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); - - const simdscalar z = _simd_load_ps(reinterpret_cast(pDepthSample)); - - const float minz = state.depthBoundsState.depthBoundsTestMinValue; - const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; - - coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz); - } - - AR_BEGIN(BEBarycentric, pDC->drawId); - - // calculate per sample positions - psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample)); - psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample)); - - CalcSampleBarycentrics(coeffs, psContext); - - // interpolate and quantize z - psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); - psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); - - AR_END(BEBarycentric, 0); - - // interpolate user clip distance if available - if (state.rastState.clipDistanceMask) - { - coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample); - } - - simdscalar vCoverageMask = vMask(coverageMask); - simdscalar depthPassMask = vCoverageMask; - simdscalar stencilPassMask = vCoverageMask; - - // Early-Z? - if (T::bCanEarlyZ) - { - AR_BEGIN(BEEarlyDepthTest, pDC->drawId); - depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, - psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); - AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask))); - AR_END(BEEarlyDepthTest, 0); - - // early-exit if no samples passed depth or earlyZ is forced on. - if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask)) - { - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, - pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); - - if (!_simd_movemask_ps(depthPassMask)) - { - work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - continue; - } - } - } - - psContext.sampleIndex = sample; - psContext.activeMask = _simd_castps_si(vCoverageMask); - - // execute pixel shader - AR_BEGIN(BEPixelShader, pDC->drawId); - UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask))); - state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); - AR_END(BEPixelShader, 0); - - vCoverageMask = _simd_castsi_ps(psContext.activeMask); - - // late-Z - if (!T::bCanEarlyZ) - { - AR_BEGIN(BELateDepthTest, pDC->drawId); - depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, - psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); - AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask))); - AR_END(BELateDepthTest, 0); - - if (!_simd_movemask_ps(depthPassMask)) - { - // need to call depth/stencil write for stencil write - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, - pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); - - work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - continue; - } - } - - uint32_t statMask = _simd_movemask_ps(depthPassMask); - uint32_t statCount = _mm_popcnt_u32(statMask); - UPDATE_STAT_BE(DepthPassCount, statCount); - - // output merger - AR_BEGIN(BEOutputMerger, pDC->drawId); -#if USE_8x2_TILE_BACKEND - OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset); -#else - OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets); -#endif - - // do final depth write after all pixel kills - if (!state.psState.forceEarlyZ) - { - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, - pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); - } - AR_END(BEOutputMerger, 0); - } - work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - } - -Endtile: - ATTR_UNUSED; - - AR_BEGIN(BEEndTile, pDC->drawId); - - if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) - { - work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - } - -#if USE_8x2_TILE_BACKEND - if (useAlternateOffset) - { - for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) - { - pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; - } - } -#else - for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) - { - pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; - } -#endif - pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; - pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; - - AR_END(BEEndTile, 0); - - psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); - psContext.vX.center = _simd_add_ps(psContext.vX.center, dx); - } - - psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); - psContext.vY.center = _simd_add_ps(psContext.vY.center, dy); - } - - AR_END(BESampleRateBackend, 0); -} -// optimized backend flow with NULL PS template void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) { @@ -977,7 +293,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, UPDATE_STAT_BE(DepthPassCount, statCount); } -Endtile: + Endtile: ATTR_UNUSED; work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); } @@ -994,17 +310,7 @@ Endtile: AR_END(BENullBackend, 0); } -void InitClearTilesTable() -{ - memset(sClearTilesTable, 0, sizeof(sClearTilesTable)); - - sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile; - sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile; - sClearTilesTable[R32_FLOAT] = ClearMacroTile; - sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile; - sClearTilesTable[R8_UINT] = ClearMacroTile; -} - +PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS] = {}; PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT]; PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT] [2] // centroid @@ -1023,113 +329,10 @@ PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT] [2] // canEarlyZ = {}; -// Recursive template used to auto-nest conditionals. Converts dynamic enum function -// arguments to static template arguments. -template -struct BEChooser -{ - // Last Arg Terminator - static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg) - { - switch(tArg) - { - case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample>; break; - case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate>; break; - case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate>; break; - default: - SWR_ASSERT(0 && "Invalid backend func\n"); - return nullptr; - break; - } - } - - // Recursively parse args - template - static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs) - { - switch(tArg) - { - case SWR_INPUT_COVERAGE_NONE: return BEChooser::GetFunc(remainingArgs...); break; - case SWR_INPUT_COVERAGE_NORMAL: return BEChooser::GetFunc(remainingArgs...); break; - case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooser::GetFunc(remainingArgs...); break; - default: - SWR_ASSERT(0 && "Invalid sample pattern\n"); - return BEChooser::GetFunc(remainingArgs...); - break; - } - } - - // Recursively parse args - template - static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs) - { - switch(tArg) - { - case SWR_MULTISAMPLE_1X: return BEChooser::GetFunc(remainingArgs...); break; - case SWR_MULTISAMPLE_2X: return BEChooser::GetFunc(remainingArgs...); break; - case SWR_MULTISAMPLE_4X: return BEChooser::GetFunc(remainingArgs...); break; - case SWR_MULTISAMPLE_8X: return BEChooser::GetFunc(remainingArgs...); break; - case SWR_MULTISAMPLE_16X: return BEChooser::GetFunc(remainingArgs...); break; - default: - SWR_ASSERT(0 && "Invalid sample count\n"); - return BEChooser::GetFunc(remainingArgs...); - break; - } - } - - // Recursively parse args - template - static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs) - { - if(tArg == true) - { - return BEChooser::GetFunc(remainingArgs...); - } - - return BEChooser::GetFunc(remainingArgs...); - } -}; - -void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2]) -{ - for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++) - { - for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++) - { - for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) - { - table[inputCoverage][isCentroid][canEarlyZ] = - BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, false, (SWR_INPUT_COVERAGE)inputCoverage, - (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE); - } - } - } -} - -void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]) -{ - for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++) - { - for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++) - { - for(uint32_t centroid = 0; centroid < 2; centroid++) - { - for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) - { - table[sampleCount][inputCoverage][centroid][canEarlyZ] = - BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, false, (SWR_INPUT_COVERAGE)inputCoverage, - (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE); - } - } - } - } -} - -void InitBackendPixelRate0(); void InitBackendFuncTables() { + InitBackendPixelRate(); InitBackendSingleFuncTable(gBackendSingleSample); - InitBackendPixelRate0(); InitBackendSampleFuncTable(gBackendSampleRateTable); gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ; diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h index 035948652bc..c8c37e65257 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend.h @@ -40,1022 +40,23 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); -void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers); -simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ); -void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext); -extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT] - [2] // isCenterPattern - [SWR_INPUT_COVERAGE_COUNT] - [2] // centroid - [2] // forcedSampleCount - [2] // canEarlyZ - ; - -enum SWR_BACKEND_FUNCS -{ - SWR_BACKEND_SINGLE_SAMPLE, - SWR_BACKEND_MSAA_PIXEL_RATE, - SWR_BACKEND_MSAA_SAMPLE_RATE, - SWR_BACKEND_FUNCS_MAX, -}; - -#if KNOB_SIMD_WIDTH == 8 -extern const simdscalar vCenterOffsetsX; -extern const simdscalar vCenterOffsetsY; -extern const simdscalar vULOffsetsX; -extern const simdscalar vULOffsetsY; -#define MASK 0xff -#endif - -INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) -{ - static const uint32_t RasterTileColorOffsets[16] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, - }; - assert(sampleNum < 16); - return RasterTileColorOffsets[sampleNum]; -} - -INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) -{ - static const uint32_t RasterTileDepthOffsets[16] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, - }; - assert(sampleNum < 16); - return RasterTileDepthOffsets[sampleNum]; -} - -INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) -{ - static const uint32_t RasterTileStencilOffsets[16] - { 0, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, - (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, - }; - assert(sampleNum < 16); - return RasterTileStencilOffsets[sampleNum]; -} - -template -struct generateInputCoverage -{ - INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask) - { - // will need to update for avx512 - assert(KNOB_SIMD_WIDTH == 8); - - simdscalari mask[2]; - simdscalari sampleCoverage[2]; - - if(T::bIsCenterPattern) - { - // center coverage is the same for all samples; just broadcast to the sample slots - uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK); - if(T::MultisampleT::numSamples == 1) - { - sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage); - } - else if(T::MultisampleT::numSamples == 2) - { - sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage); - } - else if(T::MultisampleT::numSamples == 4) - { - sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage); - } - else if(T::MultisampleT::numSamples == 8) - { - sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); - } - else if(T::MultisampleT::numSamples == 16) - { - sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); - sampleCoverage[1] = _mm256_set1_epi32(centerCoverage); - } - } - else - { - __m256i src = _mm256_set1_epi32(0); - __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1; - - if(T::MultisampleT::numSamples == 1) - { - mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); - } - else if(T::MultisampleT::numSamples == 2) - { - mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); - } - else if(T::MultisampleT::numSamples == 4) - { - mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); - } - else if(T::MultisampleT::numSamples == 8) - { - mask[0] = _mm256_set1_epi32(-1); - } - else if(T::MultisampleT::numSamples == 16) - { - mask[0] = _mm256_set1_epi32(-1); - mask[1] = _mm256_set1_epi32(-1); - index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8); - } - - // gather coverage for samples 0-7 - sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8)); - if(T::MultisampleT::numSamples > 8) - { - // gather coverage for samples 8-15 - sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8)); - } - } - - mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0); - // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane - simdscalari packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]); - - simdscalari packedCoverage1; - if(T::MultisampleT::numSamples > 8) - { - // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane - packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]); - } - - #if (KNOB_ARCH == KNOB_ARCH_AVX) - // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane - simdscalari hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83); - simdscalar shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); - packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE)); - - simdscalari packedSampleCoverage; - if(T::MultisampleT::numSamples > 8) - { - // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane - hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83); - shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); - shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE); - packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01))); - packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC)); - } - else - { - packedSampleCoverage = packedCoverage0; - } - #else - simdscalari permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0); - // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane - packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask); - - simdscalari packedSampleCoverage; - if(T::MultisampleT::numSamples > 8) - { - permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7); - // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane - packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask); - - // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane - packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C); - } - else - { - packedSampleCoverage = packedCoverage0; - } - #endif - - for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--) - { - // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2 - inputMask[i] = _simd_movemask_epi8(packedSampleCoverage); - - if(!T::bForcedSampleCount) - { - // input coverage has to be anded with sample mask if MSAA isn't forced on - inputMask[i] &= sampleMask; - } - - // shift to the next pixel in the 4x2 - packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1); - } - } - - INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask) - { - uint32_t inputMask[KNOB_SIMD_WIDTH]; - generateInputCoverage(coverageMask, inputMask, sampleMask); - inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0])); - } - -}; - -template -struct generateInputCoverage -{ - INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask) - { - // will need to update for avx512 - assert(KNOB_SIMD_WIDTH == 8); - simdscalari vec = _mm256_set1_epi32(coverageMask[0]); - const simdscalari bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); - vec = _simd_and_si(vec, bit); - vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec); - vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec); - inputCoverage = _simd_castsi_ps(vec); - } - - INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask) - { - uint32_t simdCoverage = (coverageMask[0] & MASK); - static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1; - for(int i = 0; i < KNOB_SIMD_WIDTH; i++) - { - // set all samples to covered if conservative coverage mask is set for that pixel - inputMask[i] = (((1 << i) & simdCoverage) > 0) ? FullCoverageMask : 0; - } - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Centroid behaves exactly as follows : -// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to -// have a sample location there). -// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the -// coverage with the SampleMask Rasterizer State. -// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is -// evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the -// SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point. -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -template -INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS& samplePos, - const uint64_t *const coverageMask, const uint32_t sampleMask, - const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL) -{ - uint32_t inputMask[KNOB_SIMD_WIDTH]; - generateInputCoverage(coverageMask, inputMask, sampleMask); - - // Case (2) - partially covered pixel - - // scan for first covered sample per pixel in the 4x2 span - unsigned long sampleNum[KNOB_SIMD_WIDTH]; - (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0); - (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0); - (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0); - (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0); - (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0); - (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0); - (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0); - (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0); - - // look up and set the sample offsets from UL pixel corner for first covered sample - __m256 vXSample = _mm256_set_ps(samplePos.X(sampleNum[7]), - samplePos.X(sampleNum[6]), - samplePos.X(sampleNum[5]), - samplePos.X(sampleNum[4]), - samplePos.X(sampleNum[3]), - samplePos.X(sampleNum[2]), - samplePos.X(sampleNum[1]), - samplePos.X(sampleNum[0])); - - __m256 vYSample = _mm256_set_ps(samplePos.Y(sampleNum[7]), - samplePos.Y(sampleNum[6]), - samplePos.Y(sampleNum[5]), - samplePos.Y(sampleNum[4]), - samplePos.Y(sampleNum[3]), - samplePos.Y(sampleNum[2]), - samplePos.Y(sampleNum[1]), - samplePos.Y(sampleNum[0])); - // add sample offset to UL pixel corner - vXSample = _simd_add_ps(vXSamplePosUL, vXSample); - vYSample = _simd_add_ps(vYSamplePosUL, vYSample); - - // Case (1) and case (3b) - All samples covered or not covered with full SampleMask - static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask(); - simdscalari vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]); - simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask); - - static const simdscalari vZero = _simd_setzero_si(); - const simdscalari vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask); - simdscalari vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero); - simdscalari vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask); - simdscalari vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask); - - simdscalari vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b); - - // set the centroid position based on results from above - psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter)); - psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter)); - - // Case (3a) No samples covered and partial sample mask - simdscalari vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask); - // sample mask should never be all 0's for this case, but handle it anyways - unsigned long firstCoveredSampleMaskSample = 0; - (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0); - - simdscalari vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples); - - vXSample = _simd_set1_ps(samplePos.X(firstCoveredSampleMaskSample)); - vYSample = _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample)); - - // blend in case 3a pixel locations - psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a)); - psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a)); -} - -INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext, - const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL) -{ - // evaluate I,J - psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid); - psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid); - psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet); - psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet); - - // interpolate 1/w - psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid); -} - -INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar z, float minz, float maxz) -{ - const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz)); - const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz)); - - return _simd_movemask_ps(_simd_and_ps(minzMask, maxzMask)); -} - -template -INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount) -{ - // RT has to be single sample if we're in forcedMSAA mode - if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X)) - { - return 1; - } - // unless we're forced to single sample, in which case we run the OM at the sample count of the RT - else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X)) - { - return GetNumSamples(blendSampleCount); - } - // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count - else - { - return T::MultisampleT::numSamples; - } -} - -inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE_DESC &work) -{ - // broadcast scalars - - coeffs->vIa = _simd_broadcast_ss(&work.I[0]); - coeffs->vIb = _simd_broadcast_ss(&work.I[1]); - coeffs->vIc = _simd_broadcast_ss(&work.I[2]); - - coeffs->vJa = _simd_broadcast_ss(&work.J[0]); - coeffs->vJb = _simd_broadcast_ss(&work.J[1]); - coeffs->vJc = _simd_broadcast_ss(&work.J[2]); - - coeffs->vZa = _simd_broadcast_ss(&work.Z[0]); - coeffs->vZb = _simd_broadcast_ss(&work.Z[1]); - coeffs->vZc = _simd_broadcast_ss(&work.Z[2]); - - coeffs->vRecipDet = _simd_broadcast_ss(&work.recipDet); - - coeffs->vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]); - coeffs->vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]); - coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]); -} - -inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorBufferCount, RenderOutputBuffers &renderBuffers) -{ - assert(colorBufferCount <= SWR_NUM_RENDERTARGETS); - - if (pColorBuffer) - { - for (uint32_t index = 0; index < colorBufferCount; index += 1) - { - pColorBuffer[index] = renderBuffers.pColor[index]; - } - } - - if (pDepthBuffer) - { - *pDepthBuffer = renderBuffers.pDepth; - } - - if (pStencilBuffer) - { - *pStencilBuffer = renderBuffers.pStencil;; - } -} - -template -void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos, SWR_TRIANGLE_DESC &work) -{ - psContext->pAttribs = work.pAttribs; - psContext->pPerspAttribs = work.pPerspAttribs; - psContext->frontFace = work.triFlags.frontFacing; - psContext->renderTargetArrayIndex = work.triFlags.renderTargetArrayIndex; - - // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs - psContext->I = work.I; - psContext->J = work.J; - - psContext->recipDet = work.recipDet; - psContext->pRecipW = work.pRecipW; - psContext->pSamplePosX = samplePos.X();//reinterpret_cast(&T::MultisampleT::samplePosX); - psContext->pSamplePosY = samplePos.Y();//reinterpret_cast(&T::MultisampleT::samplePosY); - psContext->rasterizerSampleCount = T::MultisampleT::numSamples; - psContext->sampleIndex = 0; -} - -template -void CalcCentroid(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos, - const BarycentricCoeffs &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask) -{ - if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different - { - // for 1x case, centroid is pixel center - psContext->vX.centroid = psContext->vX.center; - psContext->vY.centroid = psContext->vY.center; - psContext->vI.centroid = psContext->vI.center; - psContext->vJ.centroid = psContext->vJ.center; - psContext->vOneOverW.centroid = psContext->vOneOverW.center; - } - else - { - if (T::bCentroidPos) - { - ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid - if (T::bIsCenterPattern) - { - psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f)); - psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f)); - } - else - { - // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'.. - CalcCentroidPos(*psContext, samplePos, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL); - } - - CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL); - } - else - { - psContext->vX.centroid = psContext->vX.sample; - psContext->vY.centroid = psContext->vY.sample; - } - } -} - -template -struct PixelRateZTestLoop -{ - PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState, - uint8_t*& depthBuffer, uint8_t*& stencilBuffer, const uint8_t ClipDistanceMask) : - pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState), - samplePos(state.rastState.samplePositions), - clipDistanceMask(ClipDistanceMask), pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){}; - - INLINE - uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext, - const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0) - { - SWR_CONTEXT *pContext = pDC->pContext; - - uint32_t statCount = 0; - simdscalar anyDepthSamplePassed = _simd_setzero_ps(); - for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++) - { - const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample]; - vCoverageMask[sample] = _simd_and_ps(activeLanes, vMask(pCoverageMask[currentSimdIn8x8] & MASK)); - - if(!_simd_movemask_ps(vCoverageMask[sample])) - { - vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps(); - continue; - } - - // offset depth/stencil buffers current sample - uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); - uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); - - if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable) - { - static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); - - const simdscalar z = _simd_load_ps(reinterpret_cast(pDepthSample)); - - const float minz = state.depthBoundsState.depthBoundsTestMinValue; - const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; - - vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(CalcDepthBoundsAcceptMask(z, minz, maxz))); - } - - AR_BEGIN(BEBarycentric, pDC->drawId); - - // calculate per sample positions - psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample)); - psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample)); - - // calc I & J per sample - CalcSampleBarycentrics(coeffs, psContext); - - if(psState.writesODepth) - { - { - // broadcast and test oDepth(psContext.vZ) written from the PS for each sample - vZ[sample] = psContext.vZ; - } - } - else - { - vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); - vZ[sample] = state.pfnQuantizeDepth(vZ[sample]); - } - - AR_END(BEBarycentric, 0); - - ///@todo: perspective correct vs non-perspective correct clipping? - // if clip distances are enabled, we need to interpolate for each sample - if(clipDistanceMask) - { - uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample); - - vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask)); - } +typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, uint32_t, DWORD[4], const SWR_RECT& rect); - // ZTest for this sample - ///@todo Need to uncomment out this bucket. - //AR_BEGIN(BEDepthBucket, pDC->drawId); - depthPassMask[sample] = vCoverageMask[sample]; - stencilPassMask[sample] = vCoverageMask[sample]; - depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, - vZ[sample], pDepthSample, vCoverageMask[sample], - pStencilSample, &stencilPassMask[sample]); - //AR_END(BEDepthBucket, 0); - - // early-exit if no pixels passed depth or earlyZ is forced on - if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample])) - { - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample], - pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]); - - if(!_simd_movemask_ps(depthPassMask[sample])) - { - continue; - } - } - anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]); - uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]); - statCount += _mm_popcnt_u32(statMask); - } - - activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes); - // return number of samples that passed depth and coverage - return statCount; - } - - // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite - simdscalar vZ[T::MultisampleT::numCoverageSamples]; - simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples]; - simdscalar depthPassMask[T::MultisampleT::numCoverageSamples]; - simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples]; - -private: - // functor inputs - DRAW_CONTEXT* pDC; - uint32_t workerId; - - const SWR_TRIANGLE_DESC& work; - const BarycentricCoeffs& coeffs; - const API_STATE& state; - const SWR_PS_STATE& psState; - const SWR_MULTISAMPLE_POS& samplePos; - const uint8_t clipDistanceMask; - uint8_t*& pDepthBuffer; - uint8_t*& pStencilBuffer; -}; - -INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext) -{ - // evaluate I,J - psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center); - psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center); - psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet); - psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet); - - // interpolate 1/w - psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center); -} - -INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext) -{ - // evaluate I,J - psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample); - psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample); - psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet); - psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet); - - // interpolate 1/w - psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample); -} - -// Merge Output to 4x2 SIMD Tile Format -INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState, - const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT) -{ - // type safety guaranteed from template instantiation in BEChooser<>::GetFunc - const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample); - simdvector blendOut; - - for(uint32_t rt = 0; rt < NumRT; ++rt) - { - uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset; - - const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; - - { - // pfnBlendFunc may not update all channels. Initialize with PS output. - /// TODO: move this into the blend JIT. - blendOut = psContext.shaded[rt]; - - // Blend outputs and update coverage mask for alpha test - if(pfnBlendFunc[rt] != nullptr) - { - pfnBlendFunc[rt]( - pBlendState, - psContext.shaded[rt], - psContext.shaded[1], - psContext.shaded[0].w, - sample, - pColorSample, - blendOut, - &psContext.oMask, - (simdscalari*)&coverageMask); - } - } - - // final write mask - simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask)); - - ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT. - static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format"); - - const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float); - - // store with color mask - if(!pRTBlend->writeDisableRed) - { - _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x); - } - if(!pRTBlend->writeDisableGreen) - { - _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y); - } - if(!pRTBlend->writeDisableBlue) - { - _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z); - } - if(!pRTBlend->writeDisableAlpha) - { - _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w); - } - } -} - -#if USE_8x2_TILE_BACKEND -// Merge Output to 8x2 SIMD16 Tile Format -INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState, - const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT, const uint32_t colorBufferEnableMask, bool useAlternateOffset) -{ - // type safety guaranteed from template instantiation in BEChooser<>::GetFunc - uint32_t rasterTileColorOffset = RasterTileColorOffset(sample); - - if (useAlternateOffset) - { - rasterTileColorOffset += sizeof(simdscalar); - } - - simdvector blendSrc; - simdvector blendOut; - - uint32_t colorBufferBit = 1; - for (uint32_t rt = 0; rt < NumRT; rt += 1, colorBufferBit <<= 1) - { - simdscalar *pColorSample = reinterpret_cast(pColorBase[rt] + rasterTileColorOffset); - - const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; - - if (colorBufferBit & colorBufferEnableMask) - { - blendSrc[0] = pColorSample[0]; - blendSrc[1] = pColorSample[2]; - blendSrc[2] = pColorSample[4]; - blendSrc[3] = pColorSample[6]; - } - - { - // pfnBlendFunc may not update all channels. Initialize with PS output. - /// TODO: move this into the blend JIT. - blendOut = psContext.shaded[rt]; - - // Blend outputs and update coverage mask for alpha test - if(pfnBlendFunc[rt] != nullptr) - { - pfnBlendFunc[rt]( - pBlendState, - psContext.shaded[rt], - psContext.shaded[1], - psContext.shaded[0].w, - sample, - reinterpret_cast(&blendSrc), - blendOut, - &psContext.oMask, - reinterpret_cast(&coverageMask)); - } - } - - // final write mask - simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask)); - - ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT. - static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format"); - - // store with color mask - if (!pRTBlend->writeDisableRed) - { - _simd_maskstore_ps(reinterpret_cast(&pColorSample[0]), outputMask, blendOut.x); - } - if (!pRTBlend->writeDisableGreen) - { - _simd_maskstore_ps(reinterpret_cast(&pColorSample[2]), outputMask, blendOut.y); - } - if (!pRTBlend->writeDisableBlue) - { - _simd_maskstore_ps(reinterpret_cast(&pColorSample[4]), outputMask, blendOut.z); - } - if (!pRTBlend->writeDisableAlpha) - { - _simd_maskstore_ps(reinterpret_cast(&pColorSample[6]), outputMask, blendOut.w); - } - } -} - -#endif - -template -void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) -{ - ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the backend - - - SWR_CONTEXT *pContext = pDC->pContext; - - AR_BEGIN(BEPixelRateBackend, pDC->drawId); - AR_BEGIN(BESetup, pDC->drawId); - - const API_STATE &state = GetApiState(pDC); - - BarycentricCoeffs coeffs; - SetupBarycentricCoeffs(&coeffs, work); - - SWR_PS_CONTEXT psContext; - const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions; - SetupPixelShaderContext(&psContext, samplePos, work); - - uint8_t *pDepthBuffer, *pStencilBuffer; - SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers); - - AR_END(BESetup, 0); - - PixelRateZTestLoop PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask); - - psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast(y))); - psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast(y))); - - const simdscalar dy = _simd_set1_ps(static_cast(SIMD_TILE_Y_DIM)); - - for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) - { - psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast(x))); - psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast(x))); - - const simdscalar dx = _simd_set1_ps(static_cast(SIMD_TILE_X_DIM)); - - for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) - { -#if USE_8x2_TILE_BACKEND - const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); -#endif - simdscalar activeLanes; - if(!(work.anyCoveredSamples & MASK)) {goto Endtile;}; - activeLanes = vMask(work.anyCoveredSamples & MASK); - - if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) - { - const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0]; - - generateInputCoverage(pCoverageMask, psContext.inputMask, state.blendState.sampleMask); - } - - AR_BEGIN(BEBarycentric, pDC->drawId); - - CalcPixelBarycentrics(coeffs, psContext); - - CalcCentroid(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask); - - AR_END(BEBarycentric, 0); - - if(T::bForcedSampleCount) - { - // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set - const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si())); - activeLanes = _simd_and_ps(activeLanes, vSampleMask); - } - - // Early-Z? - if(T::bCanEarlyZ && !T::bForcedSampleCount) - { - uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest); - UPDATE_STAT_BE(DepthPassCount, depthPassCount); - AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes))); - } - - // if we have no covered samples that passed depth at this point, go to next tile - if(!_simd_movemask_ps(activeLanes)) { goto Endtile; }; - - if(state.psState.usesSourceDepth) - { - AR_BEGIN(BEBarycentric, pDC->drawId); - // interpolate and quantize z - psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); - psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); - AR_END(BEBarycentric, 0); - } - - // pixels that are currently active - psContext.activeMask = _simd_castps_si(activeLanes); - psContext.oMask = T::MultisampleT::FullSampleMask(); - - // execute pixel shader - AR_BEGIN(BEPixelShader, pDC->drawId); - state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); - UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes))); - AR_END(BEPixelShader, 0); - - // update active lanes to remove any discarded or oMask'd pixels - activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si()))); - if(!_simd_movemask_ps(activeLanes)) { goto Endtile; }; - - // late-Z - if(!T::bCanEarlyZ && !T::bForcedSampleCount) - { - uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest); - UPDATE_STAT_BE(DepthPassCount, depthPassCount); - AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes))); - } - - // if we have no covered samples that passed depth at this point, skip OM and go to next tile - if(!_simd_movemask_ps(activeLanes)) { goto Endtile; }; - - // output merger - // loop over all samples, broadcasting the results of the PS to all passing pixels - for(uint32_t sample = 0; sample < GetNumOMSamples(state.blendState.sampleCount); sample++) - { - AR_BEGIN(BEOutputMerger, pDC->drawId); - // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples - uint32_t coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample; - simdscalar coverageMask, depthMask; - if(T::bForcedSampleCount) - { - coverageMask = depthMask = activeLanes; - } - else - { - coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum]; - depthMask = PixelRateZTest.depthPassMask[coverageSampleNum]; - if(!_simd_movemask_ps(depthMask)) - { - // stencil should already have been written in early/lateZ tests - AR_END(BEOutputMerger, 0); - continue; - } - } - - // broadcast the results of the PS to all passing pixels -#if USE_8x2_TILE_BACKEND - OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset); -#else // USE_8x2_TILE_BACKEND - OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets); -#endif // USE_8x2_TILE_BACKEND - - if(!state.psState.forceEarlyZ && !T::bForcedSampleCount) - { - uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); - uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); - - DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum], - pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]); - } - AR_END(BEOutputMerger, 0); - } -Endtile: - AR_BEGIN(BEEndTile, pDC->drawId); - - for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++) - { - work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - } - - if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) - { - work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - } - work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); - -#if USE_8x2_TILE_BACKEND - if (useAlternateOffset) - { - for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) - { - psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; - } - } -#else - for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) - { - psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; - } -#endif - pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; - pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; - - AR_END(BEEndTile, 0); - - psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); - psContext.vX.center = _simd_add_ps(psContext.vX.center, dx); - } - - psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); - psContext.vY.center = _simd_add_ps(psContext.vY.center, dy); - } - - AR_END(BEPixelRateBackend, 0); -} +extern PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS]; +extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT]; +extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT] + [2] // centroid + [2]; // canEarlyZ +extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT] + [2] // isCenterPattern + [SWR_INPUT_COVERAGE_COUNT] + [2] // centroid + [2] // forcedSampleCount + [2] // canEarlyZ + ; +extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT] + [SWR_INPUT_COVERAGE_COUNT] + [2] // centroid + [2]; // canEarlyZ -template -struct SwrBackendTraits -{ - static const bool bIsCenterPattern = (isCenter == 1); - static const uint32_t InputCoverage = coverage; - static const bool bCentroidPos = (centroid == 1); - static const bool bForcedSampleCount = (forced == 1); - static const bool bCanEarlyZ = (canEarlyZ == 1); - typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, bIsCenterPattern> MultisampleT; -}; diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp new file mode 100644 index 00000000000..0ef54e266d7 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp @@ -0,0 +1,281 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file backend.cpp +* +* @brief Backend handles rasterization, pixel shading and output merger +* operations. +* +******************************************************************************/ + +#include + +#include "backend.h" +#include "backend_impl.h" +#include "tilemgr.h" +#include "memory/tilingtraits.h" +#include "core/multisample.h" + +#include + +template +void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value) +{ + auto lambda = [&](int32_t comp) + { + FormatTraits::storeSOA(comp, pTileBuffer, value.v[comp]); + + pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits::GetBPC(comp) / 8); + }; + + const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM); + + for (uint32_t i = 0; i < numIter; ++i) + { + UnrollerL<0, FormatTraits::numComps, 1>::step(lambda); + } +} + +#if USE_8x2_TILE_BACKEND +template +void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value) +{ + auto lambda = [&](int32_t comp) + { + FormatTraits::storeSOA(comp, pTileBuffer, value.v[comp]); + + pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits::GetBPC(comp) / 8); + }; + + const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM); + + for (uint32_t i = 0; i < numIter; ++i) + { + UnrollerL<0, FormatTraits::numComps, 1>::step(lambda); + } +} + +#endif +template +INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, uint32_t renderTargetArrayIndex, DWORD clear[4], const SWR_RECT& rect) +{ + // convert clear color to hottile format + // clear color is in RGBA float/uint32 +#if USE_8x2_TILE_BACKEND + simd16vector vClear; + for (uint32_t comp = 0; comp < FormatTraits::numComps; ++comp) + { + simd16scalar vComp; + vComp = _simd16_load1_ps((const float*)&clear[comp]); + if (FormatTraits::isNormalized(comp)) + { + vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits::fromFloat(comp))); + vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp)); + } + vComp = FormatTraits::pack(comp, vComp); + vClear.v[FormatTraits::swizzle(comp)] = vComp; + } + +#else + simdvector vClear; + for (uint32_t comp = 0; comp < FormatTraits::numComps; ++comp) + { + simdscalar vComp; + vComp = _simd_load1_ps((const float*)&clear[comp]); + if (FormatTraits::isNormalized(comp)) + { + vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits::fromFloat(comp))); + vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp)); + } + vComp = FormatTraits::pack(comp, vComp); + vClear.v[FormatTraits::swizzle(comp)] = vComp; + } + +#endif + uint32_t tileX, tileY; + MacroTileMgr::getTileIndices(macroTile, tileX, tileY); + + // Init to full macrotile + SWR_RECT clearTile = + { + KNOB_MACROTILE_X_DIM * int32_t(tileX), + KNOB_MACROTILE_Y_DIM * int32_t(tileY), + KNOB_MACROTILE_X_DIM * int32_t(tileX + 1), + KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1), + }; + + // intersect with clear rect + clearTile &= rect; + + // translate to local hottile origin + clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, -int32_t(tileY) * KNOB_MACROTILE_Y_DIM); + + // Make maximums inclusive (needed for convert to raster tiles) + clearTile.xmax -= 1; + clearTile.ymax -= 1; + + // convert to raster tiles + clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT); + clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT); + clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT); + clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT); + + const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount); + // compute steps between raster tile samples / raster tiles / macro tile rows + const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8; + const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8)) * numSamples; + const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep; + const uint32_t pitch = (FormatTraits::bpp * KNOB_MACROTILE_X_DIM / 8); + + HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples, renderTargetArrayIndex); + uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits::bpp > >(pitch, clearTile.xmin, clearTile.ymin)) * numSamples; + uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits::bpp > >(pitch, x, y)) * numSamples; + + // loop over all raster tiles in the current hot tile + for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y) + { + uint8_t* pRasterTile = pRasterTileRow; + for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x) + { + for( int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++) + { + ClearRasterTile(pRasterTile, vClear); + pRasterTile += rasterTileSampleStep; + } + } + pRasterTileRow += macroTileRowStep; + } + + pHotTile->state = HOTTILE_DIRTY; +} + + +void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) +{ + SWR_CONTEXT *pContext = pDC->pContext; + + if (KNOB_FAST_CLEAR) + { + CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData; + SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount; + uint32_t numSamples = GetNumSamples(sampleCount); + + SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason. + + AR_BEGIN(BEClear, pDC->drawId); + + if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR) + { + unsigned long rt = 0; + uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR; + while (_BitScanForward(&rt, mask)) + { + mask &= ~(1 << rt); + + HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)rt, true, numSamples, pClear->renderTargetArrayIndex); + + // All we want to do here is to mark the hot tile as being in a "needs clear" state. + pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]); + pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]); + pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]); + pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]); + pHotTile->state = HOTTILE_CLEAR; + } + } + + if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT) + { + HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples, pClear->renderTargetArrayIndex); + pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth; + pHotTile->state = HOTTILE_CLEAR; + } + + if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT) + { + HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples, pClear->renderTargetArrayIndex); + + pHotTile->clearData[0] = pClear->clearStencil; + pHotTile->state = HOTTILE_CLEAR; + } + + AR_END(BEClear, 1); + } + else + { + // Legacy clear + CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData; + AR_BEGIN(BEClear, pDC->drawId); + + if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR) + { + DWORD clearData[4]; + clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]); + clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]); + clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]); + clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]); + + PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT]; + SWR_ASSERT(pfnClearTiles != nullptr); + + unsigned long rt = 0; + uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR; + while (_BitScanForward(&rt, mask)) + { + mask &= ~(1 << rt); + + pfnClearTiles(pDC, (SWR_RENDERTARGET_ATTACHMENT)rt, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect); + } + } + + if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT) + { + DWORD clearData[4]; + clearData[0] = *(DWORD*)&pClear->clearDepth; + PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT]; + SWR_ASSERT(pfnClearTiles != nullptr); + + pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect); + } + + if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT) + { + DWORD clearData[4]; + clearData[0] = pClear->clearStencil; + PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT]; + + pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect); + } + + AR_END(BEClear, 1); + } +} + +void InitClearTilesTable() +{ + memset(gClearTilesTable, 0, sizeof(gClearTilesTable)); + + gClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile; + gClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile; + gClearTilesTable[R32_FLOAT] = ClearMacroTile; + gClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile; + gClearTilesTable[R8_UINT] = ClearMacroTile; +} diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h new file mode 100644 index 00000000000..e1518719840 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h @@ -0,0 +1,1067 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file backend.h +* +* @brief Backend handles rasterization, pixel shading and output merger +* operations. +* +******************************************************************************/ +#pragma once + +void InitBackendSingleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_INPUT_COVERAGE_COUNT][2][2]); +void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]); + +static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext); + + +enum SWR_BACKEND_FUNCS +{ + SWR_BACKEND_SINGLE_SAMPLE, + SWR_BACKEND_MSAA_PIXEL_RATE, + SWR_BACKEND_MSAA_SAMPLE_RATE, + SWR_BACKEND_FUNCS_MAX, +}; + +#if KNOB_SIMD_WIDTH == 8 +static const simdscalar vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5}; +static const simdscalar vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5}; +static const simdscalar vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0}; +static const simdscalar vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}; +#define MASK 0xff +#endif + +static INLINE simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ) +{ + simdscalar vClipMask = _simd_setzero_ps(); + uint32_t numClipDistance = _mm_popcnt_u32(clipMask); + + for (uint32_t i = 0; i < numClipDistance; ++i) + { + // pull triangle clip distance values from clip buffer + simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++); + simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++); + simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++); + + // interpolate + simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ); + + // clip if interpolated clip distance is < 0 || NAN + simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ); + + vClipMask = _simd_or_ps(vClipMask, vCull); + } + + return _simd_movemask_ps(vClipMask); +} + +INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) +{ + static const uint32_t RasterTileColorOffsets[16] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, + }; + assert(sampleNum < 16); + return RasterTileColorOffsets[sampleNum]; +} + +INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) +{ + static const uint32_t RasterTileDepthOffsets[16] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, + }; + assert(sampleNum < 16); + return RasterTileDepthOffsets[sampleNum]; +} + +INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) +{ + static const uint32_t RasterTileStencilOffsets[16] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, + }; + assert(sampleNum < 16); + return RasterTileStencilOffsets[sampleNum]; +} + +template +struct generateInputCoverage +{ + INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask) + { + // will need to update for avx512 + assert(KNOB_SIMD_WIDTH == 8); + + simdscalari mask[2]; + simdscalari sampleCoverage[2]; + + if(T::bIsCenterPattern) + { + // center coverage is the same for all samples; just broadcast to the sample slots + uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK); + if(T::MultisampleT::numSamples == 1) + { + sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage); + } + else if(T::MultisampleT::numSamples == 2) + { + sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage); + } + else if(T::MultisampleT::numSamples == 4) + { + sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage); + } + else if(T::MultisampleT::numSamples == 8) + { + sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); + } + else if(T::MultisampleT::numSamples == 16) + { + sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); + sampleCoverage[1] = _mm256_set1_epi32(centerCoverage); + } + } + else + { + __m256i src = _mm256_set1_epi32(0); + __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1; + + if(T::MultisampleT::numSamples == 1) + { + mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); + } + else if(T::MultisampleT::numSamples == 2) + { + mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); + } + else if(T::MultisampleT::numSamples == 4) + { + mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + } + else if(T::MultisampleT::numSamples == 8) + { + mask[0] = _mm256_set1_epi32(-1); + } + else if(T::MultisampleT::numSamples == 16) + { + mask[0] = _mm256_set1_epi32(-1); + mask[1] = _mm256_set1_epi32(-1); + index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8); + } + + // gather coverage for samples 0-7 + sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8)); + if(T::MultisampleT::numSamples > 8) + { + // gather coverage for samples 8-15 + sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8)); + } + } + + mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0); + // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane + simdscalari packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]); + + simdscalari packedCoverage1; + if(T::MultisampleT::numSamples > 8) + { + // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane + packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]); + } + + #if (KNOB_ARCH == KNOB_ARCH_AVX) + // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane + simdscalari hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83); + simdscalar shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); + packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE)); + + simdscalari packedSampleCoverage; + if(T::MultisampleT::numSamples > 8) + { + // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane + hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83); + shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); + shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE); + packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01))); + packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC)); + } + else + { + packedSampleCoverage = packedCoverage0; + } + #else + simdscalari permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0); + // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane + packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask); + + simdscalari packedSampleCoverage; + if(T::MultisampleT::numSamples > 8) + { + permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7); + // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane + packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask); + + // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane + packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C); + } + else + { + packedSampleCoverage = packedCoverage0; + } + #endif + + for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--) + { + // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2 + inputMask[i] = _simd_movemask_epi8(packedSampleCoverage); + + if(!T::bForcedSampleCount) + { + // input coverage has to be anded with sample mask if MSAA isn't forced on + inputMask[i] &= sampleMask; + } + + // shift to the next pixel in the 4x2 + packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1); + } + } + + INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask) + { + uint32_t inputMask[KNOB_SIMD_WIDTH]; + generateInputCoverage(coverageMask, inputMask, sampleMask); + inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0])); + } + +}; + +template +struct generateInputCoverage +{ + INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask) + { + // will need to update for avx512 + assert(KNOB_SIMD_WIDTH == 8); + simdscalari vec = _mm256_set1_epi32(coverageMask[0]); + const simdscalari bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); + vec = _simd_and_si(vec, bit); + vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec); + vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec); + inputCoverage = _simd_castsi_ps(vec); + } + + INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask) + { + uint32_t simdCoverage = (coverageMask[0] & MASK); + static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1; + for(int i = 0; i < KNOB_SIMD_WIDTH; i++) + { + // set all samples to covered if conservative coverage mask is set for that pixel + inputMask[i] = (((1 << i) & simdCoverage) > 0) ? FullCoverageMask : 0; + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Centroid behaves exactly as follows : +// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to +// have a sample location there). +// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the +// coverage with the SampleMask Rasterizer State. +// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is +// evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the +// SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point. +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +template +INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS& samplePos, + const uint64_t *const coverageMask, const uint32_t sampleMask, + const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL) +{ + uint32_t inputMask[KNOB_SIMD_WIDTH]; + generateInputCoverage(coverageMask, inputMask, sampleMask); + + // Case (2) - partially covered pixel + + // scan for first covered sample per pixel in the 4x2 span + unsigned long sampleNum[KNOB_SIMD_WIDTH]; + (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0); + (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0); + (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0); + (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0); + (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0); + (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0); + (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0); + (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0); + + // look up and set the sample offsets from UL pixel corner for first covered sample + __m256 vXSample = _mm256_set_ps(samplePos.X(sampleNum[7]), + samplePos.X(sampleNum[6]), + samplePos.X(sampleNum[5]), + samplePos.X(sampleNum[4]), + samplePos.X(sampleNum[3]), + samplePos.X(sampleNum[2]), + samplePos.X(sampleNum[1]), + samplePos.X(sampleNum[0])); + + __m256 vYSample = _mm256_set_ps(samplePos.Y(sampleNum[7]), + samplePos.Y(sampleNum[6]), + samplePos.Y(sampleNum[5]), + samplePos.Y(sampleNum[4]), + samplePos.Y(sampleNum[3]), + samplePos.Y(sampleNum[2]), + samplePos.Y(sampleNum[1]), + samplePos.Y(sampleNum[0])); + // add sample offset to UL pixel corner + vXSample = _simd_add_ps(vXSamplePosUL, vXSample); + vYSample = _simd_add_ps(vYSamplePosUL, vYSample); + + // Case (1) and case (3b) - All samples covered or not covered with full SampleMask + static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask(); + simdscalari vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]); + simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask); + + static const simdscalari vZero = _simd_setzero_si(); + const simdscalari vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask); + simdscalari vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero); + simdscalari vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask); + simdscalari vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask); + + simdscalari vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b); + + // set the centroid position based on results from above + psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter)); + psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter)); + + // Case (3a) No samples covered and partial sample mask + simdscalari vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask); + // sample mask should never be all 0's for this case, but handle it anyways + unsigned long firstCoveredSampleMaskSample = 0; + (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0); + + simdscalari vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples); + + vXSample = _simd_set1_ps(samplePos.X(firstCoveredSampleMaskSample)); + vYSample = _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample)); + + // blend in case 3a pixel locations + psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a)); + psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a)); +} + +INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext, + const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL) +{ + // evaluate I,J + psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid); + psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid); + psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet); + psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet); + + // interpolate 1/w + psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid); +} + +INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar z, float minz, float maxz) +{ + const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz)); + const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz)); + + return _simd_movemask_ps(_simd_and_ps(minzMask, maxzMask)); +} + +template +INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount) +{ + // RT has to be single sample if we're in forcedMSAA mode + if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X)) + { + return 1; + } + // unless we're forced to single sample, in which case we run the OM at the sample count of the RT + else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X)) + { + return GetNumSamples(blendSampleCount); + } + // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count + else + { + return T::MultisampleT::numSamples; + } +} + +inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE_DESC &work) +{ + // broadcast scalars + + coeffs->vIa = _simd_broadcast_ss(&work.I[0]); + coeffs->vIb = _simd_broadcast_ss(&work.I[1]); + coeffs->vIc = _simd_broadcast_ss(&work.I[2]); + + coeffs->vJa = _simd_broadcast_ss(&work.J[0]); + coeffs->vJb = _simd_broadcast_ss(&work.J[1]); + coeffs->vJc = _simd_broadcast_ss(&work.J[2]); + + coeffs->vZa = _simd_broadcast_ss(&work.Z[0]); + coeffs->vZb = _simd_broadcast_ss(&work.Z[1]); + coeffs->vZc = _simd_broadcast_ss(&work.Z[2]); + + coeffs->vRecipDet = _simd_broadcast_ss(&work.recipDet); + + coeffs->vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]); + coeffs->vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]); + coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]); +} + +inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorBufferCount, RenderOutputBuffers &renderBuffers) +{ + assert(colorBufferCount <= SWR_NUM_RENDERTARGETS); + + if (pColorBuffer) + { + for (uint32_t index = 0; index < colorBufferCount; index += 1) + { + pColorBuffer[index] = renderBuffers.pColor[index]; + } + } + + if (pDepthBuffer) + { + *pDepthBuffer = renderBuffers.pDepth; + } + + if (pStencilBuffer) + { + *pStencilBuffer = renderBuffers.pStencil;; + } +} + +template +void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos, SWR_TRIANGLE_DESC &work) +{ + psContext->pAttribs = work.pAttribs; + psContext->pPerspAttribs = work.pPerspAttribs; + psContext->frontFace = work.triFlags.frontFacing; + psContext->renderTargetArrayIndex = work.triFlags.renderTargetArrayIndex; + + // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs + psContext->I = work.I; + psContext->J = work.J; + + psContext->recipDet = work.recipDet; + psContext->pRecipW = work.pRecipW; + psContext->pSamplePosX = samplePos.X();//reinterpret_cast(&T::MultisampleT::samplePosX); + psContext->pSamplePosY = samplePos.Y();//reinterpret_cast(&T::MultisampleT::samplePosY); + psContext->rasterizerSampleCount = T::MultisampleT::numSamples; + psContext->sampleIndex = 0; +} + +template +void CalcCentroid(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos, + const BarycentricCoeffs &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask) +{ + if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different + { + // for 1x case, centroid is pixel center + psContext->vX.centroid = psContext->vX.center; + psContext->vY.centroid = psContext->vY.center; + psContext->vI.centroid = psContext->vI.center; + psContext->vJ.centroid = psContext->vJ.center; + psContext->vOneOverW.centroid = psContext->vOneOverW.center; + } + else + { + if (T::bCentroidPos) + { + ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid + if (T::bIsCenterPattern) + { + psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f)); + psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f)); + } + else + { + // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'.. + CalcCentroidPos(*psContext, samplePos, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL); + } + + CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL); + } + else + { + psContext->vX.centroid = psContext->vX.sample; + psContext->vY.centroid = psContext->vY.sample; + } + } +} + +template +struct PixelRateZTestLoop +{ + PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState, + uint8_t*& depthBuffer, uint8_t*& stencilBuffer, const uint8_t ClipDistanceMask) : + pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState), + samplePos(state.rastState.samplePositions), + clipDistanceMask(ClipDistanceMask), pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){}; + + INLINE + uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext, + const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0) + { + SWR_CONTEXT *pContext = pDC->pContext; + + uint32_t statCount = 0; + simdscalar anyDepthSamplePassed = _simd_setzero_ps(); + for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++) + { + const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample]; + vCoverageMask[sample] = _simd_and_ps(activeLanes, vMask(pCoverageMask[currentSimdIn8x8] & MASK)); + + if(!_simd_movemask_ps(vCoverageMask[sample])) + { + vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps(); + continue; + } + + // offset depth/stencil buffers current sample + uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); + uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); + + if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable) + { + static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); + + const simdscalar z = _simd_load_ps(reinterpret_cast(pDepthSample)); + + const float minz = state.depthBoundsState.depthBoundsTestMinValue; + const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; + + vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(CalcDepthBoundsAcceptMask(z, minz, maxz))); + } + + AR_BEGIN(BEBarycentric, pDC->drawId); + + // calculate per sample positions + psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample)); + psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample)); + + // calc I & J per sample + CalcSampleBarycentrics(coeffs, psContext); + + if(psState.writesODepth) + { + { + // broadcast and test oDepth(psContext.vZ) written from the PS for each sample + vZ[sample] = psContext.vZ; + } + } + else + { + vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); + vZ[sample] = state.pfnQuantizeDepth(vZ[sample]); + } + + AR_END(BEBarycentric, 0); + + ///@todo: perspective correct vs non-perspective correct clipping? + // if clip distances are enabled, we need to interpolate for each sample + if(clipDistanceMask) + { + uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample); + + vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask)); + } + + // ZTest for this sample + ///@todo Need to uncomment out this bucket. + //AR_BEGIN(BEDepthBucket, pDC->drawId); + depthPassMask[sample] = vCoverageMask[sample]; + stencilPassMask[sample] = vCoverageMask[sample]; + depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, + vZ[sample], pDepthSample, vCoverageMask[sample], + pStencilSample, &stencilPassMask[sample]); + //AR_END(BEDepthBucket, 0); + + // early-exit if no pixels passed depth or earlyZ is forced on + if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample])) + { + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample], + pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]); + + if(!_simd_movemask_ps(depthPassMask[sample])) + { + continue; + } + } + anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]); + uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]); + statCount += _mm_popcnt_u32(statMask); + } + + activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes); + // return number of samples that passed depth and coverage + return statCount; + } + + // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite + simdscalar vZ[T::MultisampleT::numCoverageSamples]; + simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples]; + simdscalar depthPassMask[T::MultisampleT::numCoverageSamples]; + simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples]; + +private: + // functor inputs + DRAW_CONTEXT* pDC; + uint32_t workerId; + + const SWR_TRIANGLE_DESC& work; + const BarycentricCoeffs& coeffs; + const API_STATE& state; + const SWR_PS_STATE& psState; + const SWR_MULTISAMPLE_POS& samplePos; + const uint8_t clipDistanceMask; + uint8_t*& pDepthBuffer; + uint8_t*& pStencilBuffer; +}; + +INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext) +{ + // evaluate I,J + psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center); + psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center); + psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet); + psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet); + + // interpolate 1/w + psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center); +} + +static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext) +{ + // evaluate I,J + psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample); + psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample); + psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet); + psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet); + + // interpolate 1/w + psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample); +} + +// Merge Output to 4x2 SIMD Tile Format +INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState, + const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT) +{ + // type safety guaranteed from template instantiation in BEChooser<>::GetFunc + const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample); + simdvector blendOut; + + for(uint32_t rt = 0; rt < NumRT; ++rt) + { + uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset; + + const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; + + { + // pfnBlendFunc may not update all channels. Initialize with PS output. + /// TODO: move this into the blend JIT. + blendOut = psContext.shaded[rt]; + + // Blend outputs and update coverage mask for alpha test + if(pfnBlendFunc[rt] != nullptr) + { + pfnBlendFunc[rt]( + pBlendState, + psContext.shaded[rt], + psContext.shaded[1], + psContext.shaded[0].w, + sample, + pColorSample, + blendOut, + &psContext.oMask, + (simdscalari*)&coverageMask); + } + } + + // final write mask + simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask)); + + ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT. + static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format"); + + const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float); + + // store with color mask + if(!pRTBlend->writeDisableRed) + { + _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x); + } + if(!pRTBlend->writeDisableGreen) + { + _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y); + } + if(!pRTBlend->writeDisableBlue) + { + _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z); + } + if(!pRTBlend->writeDisableAlpha) + { + _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w); + } + } +} + +#if USE_8x2_TILE_BACKEND +// Merge Output to 8x2 SIMD16 Tile Format +INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState, + const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT, const uint32_t colorBufferEnableMask, bool useAlternateOffset) +{ + // type safety guaranteed from template instantiation in BEChooser<>::GetFunc + uint32_t rasterTileColorOffset = RasterTileColorOffset(sample); + + if (useAlternateOffset) + { + rasterTileColorOffset += sizeof(simdscalar); + } + + simdvector blendSrc; + simdvector blendOut; + + uint32_t colorBufferBit = 1; + for (uint32_t rt = 0; rt < NumRT; rt += 1, colorBufferBit <<= 1) + { + simdscalar *pColorSample = reinterpret_cast(pColorBase[rt] + rasterTileColorOffset); + + const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; + + if (colorBufferBit & colorBufferEnableMask) + { + blendSrc[0] = pColorSample[0]; + blendSrc[1] = pColorSample[2]; + blendSrc[2] = pColorSample[4]; + blendSrc[3] = pColorSample[6]; + } + + { + // pfnBlendFunc may not update all channels. Initialize with PS output. + /// TODO: move this into the blend JIT. + blendOut = psContext.shaded[rt]; + + // Blend outputs and update coverage mask for alpha test + if(pfnBlendFunc[rt] != nullptr) + { + pfnBlendFunc[rt]( + pBlendState, + psContext.shaded[rt], + psContext.shaded[1], + psContext.shaded[0].w, + sample, + reinterpret_cast(&blendSrc), + blendOut, + &psContext.oMask, + reinterpret_cast(&coverageMask)); + } + } + + // final write mask + simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask)); + + ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT. + static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format"); + + // store with color mask + if (!pRTBlend->writeDisableRed) + { + _simd_maskstore_ps(reinterpret_cast(&pColorSample[0]), outputMask, blendOut.x); + } + if (!pRTBlend->writeDisableGreen) + { + _simd_maskstore_ps(reinterpret_cast(&pColorSample[2]), outputMask, blendOut.y); + } + if (!pRTBlend->writeDisableBlue) + { + _simd_maskstore_ps(reinterpret_cast(&pColorSample[4]), outputMask, blendOut.z); + } + if (!pRTBlend->writeDisableAlpha) + { + _simd_maskstore_ps(reinterpret_cast(&pColorSample[6]), outputMask, blendOut.w); + } + } +} + +#endif + +template +void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) +{ + ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the backend + + + SWR_CONTEXT *pContext = pDC->pContext; + + AR_BEGIN(BEPixelRateBackend, pDC->drawId); + AR_BEGIN(BESetup, pDC->drawId); + + const API_STATE &state = GetApiState(pDC); + + BarycentricCoeffs coeffs; + SetupBarycentricCoeffs(&coeffs, work); + + SWR_PS_CONTEXT psContext; + const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions; + SetupPixelShaderContext(&psContext, samplePos, work); + + uint8_t *pDepthBuffer, *pStencilBuffer; + SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers); + + AR_END(BESetup, 0); + + PixelRateZTestLoop PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask); + + psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast(y))); + psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast(y))); + + const simdscalar dy = _simd_set1_ps(static_cast(SIMD_TILE_Y_DIM)); + + for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) + { + psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast(x))); + psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast(x))); + + const simdscalar dx = _simd_set1_ps(static_cast(SIMD_TILE_X_DIM)); + + for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) + { +#if USE_8x2_TILE_BACKEND + const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); +#endif + simdscalar activeLanes; + if(!(work.anyCoveredSamples & MASK)) {goto Endtile;}; + activeLanes = vMask(work.anyCoveredSamples & MASK); + + if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) + { + const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0]; + + generateInputCoverage(pCoverageMask, psContext.inputMask, state.blendState.sampleMask); + } + + AR_BEGIN(BEBarycentric, pDC->drawId); + + CalcPixelBarycentrics(coeffs, psContext); + + CalcCentroid(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask); + + AR_END(BEBarycentric, 0); + + if(T::bForcedSampleCount) + { + // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set + const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si())); + activeLanes = _simd_and_ps(activeLanes, vSampleMask); + } + + // Early-Z? + if(T::bCanEarlyZ && !T::bForcedSampleCount) + { + uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest); + UPDATE_STAT_BE(DepthPassCount, depthPassCount); + AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes))); + } + + // if we have no covered samples that passed depth at this point, go to next tile + if(!_simd_movemask_ps(activeLanes)) { goto Endtile; }; + + if(state.psState.usesSourceDepth) + { + AR_BEGIN(BEBarycentric, pDC->drawId); + // interpolate and quantize z + psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); + psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); + AR_END(BEBarycentric, 0); + } + + // pixels that are currently active + psContext.activeMask = _simd_castps_si(activeLanes); + psContext.oMask = T::MultisampleT::FullSampleMask(); + + // execute pixel shader + AR_BEGIN(BEPixelShader, pDC->drawId); + state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); + UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes))); + AR_END(BEPixelShader, 0); + + // update active lanes to remove any discarded or oMask'd pixels + activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si()))); + if(!_simd_movemask_ps(activeLanes)) { goto Endtile; }; + + // late-Z + if(!T::bCanEarlyZ && !T::bForcedSampleCount) + { + uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest); + UPDATE_STAT_BE(DepthPassCount, depthPassCount); + AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes))); + } + + // if we have no covered samples that passed depth at this point, skip OM and go to next tile + if(!_simd_movemask_ps(activeLanes)) { goto Endtile; }; + + // output merger + // loop over all samples, broadcasting the results of the PS to all passing pixels + for(uint32_t sample = 0; sample < GetNumOMSamples(state.blendState.sampleCount); sample++) + { + AR_BEGIN(BEOutputMerger, pDC->drawId); + // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples + uint32_t coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample; + simdscalar coverageMask, depthMask; + if(T::bForcedSampleCount) + { + coverageMask = depthMask = activeLanes; + } + else + { + coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum]; + depthMask = PixelRateZTest.depthPassMask[coverageSampleNum]; + if(!_simd_movemask_ps(depthMask)) + { + // stencil should already have been written in early/lateZ tests + AR_END(BEOutputMerger, 0); + continue; + } + } + + // broadcast the results of the PS to all passing pixels +#if USE_8x2_TILE_BACKEND + OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset); +#else // USE_8x2_TILE_BACKEND + OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets); +#endif // USE_8x2_TILE_BACKEND + + if(!state.psState.forceEarlyZ && !T::bForcedSampleCount) + { + uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); + uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); + + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum], + pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]); + } + AR_END(BEOutputMerger, 0); + } +Endtile: + AR_BEGIN(BEEndTile, pDC->drawId); + + for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++) + { + work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + } + + if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) + { + work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + } + work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + +#if USE_8x2_TILE_BACKEND + if (useAlternateOffset) + { + for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) + { + psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; + } + } +#else + for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) + { + psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; + } +#endif + pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; + pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; + + AR_END(BEEndTile, 0); + + psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); + psContext.vX.center = _simd_add_ps(psContext.vX.center, dx); + } + + psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); + psContext.vY.center = _simd_add_ps(psContext.vY.center, dy); + } + + AR_END(BEPixelRateBackend, 0); +} + +template +struct SwrBackendTraits +{ + static const bool bIsCenterPattern = (isCenter == 1); + static const uint32_t InputCoverage = coverage; + static const bool bCentroidPos = (centroid == 1); + static const bool bForcedSampleCount = (forced == 1); + static const bool bCanEarlyZ = (canEarlyZ == 1); + typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, bIsCenterPattern> MultisampleT; +}; diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp new file mode 100644 index 00000000000..0f75ec24fb0 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp @@ -0,0 +1,345 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file backend.cpp +* +* @brief Backend handles rasterization, pixel shading and output merger +* operations. +* +******************************************************************************/ + +#include + +#include "backend.h" +#include "backend_impl.h" +#include "tilemgr.h" +#include "memory/tilingtraits.h" +#include "core/multisample.h" + +#include + +template +void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) +{ + SWR_CONTEXT *pContext = pDC->pContext; + + AR_BEGIN(BESampleRateBackend, pDC->drawId); + AR_BEGIN(BESetup, pDC->drawId); + + const API_STATE &state = GetApiState(pDC); + + BarycentricCoeffs coeffs; + SetupBarycentricCoeffs(&coeffs, work); + + uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer; + SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers); + + SWR_PS_CONTEXT psContext; + const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions; + SetupPixelShaderContext(&psContext, samplePos, work); + + AR_END(BESetup, 0); + + psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast(y))); + psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast(y))); + + const simdscalar dy = _simd_set1_ps(static_cast(SIMD_TILE_Y_DIM)); + + for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) + { + psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast(x))); + psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast(x))); + + const simdscalar dx = _simd_set1_ps(static_cast(SIMD_TILE_X_DIM)); + + for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) + { +#if USE_8x2_TILE_BACKEND + const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); + +#endif + if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) + { + const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0]; + + generateInputCoverage(pCoverageMask, psContext.inputMask, state.blendState.sampleMask); + } + + AR_BEGIN(BEBarycentric, pDC->drawId); + + CalcPixelBarycentrics(coeffs, psContext); + + CalcCentroid(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask); + + AR_END(BEBarycentric, 0); + + for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++) + { + simdmask coverageMask = work.coverageMask[sample] & MASK; + + if (coverageMask) + { + // offset depth/stencil buffers current sample + uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); + uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); + + if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable) + { + static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); + + const simdscalar z = _simd_load_ps(reinterpret_cast(pDepthSample)); + + const float minz = state.depthBoundsState.depthBoundsTestMinValue; + const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; + + coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz); + } + + AR_BEGIN(BEBarycentric, pDC->drawId); + + // calculate per sample positions + psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample)); + psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample)); + + CalcSampleBarycentrics(coeffs, psContext); + + // interpolate and quantize z + psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); + psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); + + AR_END(BEBarycentric, 0); + + // interpolate user clip distance if available + if (state.rastState.clipDistanceMask) + { + coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample); + } + + simdscalar vCoverageMask = vMask(coverageMask); + simdscalar depthPassMask = vCoverageMask; + simdscalar stencilPassMask = vCoverageMask; + + // Early-Z? + if (T::bCanEarlyZ) + { + AR_BEGIN(BEEarlyDepthTest, pDC->drawId); + depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, + psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); + AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask))); + AR_END(BEEarlyDepthTest, 0); + + // early-exit if no samples passed depth or earlyZ is forced on. + if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask)) + { + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); + + if (!_simd_movemask_ps(depthPassMask)) + { + work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + continue; + } + } + } + + psContext.sampleIndex = sample; + psContext.activeMask = _simd_castps_si(vCoverageMask); + + // execute pixel shader + AR_BEGIN(BEPixelShader, pDC->drawId); + UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask))); + state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); + AR_END(BEPixelShader, 0); + + vCoverageMask = _simd_castsi_ps(psContext.activeMask); + + // late-Z + if (!T::bCanEarlyZ) + { + AR_BEGIN(BELateDepthTest, pDC->drawId); + depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, + psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); + AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask))); + AR_END(BELateDepthTest, 0); + + if (!_simd_movemask_ps(depthPassMask)) + { + // need to call depth/stencil write for stencil write + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); + + work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + continue; + } + } + + uint32_t statMask = _simd_movemask_ps(depthPassMask); + uint32_t statCount = _mm_popcnt_u32(statMask); + UPDATE_STAT_BE(DepthPassCount, statCount); + + // output merger + AR_BEGIN(BEOutputMerger, pDC->drawId); +#if USE_8x2_TILE_BACKEND + OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset); +#else + OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets); +#endif + + // do final depth write after all pixel kills + if (!state.psState.forceEarlyZ) + { + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); + } + AR_END(BEOutputMerger, 0); + } + work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + } + + Endtile: + ATTR_UNUSED; + + AR_BEGIN(BEEndTile, pDC->drawId); + + if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) + { + work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + } + +#if USE_8x2_TILE_BACKEND + if (useAlternateOffset) + { + for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) + { + pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; + } + } +#else + for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) + { + pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; + } +#endif + pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; + pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; + + AR_END(BEEndTile, 0); + + psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); + psContext.vX.center = _simd_add_ps(psContext.vX.center, dx); + } + + psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); + psContext.vY.center = _simd_add_ps(psContext.vY.center, dy); + } + + AR_END(BESampleRateBackend, 0); +} + +// Recursive template used to auto-nest conditionals. Converts dynamic enum function +// arguments to static template arguments. +template +struct BEChooserSampleRate +{ + // Last Arg Terminator + static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg) + { + switch (tArg) + { + case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate>; break; + case SWR_BACKEND_SINGLE_SAMPLE: + case SWR_BACKEND_MSAA_PIXEL_RATE: + SWR_ASSERT(0 && "Invalid backend func\n"); + return nullptr; + break; + default: + SWR_ASSERT(0 && "Invalid backend func\n"); + return nullptr; + break; + } + } + + // Recursively parse args + template + static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs) + { + switch (tArg) + { + case SWR_INPUT_COVERAGE_NONE: return BEChooserSampleRate::GetFunc(remainingArgs...); break; + case SWR_INPUT_COVERAGE_NORMAL: return BEChooserSampleRate::GetFunc(remainingArgs...); break; + case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooserSampleRate::GetFunc(remainingArgs...); break; + default: + SWR_ASSERT(0 && "Invalid sample pattern\n"); + return BEChooserSampleRate::GetFunc(remainingArgs...); + break; + } + } + + // Recursively parse args + template + static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs) + { + switch (tArg) + { + case SWR_MULTISAMPLE_1X: return BEChooserSampleRate::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_2X: return BEChooserSampleRate::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_4X: return BEChooserSampleRate::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_8X: return BEChooserSampleRate::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_16X: return BEChooserSampleRate::GetFunc(remainingArgs...); break; + default: + SWR_ASSERT(0 && "Invalid sample count\n"); + return BEChooserSampleRate::GetFunc(remainingArgs...); + break; + } + } + + // Recursively parse args + template + static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs) + { + if (tArg == true) + { + return BEChooserSampleRate::GetFunc(remainingArgs...); + } + + return BEChooserSampleRate::GetFunc(remainingArgs...); + } +}; + +void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]) +{ + for (uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++) + { + for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++) + { + for (uint32_t centroid = 0; centroid < 2; centroid++) + { + for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) + { + table[sampleCount][inputCoverage][centroid][canEarlyZ] = + BEChooserSampleRate<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, false, (SWR_INPUT_COVERAGE)inputCoverage, + (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE); + } + } + } + } +} diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp new file mode 100644 index 00000000000..0eecc25882a --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp @@ -0,0 +1,321 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file backend.cpp +* +* @brief Backend handles rasterization, pixel shading and output merger +* operations. +* +******************************************************************************/ + +#include + +#include "backend.h" +#include "backend_impl.h" +#include "tilemgr.h" +#include "memory/tilingtraits.h" +#include "core/multisample.h" + +#include + +template +void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) +{ + SWR_CONTEXT *pContext = pDC->pContext; + + AR_BEGIN(BESingleSampleBackend, pDC->drawId); + AR_BEGIN(BESetup, pDC->drawId); + + const API_STATE &state = GetApiState(pDC); + + BarycentricCoeffs coeffs; + SetupBarycentricCoeffs(&coeffs, work); + + uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer; + SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers); + + SWR_PS_CONTEXT psContext; + const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions; + SetupPixelShaderContext(&psContext, samplePos, work); + + AR_END(BESetup, 1); + + psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast(y))); + psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast(y))); + + const simdscalar dy = _simd_set1_ps(static_cast(SIMD_TILE_Y_DIM)); + + for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) + { + psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast(x))); + psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast(x))); + + const simdscalar dx = _simd_set1_ps(static_cast(SIMD_TILE_X_DIM)); + + for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) + { +#if USE_8x2_TILE_BACKEND + const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); + +#endif + simdmask coverageMask = work.coverageMask[0] & MASK; + + if (coverageMask) + { + if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable) + { + static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); + + const simdscalar z = _simd_load_ps(reinterpret_cast(pDepthBuffer)); + + const float minz = state.depthBoundsState.depthBoundsTestMinValue; + const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; + + coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz); + } + + if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) + { + const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0]; + + generateInputCoverage(pCoverageMask, psContext.inputMask, state.blendState.sampleMask); + } + + AR_BEGIN(BEBarycentric, pDC->drawId); + + CalcPixelBarycentrics(coeffs, psContext); + + CalcCentroid(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask); + + // interpolate and quantize z + psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); + psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); + + AR_END(BEBarycentric, 1); + + // interpolate user clip distance if available + if (state.rastState.clipDistanceMask) + { + coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center); + } + + simdscalar vCoverageMask = vMask(coverageMask); + simdscalar depthPassMask = vCoverageMask; + simdscalar stencilPassMask = vCoverageMask; + + // Early-Z? + if (T::bCanEarlyZ) + { + AR_BEGIN(BEEarlyDepthTest, pDC->drawId); + depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, + psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask); + AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask))); + AR_END(BEEarlyDepthTest, 0); + + // early-exit if no pixels passed depth or earlyZ is forced on + if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask)) + { + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask); + + if (!_simd_movemask_ps(depthPassMask)) + { + goto Endtile; + } + } + } + + psContext.sampleIndex = 0; + psContext.activeMask = _simd_castps_si(vCoverageMask); + + // execute pixel shader + AR_BEGIN(BEPixelShader, pDC->drawId); + UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask))); + state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); + AR_END(BEPixelShader, 0); + + vCoverageMask = _simd_castsi_ps(psContext.activeMask); + + // late-Z + if (!T::bCanEarlyZ) + { + AR_BEGIN(BELateDepthTest, pDC->drawId); + depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, + psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask); + AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask))); + AR_END(BELateDepthTest, 0); + + if (!_simd_movemask_ps(depthPassMask)) + { + // need to call depth/stencil write for stencil write + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask); + goto Endtile; + } + } else { + // for early z, consolidate discards from shader + // into depthPassMask + depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask); + } + + uint32_t statMask = _simd_movemask_ps(depthPassMask); + uint32_t statCount = _mm_popcnt_u32(statMask); + UPDATE_STAT_BE(DepthPassCount, statCount); + + // output merger + AR_BEGIN(BEOutputMerger, pDC->drawId); +#if USE_8x2_TILE_BACKEND + OutputMerger8x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset); +#else + OutputMerger4x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets); +#endif + + // do final depth write after all pixel kills + if (!state.psState.forceEarlyZ) + { + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask); + } + AR_END(BEOutputMerger, 0); + } + +Endtile: + AR_BEGIN(BEEndTile, pDC->drawId); + + work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) + { + work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + } + +#if USE_8x2_TILE_BACKEND + if (useAlternateOffset) + { + for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) + { + pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; + } + } +#else + for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) + { + pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; + } +#endif + pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; + pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; + + AR_END(BEEndTile, 0); + + psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); + psContext.vX.center = _simd_add_ps(psContext.vX.center, dx); + } + + psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); + psContext.vY.center = _simd_add_ps(psContext.vY.center, dy); + } + + AR_END(BESingleSampleBackend, 0); +} + +// Recursive template used to auto-nest conditionals. Converts dynamic enum function +// arguments to static template arguments. +template +struct BEChooserSingleSample +{ + // Last Arg Terminator + static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg) + { + switch(tArg) + { + case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample>; break; + case SWR_BACKEND_MSAA_PIXEL_RATE: + case SWR_BACKEND_MSAA_SAMPLE_RATE: + default: + SWR_ASSERT(0 && "Invalid backend func\n"); + return nullptr; + break; + } + } + + // Recursively parse args + template + static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs) + { + switch(tArg) + { + case SWR_INPUT_COVERAGE_NONE: return BEChooserSingleSample::GetFunc(remainingArgs...); break; + case SWR_INPUT_COVERAGE_NORMAL: return BEChooserSingleSample::GetFunc(remainingArgs...); break; + case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooserSingleSample::GetFunc(remainingArgs...); break; + default: + SWR_ASSERT(0 && "Invalid sample pattern\n"); + return BEChooserSingleSample::GetFunc(remainingArgs...); + break; + } + } + + // Recursively parse args + template + static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs) + { + switch(tArg) + { + case SWR_MULTISAMPLE_1X: return BEChooserSingleSample::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_2X: return BEChooserSingleSample::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_4X: return BEChooserSingleSample::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_8X: return BEChooserSingleSample::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_16X: return BEChooserSingleSample::GetFunc(remainingArgs...); break; + default: + SWR_ASSERT(0 && "Invalid sample count\n"); + return BEChooserSingleSample::GetFunc(remainingArgs...); + break; + } + } + + // Recursively parse args + template + static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs) + { + if(tArg == true) + { + return BEChooserSingleSample::GetFunc(remainingArgs...); + } + + return BEChooserSingleSample::GetFunc(remainingArgs...); + } +}; + +void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2]) +{ + for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++) + { + for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++) + { + for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) + { + table[inputCoverage][isCentroid][canEarlyZ] = + BEChooserSingleSample<>::GetFunc(SWR_MULTISAMPLE_1X, false, (SWR_INPUT_COVERAGE)inputCoverage, + (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE); + } + } + } +}