From 0bf1df2bb6e2311c532734c4cb6a096389e511bf Mon Sep 17 00:00:00 2001 From: Alok Hota Date: Thu, 14 Jun 2018 12:30:56 -0500 Subject: [PATCH] swr/rast: Remove deprecated 4x2 backend code - Use 8x2 tiling by default - Remove associated macros - Use SIMDLIB emulation for SIMD16 on SIMD8 hardware - Remove code rot in Load/StoreTile Reviewed-by: Bruce Cherniak --- .../swr/rasterizer/core/backend_clear.cpp | 46 +- .../swr/rasterizer/core/backend_impl.h | 114 +--- .../swr/rasterizer/core/backend_sample.cpp | 30 +- .../rasterizer/core/backend_singlesample.cpp | 31 +- .../drivers/swr/rasterizer/core/knobs.h | 1 - .../drivers/swr/rasterizer/core/tilemgr.cpp | 86 --- .../drivers/swr/rasterizer/memory/LoadTile.h | 16 - .../drivers/swr/rasterizer/memory/StoreTile.h | 529 ------------------ 8 files changed, 19 insertions(+), 834 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp index 623ebc81d77..5750ceac7f0 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp @@ -37,29 +37,11 @@ #include -template -void ClearRasterTile(uint8_t* pTileBuffer, simdvector& value) -{ - auto lambda = [&](int32_t comp) { - FormatTraits::storeSOA(comp, pTileBuffer, value.v[comp]); - - pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits::GetBPC(comp) / 8); - }; - - const uint32_t numIter = - (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM); - - for (uint32_t i = 0; i < numIter; ++i) - { - UnrollerL<0, FormatTraits::numComps, 1>::step(lambda); - } -} - -#if USE_8x2_TILE_BACKEND template void ClearRasterTile(uint8_t* pTileBuffer, simd16vector& value) { - auto lambda = [&](int32_t comp) { + auto lambda = [&](int32_t comp) + { FormatTraits::storeSOA(comp, pTileBuffer, value.v[comp]); pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits::GetBPC(comp) / 8); @@ -74,7 +56,6 @@ void ClearRasterTile(uint8_t* pTileBuffer, simd16vector& value) } } -#endif template INLINE void ClearMacroTile(DRAW_CONTEXT* pDC, HANDLE hWorkerPrivateData, @@ -86,37 +67,22 @@ INLINE void ClearMacroTile(DRAW_CONTEXT* pDC, { // convert clear color to hottile format // clear color is in RGBA float/uint32 -#if USE_8x2_TILE_BACKEND + simd16vector vClear; for (uint32_t comp = 0; comp < FormatTraits::numComps; ++comp) { - simd16scalar vComp; - vComp = _simd16_load1_ps((const float*)&clear[comp]); + simd16scalar vComp = _simd16_load1_ps((const float*)&clear[comp]); + if (FormatTraits::isNormalized(comp)) { vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits::fromFloat(comp))); vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp)); } - vComp = FormatTraits::pack(comp, vComp); - vClear.v[FormatTraits::swizzle(comp)] = vComp; - } + vComp = FormatTraits::pack(comp, vComp); -#else - simdvector vClear; - for (uint32_t comp = 0; comp < FormatTraits::numComps; ++comp) - { - simdscalar vComp; - vComp = _simd_load1_ps((const float*)&clear[comp]); - if (FormatTraits::isNormalized(comp)) - { - vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits::fromFloat(comp))); - vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp)); - } - vComp = FormatTraits::pack(comp, vComp); vClear.v[FormatTraits::swizzle(comp)] = vComp; } -#endif uint32_t tileX, tileY; MacroTileMgr::getTileIndices(macroTile, tileX, tileY); diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h index d556c549704..83d662bd9a7 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h @@ -894,87 +894,6 @@ static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, psContext.vJ.sample); } -// Merge Output to 4x2 SIMD Tile Format -INLINE void OutputMerger4x2(DRAW_CONTEXT* pDC, - SWR_PS_CONTEXT& psContext, - uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], - uint32_t sample, - const SWR_BLEND_STATE* pBlendState, - const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], - simdscalar& coverageMask, - simdscalar const& depthPassMask, - uint32_t renderTargetMask, - uint32_t workerId) -{ - // type safety guaranteed from template instantiation in BEChooser<>::GetFunc - const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample); - simdvector blendOut; - - DWORD rt = 0; - while (_BitScanForward(&rt, renderTargetMask)) - { - renderTargetMask &= ~(1 << rt); - uint8_t* pColorSample = pColorBase[rt] + rasterTileColorOffset; - - const SWR_RENDER_TARGET_BLEND_STATE* pRTBlend = &pBlendState->renderTarget[rt]; - - SWR_BLEND_CONTEXT blendContext = {0}; - { - // pfnBlendFunc may not update all channels. Initialize with PS output. - /// TODO: move this into the blend JIT. - blendOut = psContext.shaded[rt]; - - blendContext.pBlendState = pBlendState; - blendContext.src = &psContext.shaded[rt]; - blendContext.src1 = &psContext.shaded[1]; - blendContext.src0alpha = reinterpret_cast(&psContext.shaded[0].w); - blendContext.sampleNum = sample; - blendContext.pDst = (simdvector*)&pColorSample; - blendContext.result = &blendOut; - blendContext.oMask = &psContext.oMask; - blendContext.pMask = reinterpret_cast(&coverageMask); - - // Blend outputs and update coverage mask for alpha test - if (pfnBlendFunc[rt] != nullptr) - { - pfnBlendFunc[rt](&blendContext); - } - } - - // Track alpha events - AR_EVENT( - AlphaInfoEvent(pDC->drawId, blendContext.isAlphaTested, blendContext.isAlphaBlended)); - - // final write mask - simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask)); - - ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT. - static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, - "Unsupported hot tile format"); - - const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float); - - // store with color mask - if (!pRTBlend->writeDisableRed) - { - _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x); - } - if (!pRTBlend->writeDisableGreen) - { - _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y); - } - if (!pRTBlend->writeDisableBlue) - { - _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z); - } - if (!pRTBlend->writeDisableAlpha) - { - _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w); - } - } -} - -#if USE_8x2_TILE_BACKEND // Merge Output to 8x2 SIMD16 Tile Format INLINE void OutputMerger8x2(DRAW_CONTEXT* pDC, SWR_PS_CONTEXT& psContext, @@ -1076,8 +995,6 @@ INLINE void OutputMerger8x2(DRAW_CONTEXT* pDC, } } -#endif - template void BackendPixelRate(DRAW_CONTEXT* pDC, uint32_t workerId, @@ -1137,9 +1054,9 @@ void BackendPixelRate(DRAW_CONTEXT* pDC, for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { -#if USE_8x2_TILE_BACKEND const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); -#endif + + simdscalar activeLanes; if (!(work.anyCoveredSamples & MASK)) { @@ -1264,7 +1181,7 @@ void BackendPixelRate(DRAW_CONTEXT* pDC, } // broadcast the results of the PS to all passing pixels -#if USE_8x2_TILE_BACKEND + OutputMerger8x2(pDC, psContext, psContext.pColorBuffer, @@ -1276,18 +1193,7 @@ void BackendPixelRate(DRAW_CONTEXT* pDC, state.psState.renderTargetMask, useAlternateOffset, workerId); -#else // USE_8x2_TILE_BACKEND - OutputMerger4x2(pDC, - psContext, - psContext.pColorBuffer, - sample, - &state.blendState, - state.pfnBlendFunc, - coverageMask, - depthMask, - state.psState.renderTargetMask, - workerId); -#endif // USE_8x2_TILE_BACKEND + if (!state.psState.forceEarlyZ && !T::bForcedSampleCount) { @@ -1320,7 +1226,6 @@ void BackendPixelRate(DRAW_CONTEXT* pDC, } work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); -#if USE_8x2_TILE_BACKEND if (useAlternateOffset) { DWORD rt; @@ -1332,16 +1237,7 @@ void BackendPixelRate(DRAW_CONTEXT* pDC, (2 * KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; } } -#else - DWORD rt; - uint32_t rtMask = state.colorHottileEnable; - while (_BitScanForward(&rt, rtMask)) - { - rtMask &= ~(1 << rt); - psContext.pColorBuffer[rt] += - (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; - } -#endif + pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp index ff09cc6caa7..9b0b80f766f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp @@ -81,9 +81,9 @@ void BackendSampleRate(DRAW_CONTEXT* pDC, for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { -#if USE_8x2_TILE_BACKEND const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); -#endif + + if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) { const uint64_t* pCoverageMask = @@ -252,7 +252,7 @@ void BackendSampleRate(DRAW_CONTEXT* pDC, // output merger RDTSC_BEGIN(BEOutputMerger, pDC->drawId); -#if USE_8x2_TILE_BACKEND + OutputMerger8x2(pDC, psContext, psContext.pColorBuffer, @@ -264,18 +264,6 @@ void BackendSampleRate(DRAW_CONTEXT* pDC, state.psState.renderTargetMask, useAlternateOffset, workerId); -#else - OutputMerger4x2(pDC, - psContext, - psContext.pColorBuffer, - sample, - &state.blendState, - state.pfnBlendFunc, - vCoverageMask, - depthPassMask, - state.psState.renderTargetMask, - workerId); -#endif // do final depth write after all pixel kills if (!state.psState.forceEarlyZ) @@ -305,7 +293,6 @@ void BackendSampleRate(DRAW_CONTEXT* pDC, work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); } -#if USE_8x2_TILE_BACKEND if (useAlternateOffset) { DWORD rt; @@ -317,16 +304,7 @@ void BackendSampleRate(DRAW_CONTEXT* pDC, (2 * KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; } } -#else - DWORD rt; - uint32_t rtMask = state.colorHottileEnable; - while (_BitScanForward(&rt, rtMask)) - { - rtMask &= ~(1 << rt); - psContext.pColorBuffer[rt] += - (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; - } -#endif + pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp index 1c065ab14bf..46aabcdf34b 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp @@ -82,9 +82,9 @@ void BackendSingleSample(DRAW_CONTEXT* pDC, for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { -#if USE_8x2_TILE_BACKEND const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); -#endif + + simdmask coverageMask = work.coverageMask[0] & MASK; if (coverageMask) @@ -237,7 +237,7 @@ void BackendSingleSample(DRAW_CONTEXT* pDC, // output merger RDTSC_BEGIN(BEOutputMerger, pDC->drawId); -#if USE_8x2_TILE_BACKEND + OutputMerger8x2(pDC, psContext, psContext.pColorBuffer, @@ -249,19 +249,6 @@ void BackendSingleSample(DRAW_CONTEXT* pDC, state.psState.renderTargetMask, useAlternateOffset, workerId); -#else - OutputMerger4x2(pDC, - psContext, - psContext.pColorBuffer, - 0, - &state.blendState, - state.pfnBlendFunc, - vCoverageMask, - depthPassMask, - state.psState.renderTargetMask, - workerId, - workerId); -#endif // do final depth write after all pixel kills if (!state.psState.forceEarlyZ) @@ -288,7 +275,6 @@ void BackendSingleSample(DRAW_CONTEXT* pDC, work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); } -#if USE_8x2_TILE_BACKEND if (useAlternateOffset) { DWORD rt; @@ -300,16 +286,7 @@ void BackendSingleSample(DRAW_CONTEXT* pDC, (2 * KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; } } -#else - DWORD rt; - uint32_t rtMask = state.colorHottileEnable; - while (_BitScanForward(&rt, rtMask)) - { - rtMask &= ~(1 << rt); - psContext.pColorBuffer[rt] += - (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; - } -#endif + pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h index b52accbbab3..8cccbf416af 100644 --- a/src/gallium/drivers/swr/rasterizer/core/knobs.h +++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h @@ -39,7 +39,6 @@ /////////////////////////////////////////////////////////////////////////////// #define ENABLE_AVX512_SIMD16 1 -#define USE_8x2_TILE_BACKEND 1 #define USE_SIMD16_FRONTEND 1 #define USE_SIMD16_SHADERS 1 // requires USE_SIMD16_FRONTEND #define USE_SIMD16_VS 1 // requires USE_SIMD16_SHADERS diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp index 87d5373a215..1ea1c4b1a6f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp @@ -244,7 +244,6 @@ HOTTILE* HotTileMgr::GetHotTileNoLoad(SWR_CONTEXT* pContext, return &hotTile; } -#if USE_8x2_TILE_BACKEND void HotTileMgr::ClearColorHotTile( const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. { @@ -330,91 +329,6 @@ void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile) } } -#else -void HotTileMgr::ClearColorHotTile( - const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. -{ - // Load clear color into SIMD register... - float* pClearData = (float*)(pHotTile->clearData); - simdscalar valR = _simd_broadcast_ss(&pClearData[0]); - simdscalar valG = _simd_broadcast_ss(&pClearData[1]); - simdscalar valB = _simd_broadcast_ss(&pClearData[2]); - simdscalar valA = _simd_broadcast_ss(&pClearData[3]); - - float* pfBuf = (float*)pHotTile->pBuffer; - uint32_t numSamples = pHotTile->numSamples; - - for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) - { - for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) - { - for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); - si += - SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) // SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++) - { - _simd_store_ps(pfBuf, valR); - pfBuf += KNOB_SIMD_WIDTH; - _simd_store_ps(pfBuf, valG); - pfBuf += KNOB_SIMD_WIDTH; - _simd_store_ps(pfBuf, valB); - pfBuf += KNOB_SIMD_WIDTH; - _simd_store_ps(pfBuf, valA); - pfBuf += KNOB_SIMD_WIDTH; - } - } - } -} - -void HotTileMgr::ClearDepthHotTile( - const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. -{ - // Load clear color into SIMD register... - float* pClearData = (float*)(pHotTile->clearData); - simdscalar valZ = _simd_broadcast_ss(&pClearData[0]); - - float* pfBuf = (float*)pHotTile->pBuffer; - uint32_t numSamples = pHotTile->numSamples; - - for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) - { - for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) - { - for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); - si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) - { - _simd_store_ps(pfBuf, valZ); - pfBuf += KNOB_SIMD_WIDTH; - } - } - } -} - -void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile) -{ - // convert from F32 to U8. - uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]); - // broadcast 32x into __m256i... - simdscalari valS = _simd_set1_epi8(clearVal); - - simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer; - uint32_t numSamples = pHotTile->numSamples; - - for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) - { - for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) - { - // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly. - for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); - si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4) - { - _simd_store_si(pBuf, valS); - pBuf += 1; - } - } - } -} - -#endif ////////////////////////////////////////////////////////////////////////// /// @brief InitializeHotTiles /// for draw calls, we initialize the active hot tiles and perform deferred diff --git a/src/gallium/drivers/swr/rasterizer/memory/LoadTile.h b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.h index d1cc3ed207f..d85a3353526 100644 --- a/src/gallium/drivers/swr/rasterizer/memory/LoadTile.h +++ b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.h @@ -67,7 +67,6 @@ struct LoadRasterTile uint32_t x, uint32_t y, uint8_t* pDst) { -#if USE_8x2_TILE_BACKEND typedef SimdTile_16 SimdT; SimdT* pDstSimdTiles = (SimdT*)pDst; @@ -81,21 +80,6 @@ struct LoadRasterTile uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM); pSimdTile->SetSwizzledColor(simdOffset, srcColor); -#else - typedef SimdTile SimdT; - - SimdT* pDstSimdTiles = (SimdT*)pDst; - - // Compute which simd tile we're accessing within 8x8 tile. - // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. - uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM); - - SimdT* pSimdTile = &pDstSimdTiles[simdIndex]; - - uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM); - - pSimdTile->SetSwizzledColor(simdOffset, srcColor); -#endif } ////////////////////////////////////////////////////////////////////////// diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h index 29717261e7f..407cefae54e 100644 --- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h +++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h @@ -104,7 +104,6 @@ struct StorePixels<8, 2> } }; -#if USE_8x2_TILE_BACKEND template <> struct StorePixels<8, 4> { @@ -130,7 +129,6 @@ struct StorePixels<8, 4> } }; -#endif ////////////////////////////////////////////////////////////////////////// /// StorePixels (32-bit pixel specialization) /// @brief Stores a 4x2 (AVX) raster-tile to two rows. @@ -159,7 +157,6 @@ struct StorePixels<16, 2> } }; -#if USE_8x2_TILE_BACKEND template <> struct StorePixels<16, 4> { @@ -185,7 +182,6 @@ struct StorePixels<16, 4> } }; -#endif ////////////////////////////////////////////////////////////////////////// /// StorePixels (32-bit pixel specialization) /// @brief Stores a 4x2 (AVX) raster-tile to two rows. @@ -213,7 +209,6 @@ struct StorePixels<32, 2> } }; -#if USE_8x2_TILE_BACKEND template <> struct StorePixels<32, 4> { @@ -237,7 +232,6 @@ struct StorePixels<32, 4> } }; -#endif ////////////////////////////////////////////////////////////////////////// /// StorePixels (32-bit pixel specialization) /// @brief Stores a 4x2 (AVX) raster-tile to two rows. @@ -264,7 +258,6 @@ struct StorePixels<64, 4> } }; -#if USE_8x2_TILE_BACKEND template <> struct StorePixels<64, 8> { @@ -287,7 +280,6 @@ struct StorePixels<64, 8> } }; -#endif ////////////////////////////////////////////////////////////////////////// /// StorePixels (32-bit pixel specialization) /// @brief Stores a 4x2 (AVX) raster-tile to two rows. @@ -318,7 +310,6 @@ struct StorePixels<128, 8> } }; -#if USE_8x2_TILE_BACKEND template <> struct StorePixels<128, 16> { @@ -339,7 +330,6 @@ struct StorePixels<128, 16> } }; -#endif ////////////////////////////////////////////////////////////////////////// /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) ////////////////////////////////////////////////////////////////////////// @@ -354,7 +344,6 @@ struct ConvertPixelsSOAtoAOS template INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) { -#if USE_8x2_TILE_BACKEND static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; @@ -368,21 +357,6 @@ struct ConvertPixelsSOAtoAOS // Convert from SOA --> AOS FormatTraits::TransposeT::Transpose_16(soaTile, aosTile); -#else - static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel - - OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; - OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; - - // Convert from SrcFormat --> DstFormat - simdvector src; - LoadSOA(pSrc, src); - StoreSOA(src, soaTile); - - // Convert from SOA --> AOS - FormatTraits::TransposeT::Transpose(soaTile, aosTile); - -#endif // Store data into destination StorePixels::bpp, NumDests>::Store(aosTile, ppDsts); } @@ -403,7 +377,6 @@ struct ConvertPixelsSOAtoAOS template INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) { -#if USE_8x2_TILE_BACKEND static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; @@ -411,15 +384,6 @@ struct ConvertPixelsSOAtoAOS // Convert from SOA --> AOS FormatTraits::TransposeT::Transpose_16(pSrc, aosTile); -#else - static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel - - OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; - - // Convert from SOA --> AOS - FormatTraits::TransposeT::Transpose(pSrc, aosTile); - -#endif // Store data into destination StorePixels::bpp, NumDests>::Store(aosTile, ppDsts); } @@ -439,7 +403,6 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM > template INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) { -#if USE_8x2_TILE_BACKEND static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT; static const SWR_FORMAT DstFormat = B5G6R5_UNORM; @@ -483,47 +446,6 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM > *pAosTile++ = *pPacked++; } -#else - static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT; - static const SWR_FORMAT DstFormat = B5G6R5_UNORM; - static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel - - OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; - - // Load hot-tile - simdvector src, dst; - LoadSOA(pSrc, src); - - // deswizzle - dst.x = src[FormatTraits::swizzle(0)]; - dst.y = src[FormatTraits::swizzle(1)]; - dst.z = src[FormatTraits::swizzle(2)]; - - // clamp - dst.x = Clamp(dst.x, 0); - dst.y = Clamp(dst.y, 1); - dst.z = Clamp(dst.z, 2); - - // normalize - dst.x = Normalize(dst.x, 0); - dst.y = Normalize(dst.y, 1); - dst.z = Normalize(dst.z, 2); - - // pack - simdscalari packed = _simd_castps_si(dst.x); - packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits::GetConstBPC(0))); - packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits::GetConstBPC(0) + - FormatTraits::GetConstBPC(1))); - - // pack low 16 bits of each 32 bit lane to low 128 bits of dst - uint32_t *pPacked = (uint32_t*)&packed; - uint16_t *pAosTile = (uint16_t*)&aosTile[0]; - for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t) - { - *pAosTile++ = *pPacked++; - } - -#endif // Store data into destination StorePixels::bpp, NumDests>::Store(aosTile, ppDsts); } @@ -546,7 +468,6 @@ struct ConvertPixelsSOAtoAOS template INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) { -#if USE_8x2_TILE_BACKEND simd16scalar comp = _simd16_load_ps(reinterpret_cast(pSrc)); // clamp @@ -579,46 +500,9 @@ struct ConvertPixelsSOAtoAOS _simd_storeu2_si(reinterpret_cast(ppDsts[1]), reinterpret_cast(ppDsts[0]), _simd16_extract_si(dest, 0)); _simd_storeu2_si(reinterpret_cast(ppDsts[3]), reinterpret_cast(ppDsts[2]), _simd16_extract_si(dest, 1)); -#else - static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel - - OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; - OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; - - // Convert from SrcFormat --> DstFormat - simdvector src; - LoadSOA(pSrc, src); - StoreSOA(src, soaTile); - - // Convert from SOA --> AOS - FormatTraits::TransposeT::Transpose(soaTile, aosTile); - - // Store data into destination but don't overwrite the X8 bits - // Each 4-pixel row is 16-bytes - simd4scalari *pZRow01 = (simd4scalari*)aosTile; - simd4scalari vQuad00 = SIMD128::load_si(pZRow01); - simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1); - - simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01); - simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01); - - simd4scalari vDst0 = SIMD128::loadu_si((const simd4scalari*)ppDsts[0]); - simd4scalari vDst1 = SIMD128::loadu_si((const simd4scalari*)ppDsts[1]); - - simd4scalari vMask = _mm_set1_epi32(0xFFFFFF); - - vDst0 = SIMD128::andnot_si(vMask, vDst0); - vDst0 = SIMD128::or_si(vDst0, SIMD128::and_si(vRow00, vMask)); - vDst1 = SIMD128::andnot_si(vMask, vDst1); - vDst1 = SIMD128::or_si(vDst1, SIMD128::and_si(vRow10, vMask)); - - SIMD128::storeu_si((simd4scalari*)ppDsts[0], vDst0); - SIMD128::storeu_si((simd4scalari*)ppDsts[1], vDst1); -#endif } }; -#if USE_8x2_TILE_BACKEND template INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3) { @@ -689,7 +573,6 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDs _simd_storeu2_si(reinterpret_cast(pDst3), reinterpret_cast(pDst2), _simd16_extract_si(final, 1)); } -#endif template INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1) { @@ -790,7 +673,6 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final); } -#if USE_8x2_TILE_BACKEND template INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3) { @@ -854,7 +736,6 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8 _simd_storeu2_si(reinterpret_cast(pDst3), reinterpret_cast(pDst2), _simd16_extract_si(final, 1)); } -#endif template INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1) { @@ -947,11 +828,7 @@ struct ConvertPixelsSOAtoAOS template INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) { -#if USE_8x2_TILE_BACKEND FlatConvert(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); -#else - FlatConvert(pSrc, ppDsts[0], ppDsts[1]); -#endif } }; @@ -961,11 +838,7 @@ struct ConvertPixelsSOAtoAOS template INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) { -#if USE_8x2_TILE_BACKEND FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); -#else - FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1]); -#endif } }; @@ -975,11 +848,7 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB > template INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) { -#if USE_8x2_TILE_BACKEND FlatConvert(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); -#else - FlatConvert(pSrc, ppDsts[0], ppDsts[1]); -#endif } }; @@ -989,11 +858,7 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB > template INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) { -#if USE_8x2_TILE_BACKEND FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); -#else - FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1]); -#endif } }; @@ -1003,11 +868,7 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM > template INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) { -#if USE_8x2_TILE_BACKEND FlatConvert(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); -#else - FlatConvert(pSrc, ppDsts[0], ppDsts[1]); -#endif } }; @@ -1017,11 +878,7 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM > template INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) { -#if USE_8x2_TILE_BACKEND FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); -#else - FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1]); -#endif } }; @@ -1031,11 +888,7 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB > template INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) { -#if USE_8x2_TILE_BACKEND FlatConvert(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); -#else - FlatConvert(pSrc, ppDsts[0], ppDsts[1]); -#endif } }; @@ -1045,11 +898,7 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB > template INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) { -#if USE_8x2_TILE_BACKEND FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); -#else - FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1]); -#endif } }; @@ -1069,7 +918,6 @@ struct StoreRasterTile uint32_t x, uint32_t y, float outputColor[4]) { -#if USE_8x2_TILE_BACKEND typedef SimdTile_16 SimdT; SimdT *pSrcSimdTiles = reinterpret_cast(pSrc); @@ -1083,21 +931,6 @@ struct StoreRasterTile uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM); pSimdTile->GetSwizzledColor(simdOffset, outputColor); -#else - typedef SimdTile SimdT; - - SimdT* pSrcSimdTiles = (SimdT*)pSrc; - - // Compute which simd tile we're accessing within 8x8 tile. - // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. - uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM); - - SimdT* pSimdTile = &pSrcSimdTiles[simdIndex]; - - uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM); - - pSimdTile->GetSwizzledColor(simdOffset, outputColor); -#endif } ////////////////////////////////////////////////////////////////////////// @@ -1230,7 +1063,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat> uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -#if USE_8x2_TILE_BACKEND const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; @@ -1262,27 +1094,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat> ppDsts[2] += dy; ppDsts[3] += dy; } -#else - uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; - - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) - { - uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; - - for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) - { - // Format conversion and convert from SOA to AOS, and store the rows. - ConvertPixelsSOAtoAOS::Convert(pSrc, ppRows); - - ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; - ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; - pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; - } - - ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; - ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; - } -#endif } }; @@ -1317,7 +1128,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -#if USE_8x2_TILE_BACKEND const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; @@ -1349,27 +1159,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat ppDsts[2] += dy; ppDsts[3] += dy; } -#else - uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; - - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) - { - uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; - - for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) - { - // Format conversion and convert from SOA to AOS, and store the rows. - ConvertPixelsSOAtoAOS::Convert(pSrc, ppRows); - - ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; - ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; - pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; - } - - ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; - ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; - } -#endif } }; @@ -1404,7 +1193,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -#if USE_8x2_TILE_BACKEND const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; @@ -1436,27 +1224,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat ppDsts[2] += dy; ppDsts[3] += dy; } -#else - uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; - - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) - { - uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; - - for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) - { - // Format conversion and convert from SOA to AOS, and store the rows. - ConvertPixelsSOAtoAOS::Convert(pSrc, ppRows); - - ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; - ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; - pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; - } - - ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; - ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; - } -#endif } }; @@ -1470,10 +1237,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; static const size_t MAX_DST_COLUMN_BYTES = 16; -#if !USE_8x2_TILE_BACKEND - static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; - static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; -#endif ////////////////////////////////////////////////////////////////////////// /// @brief Stores an 8x8 raster tile to the destination surface. @@ -1496,7 +1259,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -#if USE_8x2_TILE_BACKEND const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch; @@ -1530,43 +1292,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat ppDsts[i] += dy; } } -#else - uint8_t* ppDsts[] = - { - pDst, // row 0, col 0 - pDst + pDstSurface->pitch, // row 1, col 0 - pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 - pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 - }; - - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) - { - uint8_t* ppStartRows[] = - { - ppDsts[0], - ppDsts[1], - ppDsts[2], - ppDsts[3], - }; - - for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) - { - // Format conversion and convert from SOA to AOS, and store the rows. - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); - - ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; - ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; - ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; - ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; - pSrc += SRC_COLUMN_BYTES; - } - - ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch; - ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch; - ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch; - ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch; - } -#endif } }; @@ -1580,10 +1305,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, DstForma static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; static const size_t MAX_DST_COLUMN_BYTES = 16; -#if !USE_8x2_TILE_BACKEND - static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; - static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; -#endif ////////////////////////////////////////////////////////////////////////// /// @brief Stores an 8x8 raster tile to the destination surface. @@ -1606,7 +1327,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, DstForma uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -#if USE_8x2_TILE_BACKEND const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch; @@ -1648,51 +1368,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, DstForma ppDsts[i] += dy; } } -#else - struct DstPtrs - { - uint8_t* ppDsts[8]; - } ptrs; - - // Need 8 pointers, 4 columns of 2 rows each - for (uint32_t y = 0; y < 2; ++y) - { - for (uint32_t x = 0; x < 4; ++x) - { - ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES; - } - } - - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) - { - DstPtrs startPtrs = ptrs; - - for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) - { - // Format conversion and convert from SOA to AOS, and store the rows. - ConvertPixelsSOAtoAOS::Convert(pSrc, ptrs.ppDsts); - - ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC; - pSrc += SRC_COLUMN_BYTES; - } - - ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch; - ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch; - ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch; - ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch; - ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch; - ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch; - ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch; - ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch; - } -#endif } }; @@ -1728,7 +1403,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, Dst // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. // We can compute the offsets to each column within the raster tile once and increment from these. -#if USE_8x2_TILE_BACKEND // There will be 4 8x2 simd tiles in an 8x8 raster tile. uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); @@ -1758,32 +1432,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, Dst ppDsts[2] += dy; ppDsts[3] += dy; } -#else - // There will be 8 4x2 simd tiles in an 8x8 raster tile. - uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - - // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. - uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; - - // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) - { - uint32_t rowOffset = row * DestRowWidthBytes; - - uint8_t* pRow = pCol0 + rowOffset; - uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; - - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); - pSrc += pSrcInc; - - ppDsts[0] += DestRowWidthBytes / 4; - ppDsts[1] += DestRowWidthBytes / 4; - - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); - pSrc += pSrcInc; - } -#endif } }; @@ -1819,7 +1467,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, Ds // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. // We can compute the offsets to each column within the raster tile once and increment from these. -#if USE_8x2_TILE_BACKEND // There will be 4 8x2 simd tiles in an 8x8 raster tile. uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); @@ -1849,32 +1496,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, Ds ppDsts[2] += dy; ppDsts[3] += dy; } -#else - // There will be 8 4x2 simd tiles in an 8x8 raster tile. - uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - - // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. - uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; - - // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) - { - uint32_t rowOffset = row * DestRowWidthBytes; - - uint8_t* pRow = pCol0 + rowOffset; - uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; - - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); - pSrc += pSrcInc; - - ppDsts[0] += DestRowWidthBytes / 2; - ppDsts[1] += DestRowWidthBytes / 2; - - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); - pSrc += pSrcInc; - } -#endif } }; @@ -1911,7 +1532,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, Ds // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows. // We can compute the offsets to each column within the raster tile once and increment from these. -#if USE_8x2_TILE_BACKEND uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); @@ -1945,28 +1565,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, Ds ppDsts[2] += dy; ppDsts[3] += dy; } -#else - uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - uint8_t* pRow1 = pRow0 + DestRowWidthBytes; - - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) - { - for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM) - { - uint32_t xRowOffset = col * (FormatTraits::bpp / 8); - - uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset }; - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); - - // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. - pSrc += (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; - } - - pRow0 += (DestRowWidthBytes * 2); - pRow1 += (DestRowWidthBytes * 2); - } -#endif } }; @@ -2003,7 +1601,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, Ds // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. // We can compute the offsets to each column within the raster tile once and increment from these. -#if USE_8x2_TILE_BACKEND // There will be 4 8x2 simd tiles in an 8x8 raster tile. uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); @@ -2034,32 +1631,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, Ds ppDsts[2] += dy; ppDsts[3] += dy; } -#else - // There will be 8 4x2 simd tiles in an 8x8 raster tile. - uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - - // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. - uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; - - // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) - { - uint32_t rowOffset = row * DestRowWidthBytes; - - uint8_t* pRow = pCol0 + rowOffset; - uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; - - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); - pSrc += pSrcInc; - - ppDsts[0] += DestColumnBytes; - ppDsts[1] += DestColumnBytes; - - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); - pSrc += pSrcInc; - } -#endif } }; @@ -2096,7 +1667,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, Ds // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. // We can compute the offsets to each column within the raster tile once and increment from these. -#if USE_8x2_TILE_BACKEND // There will be 4 8x2 simd tiles in an 8x8 raster tile. uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); @@ -2131,40 +1701,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, Ds ppDsts[i] += dy; } } -#else - // There will be 8 4x2 simd tiles in an 8x8 raster tile. - uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - uint8_t* pCol1 = pCol0 + DestColumnBytes; - - // There are 4 columns, each 2 pixels wide when we have 64bpp pixels. - // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. - uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; - - // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) - { - uint32_t rowOffset = row * DestRowWidthBytes; - uint8_t* ppDsts[] = - { - pCol0 + rowOffset, - pCol0 + rowOffset + DestRowWidthBytes, - pCol1 + rowOffset, - pCol1 + rowOffset + DestRowWidthBytes, - }; - - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); - pSrc += pSrcInc; - - ppDsts[0] += DestColumnBytes * 2; - ppDsts[1] += DestColumnBytes * 2; - ppDsts[2] += DestColumnBytes * 2; - ppDsts[3] += DestColumnBytes * 2; - - ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); - pSrc += pSrcInc; - } -#endif } }; @@ -2175,22 +1711,8 @@ template struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat> { typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; -#if USE_8x2_TILE_BACKEND - static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; - -#else - static const size_t TILE_Y_COL_WIDTH_BYTES = 16; - static const size_t TILE_Y_ROWS = 32; - static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES; - - static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; - static const size_t MAX_DST_COLUMN_BYTES = 16; - - static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; - static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4; -#endif ////////////////////////////////////////////////////////////////////////// /// @brief Stores an 8x8 raster tile to the destination surface. /// @param pSrc - Pointer to raster tile. @@ -2201,10 +1723,8 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, D SWR_SURFACE_STATE* pDstSurface, uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) { -#if USE_8x2_TILE_BACKEND static const uint32_t DestRowWidthBytes = 16; // 16B rows static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. -#endif // Punt non-full tiles to generic store uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); @@ -2217,7 +1737,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, D // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. // We can compute the offsets to each column within the raster tile once and increment from these. -#if USE_8x2_TILE_BACKEND // There will be 4 8x2 simd tiles in an 8x8 raster tile. uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); @@ -2260,54 +1779,6 @@ struct OptStoreRasterTile< TilingTraits, SrcFormat, D ppDsts[i] += dy; } } -#else - // There will be 8 4x2 simd tiles in an 8x8 raster tile. - uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, - pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); - struct DstPtrs - { - uint8_t* ppDsts[8]; - } ptrs; - - // Need 8 pointers, 4 columns of 2 rows each - for (uint32_t y = 0; y < 2; ++y) - { - for (uint32_t x = 0; x < 4; ++x) - { - ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES; - } - } - - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) - { - DstPtrs startPtrs = ptrs; - - for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) - { - // Format conversion and convert from SOA to AOS, and store the rows. - ConvertPixelsSOAtoAOS::Convert(pSrc, ptrs.ppDsts); - - ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC; - ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC; - pSrc += SRC_COLUMN_BYTES; - } - - ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES; - ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES; - ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES; - ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES; - ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES; - ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES; - ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES; - ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES; - } -#endif } }; -- 2.30.2