From c6ca1265919e1a8bbabdd89a6cf79fb1aa7c93ec Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Mon, 27 Jun 2016 15:50:58 -0600 Subject: [PATCH] swr: [rasterizer core] conservative rast backend changes Signed-off-by: Tim Rowley --- .../drivers/swr/rasterizer/core/api.cpp | 6 +- .../drivers/swr/rasterizer/core/backend.cpp | 22 +- .../swr/rasterizer/core/conservativeRast.h | 111 +++- .../drivers/swr/rasterizer/core/frontend.cpp | 28 +- .../drivers/swr/rasterizer/core/frontend.h | 8 - .../swr/rasterizer/core/rasterizer.cpp | 511 ++++++++++++------ .../drivers/swr/rasterizer/core/rasterizer.h | 71 ++- .../drivers/swr/rasterizer/core/state.h | 2 +- 8 files changed, 538 insertions(+), 221 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index 6f9c4027719..6460a16ec3f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -760,8 +760,8 @@ void SetupMacroTileScissors(DRAW_CONTEXT *pDC) // templated backend function tables extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX]; extern PFN_BACKEND_FUNC gBackendSingleSample[2][2][2]; -extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2][2]; -extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2][2]; +extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][2][2][2][2]; +extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2]; void SetupPipeline(DRAW_CONTEXT *pDC) { DRAW_STATE* pState = pDC->pState; @@ -780,7 +780,7 @@ void SetupPipeline(DRAW_CONTEXT *pDC) const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.forcedSampleCount) ? 1 : 0; const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0; const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesSourceDepth && !psState.usesUAV)) ? 1 : 0; - const uint32_t inputCoverage = (psState.inputCoverage != SWR_INPUT_COVERAGE_NONE); + const uint32_t inputCoverage = (psState.inputCoverage != SWR_INPUT_COVERAGE_NONE) ? 1 : 0; SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask; diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp index 8e1fa78d8a8..b492810b812 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp @@ -1154,12 +1154,13 @@ PFN_BACKEND_FUNC gBackendSingleSample[2] // input coverage = {}; PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX] [SWR_MSAA_SAMPLE_PATTERN_MAX] - [SWR_INPUT_COVERAGE_MAX] + [2] // input coverage [2] // centroid [2] // forcedSampleCount [2] // canEarlyZ = {}; -PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX] +PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX] + [2] // input coverage [2] // centroid [2] // canEarlyZ = {}; @@ -1232,28 +1233,27 @@ struct BEChooser void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[2][2][2]) { - for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < SWR_INPUT_COVERAGE_MAX; inputCoverage++) + for(uint32_t inputCoverage = 0; inputCoverage < 2; inputCoverage++) { for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++) { for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) { table[inputCoverage][isCentroid][canEarlyZ] = - BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), + BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, (inputCoverage > 0), (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE); } } } } -void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX] - [2][2][2]) +void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][2][2][2][2]) { for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_MAX; sampleCount++) { for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < SWR_MSAA_SAMPLE_PATTERN_MAX; samplePattern++) { - for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < SWR_INPUT_COVERAGE_MAX; inputCoverage++) + for(uint32_t inputCoverage = 0; inputCoverage < 2; inputCoverage++) { for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++) { @@ -1262,7 +1262,7 @@ void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_MA for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) { table[sampleCount][samplePattern][inputCoverage][isCentroid][forcedSampleCount][canEarlyZ] = - BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), + BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (inputCoverage > 0), (isCentroid > 0), (forcedSampleCount > 0), (canEarlyZ > 0), SWR_BACKEND_MSAA_PIXEL_RATE); } } @@ -1272,18 +1272,18 @@ void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_MA } } -void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2][2]) +void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_MAX][2][2][2]) { for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_MAX; sampleCount++) { - for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < SWR_INPUT_COVERAGE_MAX; inputCoverage++) + for(uint32_t inputCoverage = 0; inputCoverage < 2; inputCoverage++) { for(uint32_t centroid = 0; centroid < 2; centroid++) { for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) { table[sampleCount][inputCoverage][centroid][canEarlyZ] = - BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), + BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (inputCoverage > 0), (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE); } } diff --git a/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h b/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h index f8aa8df76c9..ca4c19ec90d 100644 --- a/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h +++ b/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h @@ -31,7 +31,8 @@ enum FixedPointFmt { FP_UNINIT, _16_8, - _16_9 + _16_9, + _X_16, }; ////////////////////////////////////////////////////////////////////////// @@ -39,6 +40,7 @@ enum FixedPointFmt typedef std::integral_constant Fixed_Uninit; typedef std::integral_constant Fixed_16_8; typedef std::integral_constant Fixed_16_9; +typedef std::integral_constant Fixed_X_16; ////////////////////////////////////////////////////////////////////////// /// @struct FixedPointTraits @@ -53,9 +55,9 @@ template<> struct FixedPointTraits { /// multiplier to go from FP32 to Fixed Point 16.8 - typedef std::integral_constant FixedPointScaleT; + typedef std::integral_constant ScaleT; /// number of bits to shift to go from 16.8 fixed => int32 - typedef std::integral_constant FixedPointShiftT; + typedef std::integral_constant BitsT; typedef Fixed_16_8 TypeT; }; @@ -65,12 +67,24 @@ template<> struct FixedPointTraits { /// multiplier to go from FP32 to Fixed Point 16.9 - typedef std::integral_constant FixedPointScaleT; + typedef std::integral_constant ScaleT; /// number of bits to shift to go from 16.9 fixed => int32 - typedef std::integral_constant FixedPointShiftT; + typedef std::integral_constant BitsT; typedef Fixed_16_9 TypeT; }; +////////////////////////////////////////////////////////////////////////// +/// @brief Fixed_16_9 specialization of FixedPointTraits +template<> +struct FixedPointTraits +{ + /// multiplier to go from FP32 to Fixed Point X.16 + typedef std::integral_constant ScaleT; + /// number of bits to shift to go from X.16 fixed => int32 + typedef std::integral_constant BitsT; + typedef Fixed_X_16 TypeT; +}; + ////////////////////////////////////////////////////////////////////////// /// @brief convenience typedefs for conservative rasterization modes typedef std::false_type StandardRastT; @@ -118,3 +132,90 @@ struct ConservativeRastFETraits /// @brief convenience typedefs for ConservativeRastFETraits typedef ConservativeRastFETraits FEStandardRastT; typedef ConservativeRastFETraits FEConservativeRastT; + +////////////////////////////////////////////////////////////////////////// +/// @struct ConservativeRastBETraits +/// @brief primary ConservativeRastBETraits template. Shouldn't be instantiated; +/// default to standard rasterization behavior +/// @tparam ConservativeT: type of conservative rasterization +/// @tparam InputCoverageT: type of input coverage requested, if any +template +struct ConservativeRastBETraits { + typedef std::false_type IsConservativeT; + typedef FixedPointTraits ConservativePrecisionT; + typedef std::integral_constant ConservativeEdgeOffsetT; + typedef std::integral_constant InnerConservativeEdgeOffsetT; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief StandardRastT specialization of ConservativeRastBETraits +template +struct ConservativeRastBETraits +{ + typedef std::false_type IsConservativeT; + typedef FixedPointTraits ConservativePrecisionT; + typedef std::integral_constant ConservativeEdgeOffsetT; + typedef std::integral_constant InnerConservativeEdgeOffsetT; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief ConservativeRastT specialization of ConservativeRastBETraits +/// with no input coverage +template <> +struct ConservativeRastBETraits +{ + typedef std::true_type IsConservativeT; + typedef NoInputCoverageT InputCoverageT; + + typedef FixedPointTraits ConservativePrecisionT; + + /// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision + /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead of + /// of having to compare individual edges to pixel corners to check if any part of the triangle + /// intersects a pixel + typedef std::integral_constant ConservativeEdgeOffsetT; + typedef std::integral_constant InnerConservativeEdgeOffsetT; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief ConservativeRastT specialization of ConservativeRastBETraits +/// with OuterConservativeCoverage +template <> +struct ConservativeRastBETraits +{ + typedef std::true_type IsConservativeT; + typedef OuterConservativeCoverageT InputCoverageT; + + typedef FixedPointTraits ConservativePrecisionT; + + /// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision + /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead of + /// of having to compare individual edges to pixel corners to check if any part of the triangle + /// intersects a pixel + typedef std::integral_constant ConservativeEdgeOffsetT; + typedef std::integral_constant InnerConservativeEdgeOffsetT; + +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief ConservativeRastT specialization of ConservativeRastBETraits +/// with InnerConservativeCoverage +template <> +struct ConservativeRastBETraits +{ + typedef std::true_type IsConservativeT; + typedef InnerConservativeCoverageT InputCoverageT; + + typedef FixedPointTraits ConservativePrecisionT; + + /// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision + /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead of + /// of having to compare individual edges to pixel corners to check if any part of the triangle + /// intersects a pixel + typedef std::integral_constant ConservativeEdgeOffsetT; + + /// offset edge towards from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision + /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead of + /// of having to compare individual edges to pixel corners to check if a pixel is fully covered by a triangle + typedef std::integral_constant(-((ConservativePrecisionT::ScaleT::value/2) + 1))> InnerConservativeEdgeOffsetT; +}; \ No newline at end of file diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index 7f3e33e7aa7..cc8ebda35bc 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -1596,7 +1596,7 @@ void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, template > INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn) { - simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::FixedPointScaleT::value)); + simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value)); return _simd_cvtps_epi32(vFixed); } @@ -1842,10 +1842,13 @@ void BinTriangles( /// Note: these variable initializations must stay above any 'goto endBenTriangles' // compute per tri backface uint32_t frontFaceMask = frontWindingTris; - uint32_t *pPrimID = (uint32_t *)&primID; DWORD triIndex = 0; - + // for center sample pattern, all samples are at pixel center; calculate coverage + // once at center and broadcast the results in the backend + uint32_t sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X; + PFN_WORK_FUNC pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0), + pDC->pState->state.psState.inputCoverage, (rastState.scissorEnable > 0)); if (!triMask) { goto endBinTriangles; @@ -1945,34 +1948,23 @@ void BinTriangles( _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); } - // scan remaining valid triangles and bin each separately while (_BitScanForward(&triIndex, triMask)) { uint32_t linkageCount = state.linkageCount; uint32_t linkageMask = state.linkageMask; uint32_t numScalarAttribs = linkageCount * 4; - + BE_WORK work; work.type = DRAW; + work.pfnWork = pfnWork; TRIANGLE_WORK_DESC &desc = work.desc.tri; desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1); desc.triFlags.primID = pPrimID[triIndex]; desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex]; - - if(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) - { - work.pfnWork = gRasterizerTable[rastState.scissorEnable][rastState.sampleCount]; - } - else - { - // for center sample pattern, all samples are at pixel center; calculate coverage - // once at center and broadcast the results in the backend - work.pfnWork = gRasterizerTable[rastState.scissorEnable][SWR_MULTISAMPLE_1X]; - } - + auto pArena = pDC->pArena; SWR_ASSERT(pArena != nullptr); @@ -2028,7 +2020,7 @@ struct FEBinTrianglesChooser } }; -// Selector for correct templated Draw front-end function +// Selector for correct templated BinTrinagles function PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative) { return TemplateArgUnroller::GetFunc(IsConservative); diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h index 2de5d269036..9142101089e 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.h +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h @@ -246,14 +246,6 @@ void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, BBOX &bbox) bbox.right = _mm_extract_epi32(vMaxX, 0); bbox.top = _mm_extract_epi32(vMinY, 0); bbox.bottom = _mm_extract_epi32(vMaxY, 0); - -#if 0 - Jacob: A = _mm_shuffle_ps(X, Y, 0 0 0 0) -B = _mm_shuffle_ps(Z, W, 0 0 0 0) -A = _mm_shuffle_epi32(A, 3 0 3 0) -A = _mm_shuffle_ps(A, B, 1 0 1 0) -#endif - } INLINE diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp index 54c2904b5fb..c9b0285a5c8 100644 --- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp @@ -30,7 +30,6 @@ #include #include "rasterizer.h" -#include "multisample.h" #include "rdtsc_core.h" #include "backend.h" #include "utils.h" @@ -38,11 +37,12 @@ #include "tilemgr.h" #include "memory/tilingtraits.h" -void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, - uint32_t numSamples, uint32_t renderTargetArrayIndex); -void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers, uint32_t colorTileStep, uint32_t depthTileStep, uint32_t stencilTileStep); -void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow, - uint32_t colorRowStep, uint32_t depthRowStep, uint32_t stencilRowStep); +template +void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex); +template +void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers); +template +void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow); #define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3} const __m256d gMaskToVecpd[] = @@ -254,7 +254,7 @@ INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdg // Top left: a sample is in if it is a top or left edge. // Out: !(horizontal && above) = !horizontal && below // Out: !horizontal && left = !(!horizontal && left) = horizontal and right -INLINE __m256d adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, const __m256d vEdge) +INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge) { // if vA < 0, vC-- // if vA == 0 && vB < 0, vC-- @@ -271,8 +271,110 @@ INLINE __m256d adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, con msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB)); // if either of these are true and we're on the line (edge == 0), bump it outside the line - vEdgeOut = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]); - return vEdgeOut; + vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]); +} + +////////////////////////////////////////////////////////////////////////// +/// @struct adjustEdgeConservative +/// @brief Primary template definition used for partially specializing +/// the adjustEdgeConservative function. This struct should never +/// be instantiated. +/// @tparam RT: rasterizer traits +/// @tparam IsConservativeT: is conservative rast enabled? +template +struct adjustEdgeConservative +{ + INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) = delete; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief adjustEdgeConservative specialization +/// of adjustEdgeConservative. Used for conservative rasterization specific +/// edge adjustments +template +struct adjustEdgeConservative +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs calculations to adjust each edge of a triangle away + /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y + /// direction. + /// + /// Uncertainty regions arise from fixed point rounding, which + /// can snap a vertex +/- by min fixed point value. + /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners. + /// This allows the rasterizer to test for coverage only at the pixel center, + /// instead of having to test individual pixel corners for conservative coverage + INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) + { + /// Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away + /// from the pixel center (in the direction of the edge normal A/B) + + /// edge = Ax + Bx + C - (manh/e) + /// manh = manhattan distance = abs(A) + abs(B) + /// e = absolute rounding error from snapping from float to fixed point precision + + /// 'fixed point' multiply (in double to be avx1 friendly) + /// need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example + __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi)); + __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(RT::ConservativeEdgeOffsetT::value)), + _mm256_mul_pd(vBai, _mm256_set1_pd(RT::ConservativeEdgeOffsetT::value))); + + static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value, + "Inadequate precision of result of manh calculation "); + + /// rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision + /// since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right + manh = _mm256_mul_pd(manh, _mm256_set1_pd(((RT::PrecisionT::BitsT::value + + RT::ConservativePrecisionT::BitsT::value) - + RT::EdgePrecisionT::BitsT::value) * 0.5)); + + /// move the edge away from the pixel center by the required conservative precision + 1/2 pixel + /// this allows the rasterizer to do a single conservative coverage test to see if the primitive + /// intersects the pixel at all + vEdge = _mm256_sub_pd(vEdge, manh); + }; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief adjustEdgeConservative specialization +/// of adjustEdgeConservative. Allows code to be generically called; when +/// IsConservativeT trait is disabled this inlines an empty function, which +/// should get optimized out. +template +struct adjustEdgeConservative +{ + INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge){}; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief Performs calculations to adjust each a scalar edge out +/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y +/// direction. +template +INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge) +{ + int32_t aabs = std::abs(static_cast(a)), babs = std::abs(static_cast(b)); + + int64_t manh = ((aabs * RT::ConservativeEdgeOffsetT::value) + (babs * RT::ConservativeEdgeOffsetT::value)) >> + ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) - RT::EdgePrecisionT::BitsT::value); + + static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value, + "Inadequate precision of result of manh calculation "); + + vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh)); +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief Perform any needed adjustments to evaluated triangle edges +template +INLINE void adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) +{ + static_assert(std::is_same>::value, + "Edge equation expected to be in x.16 fixed point"); + /// need to offset the edge before applying the top-left rule + adjustEdgeConservative(vAi, vBi, vEdge); + + adjustTopLeftRuleIntFix16(vAi, vBi, vEdge); } // max(abs(dz/dx), abs(dz,dy) @@ -409,7 +511,128 @@ void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge) ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge); } -template +////////////////////////////////////////////////////////////////////////// +/// @brief Primary template definition used for partially specializing +/// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel +/// corner to sample position, and test for coverage +/// @tparam sampleCount: multisample count +template +INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d (&vEdgeFix16)[7], + int32_t &mask0, int32_t &mask1, int32_t &mask2) +{ + __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2; + // evaluate edge equations at the tile multisample bounding box + vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]); + vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]); + vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]); + mask0 = _mm256_movemask_pd(vSampleBboxTest0); + mask1 = _mm256_movemask_pd(vSampleBboxTest1); + mask2 = _mm256_movemask_pd(vSampleBboxTest2); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief UpdateEdgeMasks partial specialization, +/// instantiated when MSAA is disabled. +template <> +INLINE void UpdateEdgeMasks(const __m256d(&)[3], const __m256d (&vEdgeFix16)[7], + int32_t &mask0, int32_t &mask1, int32_t &mask2) +{ + mask0 = _mm256_movemask_pd(vEdgeFix16[0]); + mask1 = _mm256_movemask_pd(vEdgeFix16[1]); + mask2 = _mm256_movemask_pd(vEdgeFix16[2]); +} + +////////////////////////////////////////////////////////////////////////// +/// @struct ComputeScissorEdges +/// @brief Primary template definition. Allows the function to be generically +/// called. When paired with below specializations, will result in an empty +/// inlined function if scissor is not enabled +/// @tparam RasterScissorEdgesT: is scissor enabled? +/// @tparam IsConservativeT: is conservative rast enabled? +/// @tparam RT: rasterizer traits +template +struct ComputeScissorEdges +{ + INLINE ComputeScissorEdges(const BBOX &triBBox, const BBOX &scissorBBox, const int32_t x, const int32_t y, + EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){}; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief ComputeScissorEdges partial +/// specialization. Instantiated when conservative rast and scissor are enabled +template +struct ComputeScissorEdges +{ + + ////////////////////////////////////////////////////////////////////////// + /// @brief Intersect tri bbox with scissor, compute scissor edge vectors, + /// evaluate edge equations and offset them away from pixel center. + INLINE ComputeScissorEdges(const BBOX &triBBox, const BBOX &scissorBBox, const int32_t x, const int32_t y, + EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]) + { + /// if conservative rasterizing, triangle bbox intersected with scissor bbox is used + BBOX scissor; + scissor.left = std::max(triBBox.left, scissorBBox.left); + scissor.right = std::min(triBBox.right, scissorBBox.right); + scissor.top = std::max(triBBox.top, scissorBBox.top); + scissor.bottom = std::min(triBBox.bottom, scissorBBox.bottom); + + POS topLeft{scissor.left, scissor.top}; + POS bottomLeft{scissor.left, scissor.bottom}; + POS topRight{scissor.right, scissor.top}; + POS bottomRight{scissor.right, scissor.bottom}; + + // construct 4 scissor edges in ccw direction + ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]); + ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]); + ComputeEdgeData(bottomRight, topRight, rastEdges[5]); + ComputeEdgeData(topRight, topLeft, rastEdges[6]); + + vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.left)) + (rastEdges[3].b * (y - scissor.top))); + vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.left)) + (rastEdges[4].b * (y - scissor.bottom))); + vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.right)) + (rastEdges[5].b * (y - scissor.bottom))); + vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.right)) + (rastEdges[6].b * (y - scissor.top))); + + /// if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing + adjustScissorEdge(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]); + adjustScissorEdge(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]); + adjustScissorEdge(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]); + adjustScissorEdge(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief ComputeScissorEdges partial +/// specialization. Instantiated when scissor is enabled and conservative rast +/// is disabled. +template +struct ComputeScissorEdges +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Compute scissor edge vectors and evaluate edge equations + INLINE ComputeScissorEdges(const BBOX &, const BBOX &scissorBBox, const int32_t x, const int32_t y, + EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]) + { + const BBOX &scissor = scissorBBox; + POS topLeft{scissor.left, scissor.top}; + POS bottomLeft{scissor.left, scissor.bottom}; + POS topRight{scissor.right, scissor.top}; + POS bottomRight{scissor.right, scissor.bottom}; + + // construct 4 scissor edges in ccw direction + ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]); + ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]); + ComputeEdgeData(bottomRight, topRight, rastEdges[5]); + ComputeEdgeData(topRight, topLeft, rastEdges[6]); + + vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.left)) + (rastEdges[3].b * (y - scissor.top))); + vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.left)) + (rastEdges[4].b * (y - scissor.bottom))); + vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.right)) + (rastEdges[5].b * (y - scissor.bottom))); + vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.right)) + (rastEdges[6].b * (y - scissor.top))); + } +}; + +template void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc) { const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc); @@ -439,6 +662,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12); // convert to fixed point + static_assert(std::is_same>::value, "Rasterizer expects 16.8 fixed point precision"); __m128i vXi = fpToFixedPoint(vX); __m128i vYi = fpToFixedPoint(vY); @@ -457,7 +681,8 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // determinant float det = calcDeterminantInt(vAi, vBi); - /// @todo: This test is flipped...we have a stray '-' sign somewhere + /// Verts in Pixel Coordinate Space at this point + /// Det > 0 = CW winding order // Convert CW triangles to CCW if (det > 0.0) { @@ -468,6 +693,8 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, det = -det; } + /// @todo: handle degenerates for ConservativeRast + __m128 vC; // Finish triangle setup - C edge coef triangleSetupC(vX, vY, vA, vB, vC); @@ -533,43 +760,16 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // add depth bias triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8); - // Compute edge data - OSALIGNSIMD(int32_t) aAi[4], aBi[4]; - _mm_store_si128((__m128i*)aAi, vAi); - _mm_store_si128((__m128i*)aBi, vBi); - - const uint32_t numEdges = 3 + (RasterizeScissorEdges ? 4 : 0); - EDGE rastEdges[7]; - - // compute triangle edges - ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]); - ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]); - ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]); - - // compute scissor edges if enabled - if (RasterizeScissorEdges) - { - POS topLeft{state.scissorInFixedPoint.left, state.scissorInFixedPoint.top}; - POS bottomLeft{state.scissorInFixedPoint.left, state.scissorInFixedPoint.bottom}; - POS topRight{state.scissorInFixedPoint.right, state.scissorInFixedPoint.top}; - POS bottomRight{state.scissorInFixedPoint.right, state.scissorInFixedPoint.bottom}; - - // construct 4 scissor edges in ccw direction - ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]); - ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]); - ComputeEdgeData(bottomRight, topRight, rastEdges[5]); - ComputeEdgeData(topRight, topLeft, rastEdges[6]); - } - // Calc bounding box of triangle OSALIGNSIMD(BBOX) bbox; calcBoundingBoxInt(vXi, vYi, bbox); // Intersect with scissor/viewport - bbox.left = std::max(bbox.left, state.scissorInFixedPoint.left); - bbox.right = std::min(bbox.right - 1, state.scissorInFixedPoint.right); - bbox.top = std::max(bbox.top, state.scissorInFixedPoint.top); - bbox.bottom = std::min(bbox.bottom - 1, state.scissorInFixedPoint.bottom); + OSALIGNSIMD(BBOX) intersect; + intersect.left = std::max(bbox.left, state.scissorInFixedPoint.left); + intersect.right = std::min(bbox.right - 1, state.scissorInFixedPoint.right); + intersect.top = std::max(bbox.top, state.scissorInFixedPoint.top); + intersect.bottom = std::min(bbox.bottom - 1, state.scissorInFixedPoint.bottom); triDesc.triFlags = workDesc.triFlags; @@ -581,11 +781,10 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1; - OSALIGNSIMD(BBOX) intersect; - intersect.left = std::max(bbox.left, macroBoxLeft); - intersect.top = std::max(bbox.top, macroBoxTop); - intersect.right = std::min(bbox.right, macroBoxRight); - intersect.bottom = std::min(bbox.bottom, macroBoxBottom); + intersect.left = std::max(intersect.left, macroBoxLeft); + intersect.top = std::max(intersect.top, macroBoxTop); + intersect.right = std::min(intersect.right, macroBoxRight); + intersect.bottom = std::min(intersect.bottom, macroBoxBottom); SWR_ASSERT(intersect.left <= intersect.right && intersect.top <= intersect.bottom && intersect.left >= 0 && intersect.right >= 0 && intersect.top >= 0 && intersect.bottom >= 0); @@ -613,7 +812,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, int32_t x = AlignDown(intersect.left, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM)); int32_t y = AlignDown(intersect.top, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM)); - if(sampleCount == SWR_MULTISAMPLE_1X) + if(RT::MT::sampleCount == SWR_MULTISAMPLE_1X) { // Add 0.5, in fixed point, to offset to pixel center x += (FIXED_POINT_SCALE / 2); @@ -624,9 +823,6 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, __m128i vTopLeftY = _mm_set1_epi32(y); // evaluate edge equations at top-left pixel using 64bit math - // all other evaluations will be 32bit steps from it - // small triangles could skip this and do all 32bit math - // edge 0 // // line = Ax + By + C // solving for C: @@ -634,18 +830,15 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // we know x0 and y0 are on the line; plug them in: // C = -Ax0 - By0 // plug C back into line equation: - // line = Ax - Bx - Ax0 - Bx1 + // line = Ax - By - Ax0 - By0 // line = A(x - x0) + B(y - y0) - // line = A(x0+dX) + B(y0+dY) + C = Ax0 + AdX + By0 + BdY + c = AdX + BdY + // dX = (x-x0), dY = (y-y0) + // so all this simplifies to + // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within - // edge 0 and 1 - // edge0 = A0(x - x0) + B0(y - y0) - // edge1 = A1(x - x1) + B1(y - y1) __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi); __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi); - __m256d vEdgeFix16[7]; - // evaluate A(dx) and B(dY) for all points __m256d vAipd = _mm256_cvtepi32_pd(vAi); __m256d vBipd = _mm256_cvtepi32_pd(vBi); @@ -656,28 +849,33 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd); __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16); - // adjust for top-left rule - vEdge = adjustTopLeftRuleIntFix16(vAi, vBi, vEdge); + // apply and edge adjustments(top-left, crast, etc) + adjustEdgesFix16(vAi, vBi, vEdge); // broadcast respective edge results to all lanes double* pEdge = (double*)&vEdge; + __m256d vEdgeFix16[7]; vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]); vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]); vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]); - // evaluate edge equations for scissor edges - if (RasterizeScissorEdges) - { - const BBOX &scissor = state.scissorInFixedPoint; - vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.left)) + (rastEdges[3].b * (y - scissor.top))); - vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.left)) + (rastEdges[4].b * (y - scissor.bottom))); - vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.right)) + (rastEdges[5].b * (y - scissor.bottom))); - vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.right)) + (rastEdges[6].b * (y - scissor.top))); - } + OSALIGNSIMD(int32_t) aAi[4], aBi[4]; + _mm_store_si128((__m128i*)aAi, vAi); + _mm_store_si128((__m128i*)aBi, vBi); + EDGE rastEdges[RT::NumEdgesT::value]; + + // Compute and store triangle edge data + ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]); + ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]); + ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]); + + // Compute and store triangle edge data if scissor needs to rasterized + ComputeScissorEdges + (bbox, state.scissorInFixedPoint, x, y, rastEdges, vEdgeFix16); // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile // used to for testing if entire raster tile is inside a triangle - for (uint32_t e = 0; e < numEdges; ++e) + for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) { vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets); } @@ -689,10 +887,10 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // | | // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples) __m256d vEdgeTileBbox[3]; - if (sampleCount > SWR_MULTISAMPLE_1X) + if (RT::MT::sampleCount > SWR_MULTISAMPLE_1X) { - __m128i vTileSampleBBoxXh = MultisampleTraits::TileSampleOffsetsX(); - __m128i vTileSampleBBoxYh = MultisampleTraits::TileSampleOffsetsY(); + __m128i vTileSampleBBoxXh = RT::MT::TileSampleOffsetsX(); + __m128i vTileSampleBBoxYh = RT::MT::TileSampleOffsetsY(); __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh); __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh); @@ -714,25 +912,15 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, uint32_t maxY = maxTileY; uint32_t maxX = maxTileX; - // compute steps between raster tiles for render output buffers - static const uint32_t colorRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8)) * MultisampleTraits::numSamples}; - static const uint32_t colorRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * colorRasterTileStep}; - static const uint32_t depthRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8)) * MultisampleTraits::numSamples}; - static const uint32_t depthRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM)* depthRasterTileStep}; - static const uint32_t stencilRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8)) * MultisampleTraits::numSamples}; - static const uint32_t stencilRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * stencilRasterTileStep}; RenderOutputBuffers renderBuffers, currentRenderBufferRow; - - GetRenderHotTiles(pDC, macroTile, tileX, tileY, renderBuffers, MultisampleTraits::numSamples, - triDesc.triFlags.renderTargetArrayIndex); + GetRenderHotTiles(pDC, macroTile, tileX, tileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex); currentRenderBufferRow = renderBuffers; // rasterize and generate coverage masks per sample - uint32_t maxSamples = MultisampleTraits::numSamples; for (uint32_t tileY = tY; tileY <= maxY; ++tileY) { - __m256d vStartOfRowEdge[numEdges]; - for (uint32_t e = 0; e < numEdges; ++e) + __m256d vStartOfRowEdge[RT::NumEdgesT::value]; + for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) { vStartOfRowEdge[e] = vEdgeFix16[e]; } @@ -743,25 +931,9 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // is the corner of the edge outside of the raster tile? (vEdge < 0) int mask0, mask1, mask2; - if (sampleCount == SWR_MULTISAMPLE_1X) - { - mask0 = _mm256_movemask_pd(vEdgeFix16[0]); - mask1 = _mm256_movemask_pd(vEdgeFix16[1]); - mask2 = _mm256_movemask_pd(vEdgeFix16[2]); - } - else - { - __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2; - // evaluate edge equations at the tile multisample bounding box - vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]); - vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]); - vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]); - mask0 = _mm256_movemask_pd(vSampleBboxTest0); - mask1 = _mm256_movemask_pd(vSampleBboxTest1); - mask2 = _mm256_movemask_pd(vSampleBboxTest2); - } + UpdateEdgeMasks(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2); - for (uint32_t sampleNum = 0; sampleNum < maxSamples; sampleNum++) + for (uint32_t sampleNum = 0; sampleNum < RT::MT::numSamples; sampleNum++) { // trivial reject, at least one edge has all 4 corners of raster tile outside bool trivialReject = (!(mask0 && mask1 && mask2)) ? true : false; @@ -779,27 +951,24 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, } else { - __m256d vEdgeAtSample[numEdges]; - if(sampleCount == SWR_MULTISAMPLE_1X) + __m256d vEdgeAtSample[RT::NumEdgesT::value]; + if(RT::MT::sampleCount == SWR_MULTISAMPLE_1X) { // should get optimized out for single sample case (global value numbering or copy propagation) - for (uint32_t e = 0; e < numEdges; ++e) + for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) { vEdgeAtSample[e] = vEdgeFix16[e]; } } else { - __m128i vSampleOffsetXh = MultisampleTraits::vXi(sampleNum); - __m128i vSampleOffsetYh = MultisampleTraits::vYi(sampleNum); + __m128i vSampleOffsetXh = RT::MT::vXi(sampleNum); + __m128i vSampleOffsetYh = RT::MT::vYi(sampleNum); __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh); __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh); - // *note*: none of this needs to be vectorized as rasterizePartialTile just takes vEdge[0] - // for each edge and broadcasts it before offsetting to individual pixel quads - // step edge equation tests from UL tile corner to pixel sample position - for (uint32_t e = 0; e < numEdges; ++e) + for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) { __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX); __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY); @@ -808,23 +977,16 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, } } - double startQuadEdges[numEdges]; + double startQuadEdges[RT::NumEdgesT::value]; const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); - for (uint32_t e = 0; e < numEdges; ++e) + for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) { _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]); } // not trivial accept or reject, must rasterize full tile RDTSC_START(BERasterizePartial); - if (RasterizeScissorEdges) - { - triDesc.coverageMask[sampleNum] = rasterizePartialTile<7>(pDC, startQuadEdges, rastEdges); - } - else - { - triDesc.coverageMask[sampleNum] = rasterizePartialTile<3>(pDC, startQuadEdges, rastEdges); - } + triDesc.coverageMask[sampleNum] = rasterizePartialTile(pDC, startQuadEdges, rastEdges); RDTSC_STOP(BERasterizePartial, 0, 0); triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum]; @@ -833,7 +995,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, else { // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything - if(sampleCount > SWR_MULTISAMPLE_1X) + if(RT::MT::sampleCount > SWR_MULTISAMPLE_1X) { triDesc.coverageMask[sampleNum] = 0; } @@ -856,19 +1018,19 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, } // step to the next tile in X - for (uint32_t e = 0; e < numEdges; ++e) + for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) { vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX)); } - StepRasterTileX(state.psState.numRenderTargets, renderBuffers, colorRasterTileStep, depthRasterTileStep, stencilRasterTileStep); + StepRasterTileX(state.psState.numRenderTargets, renderBuffers); } // step to the next tile in Y - for (uint32_t e = 0; e < numEdges; ++e) + for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e) { vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY)); } - StepRasterTileY(state.psState.numRenderTargets, renderBuffers, currentRenderBufferRow, colorRasterTileRowStep, depthRasterTileRowStep, stencilRasterTileRowStep); + StepRasterTileY(state.psState.numRenderTargets, renderBuffers, currentRenderBufferRow); } RDTSC_STOP(BERasterizeTriangle, 1, 0); @@ -922,16 +1084,11 @@ void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, // setup triangle rasterizer function PFN_WORK_FUNC pfnTriRast; - if (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) - { - pfnTriRast = gRasterizerTable[rastState.scissorEnable][rastState.sampleCount]; - } - else - { - // for center sample pattern, all samples are at pixel center; calculate coverage - // once at center and broadcast the results in the backend - pfnTriRast = gRasterizerTable[rastState.scissorEnable][SWR_MULTISAMPLE_1X]; - } + // for center sample pattern, all samples are at pixel center; calculate coverage + // once at center and broadcast the results in the backend + uint32_t sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X; + // conservative rast not supported for points/lines + pfnTriRast = GetRasterizerFunc(sampleCount, false, SWR_INPUT_COVERAGE_NONE, (rastState.scissorEnable > 0)); // overwrite texcoords for point sprites if (isPointSpriteTexCoordEnabled) @@ -1064,7 +1221,7 @@ void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTi RenderOutputBuffers renderBuffers; GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT, - renderBuffers, 1, triDesc.triFlags.renderTargetArrayIndex); + renderBuffers, triDesc.triFlags.renderTargetArrayIndex); RDTSC_START(BEPixelBackend); backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers); @@ -1072,8 +1229,8 @@ void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTi } // Get pointers to hot tile memory for color RT, depth, stencil -void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, - uint32_t numSamples, uint32_t renderTargetArrayIndex) +template +void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex) { const API_STATE& state = GetApiState(pDC); SWR_CONTEXT *pContext = pDC->pContext; @@ -1123,52 +1280,33 @@ void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint } } -INLINE -void StepRasterTileX(uint32_t NumRT, RenderOutputBuffers &buffers, uint32_t colorTileStep, uint32_t depthTileStep, uint32_t stencilTileStep) +template +INLINE void StepRasterTileX(uint32_t NumRT, RenderOutputBuffers &buffers) { for(uint32_t rt = 0; rt < NumRT; ++rt) { - buffers.pColor[rt] += colorTileStep; + buffers.pColor[rt] += RT::colorRasterTileStep; } - buffers.pDepth += depthTileStep; - buffers.pStencil += stencilTileStep; + buffers.pDepth += RT::depthRasterTileStep; + buffers.pStencil += RT::stencilRasterTileStep; } -INLINE -void StepRasterTileY(uint32_t NumRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow, uint32_t colorRowStep, uint32_t depthRowStep, uint32_t stencilRowStep) +template +INLINE void StepRasterTileY(uint32_t NumRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow) { for(uint32_t rt = 0; rt < NumRT; ++rt) { - startBufferRow.pColor[rt] += colorRowStep; + startBufferRow.pColor[rt] += RT::colorRasterTileRowStep; buffers.pColor[rt] = startBufferRow.pColor[rt]; } - startBufferRow.pDepth += depthRowStep; + startBufferRow.pDepth += RT::depthRasterTileRowStep; buffers.pDepth = startBufferRow.pDepth; - startBufferRow.pStencil += stencilRowStep; + startBufferRow.pStencil += RT::stencilRasterTileRowStep; buffers.pStencil = startBufferRow.pStencil; } -// initialize rasterizer function table -PFN_WORK_FUNC gRasterizerTable[2][SWR_MULTISAMPLE_TYPE_MAX] = -{ - { - RasterizeTriangle, - RasterizeTriangle, - RasterizeTriangle, - RasterizeTriangle, - RasterizeTriangle - }, - { - RasterizeTriangle, - RasterizeTriangle, - RasterizeTriangle, - RasterizeTriangle, - RasterizeTriangle - } -}; - void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) { const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pData); @@ -1274,6 +1412,12 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi } } + // setup triangle rasterizer function + PFN_WORK_FUNC pfnTriRast; + uint32_t sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X; + // conservative rast not supported for points/lines + pfnTriRast = GetRasterizerFunc(sampleCount, false, SWR_INPUT_COVERAGE_NONE, (rastState.scissorEnable > 0)); + // make sure this macrotile intersects the triangle __m128i vXai = fpToFixedPoint(vXa); __m128i vYai = fpToFixedPoint(vYa); @@ -1289,7 +1433,7 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi bboxA.bottom - 1 < macroBoxTop || bboxA.bottom - 1 < state.scissorInFixedPoint.top)) { // rasterize triangle - gRasterizerTable[rastState.scissorEnable][rastState.sampleCount](pDC, workerId, macroTile, (void*)&newWorkDesc); + pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); } // triangle 1 @@ -1362,9 +1506,30 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi bboxA.bottom - 1 < macroBoxTop || bboxA.bottom - 1 < state.scissorInFixedPoint.top)) { // rasterize triangle - gRasterizerTable[rastState.scissorEnable][rastState.sampleCount](pDC, workerId, macroTile, (void*)&newWorkDesc); + pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); } RDTSC_STOP(BERasterizeLine, 1, 0); } +struct RasterizerChooser +{ + typedef PFN_WORK_FUNC FuncType; + + template + static FuncType GetFunc() + { + return RasterizeTriangle>; + } +}; + +// Selector for correct templated RasterizeTriangle function +PFN_WORK_FUNC GetRasterizerFunc( + uint32_t numSamples, + bool IsConservative, + uint32_t InputCoverage, + bool RasterizeScissorEdges +) +{ + return TemplateArgUnroller::GetFunc(numSamples, IsConservative, InputCoverage, RasterizeScissorEdges); +} diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h index d3faf2aa6c9..a400780f85d 100644 --- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h @@ -29,8 +29,9 @@ #include "context.h" #include +#include "conservativeRast.h" +#include "multisample.h" -extern PFN_WORK_FUNC gRasterizerTable[2][SWR_MULTISAMPLE_TYPE_MAX]; void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); @@ -40,4 +41,70 @@ __m128i fpToFixedPoint(const __m128 vIn) { __m128 vFixed = _mm_mul_ps(vIn, _mm_set1_ps(FIXED_POINT_SCALE)); return _mm_cvtps_epi32(vFixed); -} \ No newline at end of file +} + +// Selector for correct templated RasterizeTriangle function +PFN_WORK_FUNC GetRasterizerFunc( + uint32_t numSamples, + bool IsConservative, + uint32_t InputCoverage, + bool RasterizeScissorEdges); + +////////////////////////////////////////////////////////////////////////// +/// @struct RasterScissorEdgesT +/// @brief Primary RasterScissorEdgesT templated struct that holds compile +/// time information about the number of edges needed to be rasterized, +/// If either the scissor rect or conservative rast is enabled, +/// the scissor test is enabled and the rasterizer will test +/// 3 triangle edges + 4 scissor edges for coverage. +/// @tparam RasterScissorEdgesT: number of multisamples +/// @tparam ConservativeT: is this a conservative rasterization +template +struct RasterEdgeTraits +{ + typedef std::true_type RasterizeScissorEdgesT; + typedef std::integral_constant NumEdgesT; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief specialization of RasterEdgeTraits. If neither scissor rect +/// nor conservative rast is enabled, only test 3 triangle edges +/// for coverage +template <> +struct RasterEdgeTraits +{ + typedef std::false_type RasterizeScissorEdgesT; + typedef std::integral_constant NumEdgesT; +}; + +////////////////////////////////////////////////////////////////////////// +/// @struct RasterizerTraits +/// @brief templated struct that holds compile time information used +/// during rasterization. Inherits EdgeTraits and ConservativeRastBETraits. +/// @tparam NumSamplesT: number of multisamples +/// @tparam ConservativeT: is this a conservative rasterization +/// @tparam InputCoverageT: what type of input coverage is the PS expecting? +/// (only used with conservative rasterization) +/// @tparam RasterScissorEdgesT: do we need to rasterize with a scissor? +template +struct RasterizerTraits final : public ConservativeRastBETraits, + public RasterEdgeTraits +{ + typedef MultisampleTraits(NumSamplesT::value)> MT; + + /// Fixed point precision the rasterizer is using + typedef FixedPointTraits PrecisionT; + /// Fixed point precision of the edge tests used during rasterization + typedef FixedPointTraits EdgePrecisionT; + + static_assert(EdgePrecisionT::BitsT::value >= ConservativeRastBETraits::ConservativePrecisionT::BitsT::value, + "Rasterizer edge fixed point precision < required conservative rast precision"); + + /// constants used to offset between different types of raster tiles + static const int colorRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8)) * MT::numSamples}; + static const int depthRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8)) * MT::numSamples}; + static const int stencilRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8)) * MT::numSamples}; + static const int colorRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * colorRasterTileStep}; + static const int depthRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM)* depthRasterTileStep}; + static const int stencilRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * stencilRasterTileStep}; +}; diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index dc6cb3ca43a..9fc304a8c3f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -1022,7 +1022,7 @@ struct SWR_PS_STATE // dword 2 uint32_t killsPixel : 1; // pixel shader can kill pixels - uint32_t inputCoverage : 1; // ps uses input coverage + uint32_t inputCoverage : 2; // ps uses input coverage uint32_t writesODepth : 1; // pixel shader writes to depth uint32_t usesSourceDepth : 1; // pixel shader reads depth uint32_t shadingRate : 2; // shading per pixel / sample / coarse pixel -- 2.30.2