#include <algorithm>
#include "rasterizer.h"
+#include "backends/gen_rasterizer.hpp"
#include "rdtsc_core.h"
#include "backend.h"
#include "utils.h"
#include "frontend.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
+#include "rasterizer_impl.h"
-template <uint32_t numSamples = 1>
-void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex);
-template <typename RT>
-void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers);
-template <typename RT>
-void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow);
-
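-// gMaskToVecpd maps a 4-bit movemask result to a per-lane __m256d blend mask:
-// each lane is -1.0 (sign bit set) when the corresponding mask bit is 1, and 0.0 otherwise,
-// suitable for use with _mm256_blendv_pd.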
-#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
-const __m256d gMaskToVecpd[] =
-{
- MASKTOVEC(0, 0, 0, 0),
- MASKTOVEC(0, 0, 0, 1),
- MASKTOVEC(0, 0, 1, 0),
- MASKTOVEC(0, 0, 1, 1),
- MASKTOVEC(0, 1, 0, 0),
- MASKTOVEC(0, 1, 0, 1),
- MASKTOVEC(0, 1, 1, 0),
- MASKTOVEC(0, 1, 1, 1),
- MASKTOVEC(1, 0, 0, 0),
- MASKTOVEC(1, 0, 0, 1),
- MASKTOVEC(1, 0, 1, 0),
- MASKTOVEC(1, 0, 1, 1),
- MASKTOVEC(1, 1, 0, 0),
- MASKTOVEC(1, 1, 0, 1),
- MASKTOVEC(1, 1, 1, 0),
- MASKTOVEC(1, 1, 1, 1),
-};
-
-struct POS
-{
- int32_t x, y;
-};
-
-struct EDGE
-{
- double a, b; // a, b edge coefficients in fix8
- double stepQuadX; // step to adjacent horizontal quad in fix16
- double stepQuadY; // step to adjacent vertical quad in fix16
- double stepRasterTileX; // step to adjacent horizontal raster tile in fix16
- double stepRasterTileY; // step to adjacent vertical raster tile in fix16
-
- __m256d vQuadOffsets; // offsets for 4 samples of a quad
- __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief rasterize a raster tile partially covered by the triangle
-/// @param startEdges - edge equations evaluated at the sample position of the upper-left pixel of the raster tile
-/// @param pRastEdges - A & B coefs for each edge of the triangle (Ax + By + C), plus the precomputed
-///                     quad/raster-tile step offsets used to step between quads when sweeping over the raster tile.
-template<uint32_t NumEdges, typename EdgeMaskT>
-INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges)
-{
- uint64_t coverageMask = 0;
-
- __m256d vEdges[NumEdges];
- __m256d vStepX[NumEdges];
- __m256d vStepY[NumEdges];
-
- for (uint32_t e = 0; e < NumEdges; ++e)
- {
- // Step to the pixel sample locations of the 1st quad
- vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);
-
- // compute step to next quad (mul by 2 in x and y direction)
- vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
- vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
- }
-
- // fast unrolled version for 8x8 tile
-#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
- int edgeMask[NumEdges];
- uint64_t mask;
-
- auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);};
- auto update_lambda = [&](int e){mask &= edgeMask[e];};
- auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);};
- auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);};
- auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);};
-
-// evaluate which pixels in the quad are covered
-#define EVAL \
- UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
-
- // update coverage mask
- // if edge 0 is degenerate and will be skipped, init the mask to full quad coverage
-#define UPDATE_MASK(bit) \
- if(std::is_same<EdgeMaskT, E1E2ValidT>::value || std::is_same<EdgeMaskT, NoEdgesValidT>::value){\
- mask = 0xf;\
- }\
- else{\
- mask = edgeMask[0]; \
- }\
- UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
- coverageMask |= (mask << bit);
-
- // step in the +x direction to the next quad
-#define INCX \
- UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
-
- // step in the +y direction to the next quad
-#define INCY \
- UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
-
- // step in the -x direction to the next quad
-#define DECX \
- UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
-
- // sweep 2x2 quad back and forth through the raster tile,
- // computing coverage masks for the entire tile
-
- // raster tile
- // 0 1 2 3 4 5 6 7
- // x x
- // x x ------------------>
- // x x |
- // <-----------------x x V
- // ..
-
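- // UPDATE_MASK(bit) ORs the 4-bit quad mask into the 64-bit coverage mask at a fixed
- // per-quad bit offset, so the mask is laid out as sixteen 2x2 pixel quads (4 bits each)
- // regardless of the serpentine traversal order below.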
- // row 0
- EVAL;
- UPDATE_MASK(0);
- INCX;
- EVAL;
- UPDATE_MASK(4);
- INCX;
- EVAL;
- UPDATE_MASK(8);
- INCX;
- EVAL;
- UPDATE_MASK(12);
- INCY;
-
- //row 1
- EVAL;
- UPDATE_MASK(28);
- DECX;
- EVAL;
- UPDATE_MASK(24);
- DECX;
- EVAL;
- UPDATE_MASK(20);
- DECX;
- EVAL;
- UPDATE_MASK(16);
- INCY;
-
- // row 2
- EVAL;
- UPDATE_MASK(32);
- INCX;
- EVAL;
- UPDATE_MASK(36);
- INCX;
- EVAL;
- UPDATE_MASK(40);
- INCX;
- EVAL;
- UPDATE_MASK(44);
- INCY;
-
- // row 3
- EVAL;
- UPDATE_MASK(60);
- DECX;
- EVAL;
- UPDATE_MASK(56);
- DECX;
- EVAL;
- UPDATE_MASK(52);
- DECX;
- EVAL;
- UPDATE_MASK(48);
-#else
- uint32_t bit = 0;
- for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y)
- {
- __m256d vStartOfRowEdge[NumEdges];
- for (uint32_t e = 0; e < NumEdges; ++e)
- {
- vStartOfRowEdge[e] = vEdges[e];
- }
-
- for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x)
- {
- int edgeMask[NumEdges];
- for (uint32_t e = 0; e < NumEdges; ++e)
- {
- edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
- }
-
- uint64_t mask = edgeMask[0];
- for (uint32_t e = 1; e < NumEdges; ++e)
- {
- mask &= edgeMask[e];
- }
- coverageMask |= (mask << bit);
-
- // step to the next quad in the x direction
- for (uint32_t e = 0; e < NumEdges; ++e)
- {
- vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
- }
- bit+=4;
- }
-
- // step to the next row
- for (uint32_t e = 0; e < NumEdges; ++e)
- {
- vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
- }
- }
-#endif
- return coverageMask;
-
-}
-// Top left rule:
-// Top: if an edge is horizontal, and it is above the other edges in tri pixel space, it is a 'top' edge
-// Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge
-// Top left: a sample lying exactly on an edge is covered only if that edge is a top or left edge.
-// Out: !(horizontal && above) = !horizontal && below
-// Out: !(!horizontal && left) = horizontal && right
-INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge)
-{
- // if vA < 0, vC--
- // if vA == 0 && vB < 0, vC--
-
- __m256d vEdgeOut = vEdge;
- __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));
-
- // if vA < 0 (line is not horizontal and below)
- int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));
-
- // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
- __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
- int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
- msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));
-
- // if either of these are true and we're on the line (edge == 0), bump it outside the line
- vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief calculates difference in precision between the result of manh
-/// calculation and the edge precision, based on compile time trait values
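-/// e.g. with the rasterizer's 16.8 vertex precision, a hypothetical 9-bit conservative
-/// offset precision, and x.16 edge precision, the adjustment is (8 + 9) - 16 = 1 bit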
-template<typename RT>
-constexpr int64_t ManhToEdgePrecisionAdjust()
-{
- static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
- "Inadequate precision of result of manh calculation ");
- return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) - RT::EdgePrecisionT::BitsT::value);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @struct adjustEdgeConservative
-/// @brief Primary template, instantiated whenever a non-zero conservative
-/// edge offset is required. The zero-offset specialization below is a no-op.
-/// @tparam RT: rasterizer traits
-/// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
-template <typename RT, typename ConservativeEdgeOffsetT>
-struct adjustEdgeConservative
-{
- //////////////////////////////////////////////////////////////////////////
- /// @brief Performs calculations to adjust each edge of a triangle away
- /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
- /// direction.
- ///
- /// Uncertainty regions arise from fixed point rounding, which
- /// can snap a vertex +/- by min fixed point value.
- /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
- /// This allows the rasterizer to test for coverage only at the pixel center,
- /// instead of having to test individual pixel corners for conservative coverage
- INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
- {
- // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away
- // from the pixel center (in the direction of the edge normal A/B)
-
- // edge = Ax + By + C - (manh/e)
- // manh = manhattan distance = abs(A) + abs(B)
- // e = absolute rounding error from snapping from float to fixed point precision
-
- // 'fixed point' multiply (in double to be avx1 friendly)
- // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
- __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
- __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
- _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));
-
- static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
- "Inadequate precision of result of manh calculation ");
-
- // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision
- // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right
- manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5));
-
- // move the edge away from the pixel center by the required conservative precision + 1/2 pixel
- // this allows the rasterizer to do a single conservative coverage test to see if the primitive
- // intersects the pixel at all
- vEdge = _mm256_sub_pd(vEdge, manh);
- };
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief adjustEdgeConservative specialization where no edge offset is needed
-template <typename RT>
-struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>>
-{
- INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {};
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief calculates the distance a degenerate BBox needs to be adjusted
-/// for conservative rast based on compile time trait values
-template<typename RT>
-constexpr int64_t ConservativeScissorOffset()
-{
- static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, "Rasterizer precision > conservative precision");
- // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges
- typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1> DegenerateEdgeOffsetT;
- // 1/2 pixel edge offset + conservative offset - degenerateTriangle
- return RT::ConservativeEdgeOffsetT::value - (DegenerateEdgeOffsetT::value << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Performs calculations to adjust a vector of evaluated edges out
-/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
-/// direction.
-template <typename RT>
-INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge)
-{
- int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
- int64_t manh = ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> ManhToEdgePrecisionAdjust<RT>();
- vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh));
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Performs calculations to adjust a scalar evaluated edge out
-/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
-/// direction.
-template <typename RT, typename OffsetT>
-INLINE double adjustScalarEdge(const double a, const double b, const double Edge)
-{
- int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
- int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>();
- return (Edge - manh);
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Perform any needed adjustments to evaluated triangle edges
-template <typename RT, typename EdgeOffsetT>
-struct adjustEdgesFix16
-{
- INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
- {
- static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
- "Edge equation expected to be in x.16 fixed point");
-
- static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled");
-
- // need to apply any edge offsets before applying the top-left rule
- adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge);
-
- adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
- }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Perform top left adjustments to evaluated triangle edges
-template <typename RT>
-struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>>
-{
- INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
- {
- adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
- }
-};
-
-// max(abs(dz/dx), abs(dz/dy))
-INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
-{
- /*
- // evaluate i,j at (0,0)
- float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
- float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
-
- // evaluate i,j at (1,0)
- float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
- float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
-
- // compute dz/dx
- float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
- float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
- float dzdx = abs(d10 - d00);
-
- // evaluate i,j at (0,1)
- float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
- float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
-
- float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
- float dzdy = abs(d01 - d00);
- */
-
- // optimized version of above
- float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0]));
- float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1]));
-
- return std::max(dzdx, dzdy);
-}
-
-INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
-{
- if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
- {
- return (1.0f / (1 << 24));
- }
- else if (pState->depthFormat == R16_UNORM)
- {
- return (1.0f / (1 << 16));
- }
- else
- {
- SWR_ASSERT(pState->depthFormat == R32_FLOAT);
-
- // for f32 depth, factor = 2^(exponent(max(abs(z))) - 23)
- float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
- uint32_t zMaxInt = *(uint32_t*)&zMax;
- zMaxInt &= 0x7f800000;
- zMax = *(float*)&zMaxInt;
-
- return zMax * (1.0f / (1 << 23));
- }
-}
-
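-// Computes a D3D-style depth bias:
-//   bias = depthBias * r + slopeScaledDepthBias * MaxDepthSlope, clamped by depthBiasClamp,
-// where r is the minimum resolvable difference of the depth format (omitted when the bias
-// was pre-adjusted by the driver).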
-INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
-{
- if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0)
- {
- return 0.0f;
- }
-
- float scale = pState->slopeScaledDepthBias;
- if (scale != 0.0f)
- {
- scale *= ComputeMaxDepthSlope(pTri);
- }
-
- float bias = pState->depthBias;
- if (!pState->depthBiasPreAdjusted)
- {
- bias *= ComputeBiasFactor(pState, pTri, z);
- }
- bias += scale;
-
- if (pState->depthBiasClamp > 0.0f)
- {
- bias = std::min(bias, pState->depthBiasClamp);
- }
- else if (pState->depthBiasClamp < 0.0f)
- {
- bias = std::max(bias, pState->depthBiasClamp);
- }
-
- return bias;
-}
-
-// Prevent DCE by writing coverage mask from rasterizer to volatile
-#if KNOB_ENABLE_TOSS_POINTS
-__declspec(thread) volatile uint64_t gToss;
-#endif
-
-static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
-// try to avoid _chkstk insertions; make this thread local
-static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib];
-
-INLINE
-void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
-{
- edge.a = a;
- edge.b = b;
-
- // compute constant steps to adjacent quads
- edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE));
- edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE));
-
- // compute constant steps to adjacent raster tiles
- edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE));
- edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE));
-
- // compute quad offsets
- const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
- const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);
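- // _mm256_set_pd lists lanes from 3 down to 0, so lanes 0..3 map to the quad pixels (0,0), (1,0), (0,1), (1,1)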
-
- __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8);
- __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8);
- edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16);
-
- // compute raster tile offsets
- const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0);
- const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0);
-
- __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8);
- __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8);
- edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16);
-}
-
-INLINE
-void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
-{
- ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Primary template definition of UpdateEdgeMasks. Offsets the evaluated
-/// edges from the UL pixel corner to the sample positions and tests for coverage
-/// @tparam NumSamplesT: multisample count
-template <typename NumSamplesT>
-INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
- int32_t &mask0, int32_t &mask1, int32_t &mask2)
-{
- __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
- // evaluate edge equations at the tile multisample bounding box
- vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
- vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
- vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
- mask0 = _mm256_movemask_pd(vSampleBboxTest0);
- mask1 = _mm256_movemask_pd(vSampleBboxTest1);
- mask2 = _mm256_movemask_pd(vSampleBboxTest2);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
-/// when only rasterizing a single coverage test point
-template <>
-INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* vEdgeFix16,
- int32_t &mask0, int32_t &mask1, int32_t &mask2)
-{
- mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
- mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
- mask2 = _mm256_movemask_pd(vEdgeFix16[2]);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @struct ComputeScissorEdges
-/// @brief Primary template definition. Allows the function to be generically
-/// called. When paired with below specializations, will result in an empty
-/// inlined function if scissor is not enabled
-/// @tparam RasterScissorEdgesT: is scissor enabled?
-/// @tparam IsConservativeT: is conservative rast enabled?
-/// @tparam RT: rasterizer traits
-template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT>
-struct ComputeScissorEdges
-{
- INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
- EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){};
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
-/// specialization. Instantiated when conservative rast and scissor are enabled
-template <typename RT>
-struct ComputeScissorEdges<std::true_type, std::true_type, RT>
-{
- //////////////////////////////////////////////////////////////////////////
- /// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
- /// evaluate edge equations and offset them away from pixel center.
- INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
- EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
- {
- // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
- SWR_RECT scissor;
- scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin);
- scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax);
- scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin);
- scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax);
-
- POS topLeft{scissor.xmin, scissor.ymin};
- POS bottomLeft{scissor.xmin, scissor.ymax};
- POS topRight{scissor.xmax, scissor.ymin};
- POS bottomRight{scissor.xmax, scissor.ymax};
-
- // construct 4 scissor edges in ccw direction
- ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
- ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
- ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
- ComputeEdgeData(topRight, topLeft, rastEdges[6]);
-
- vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
- vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
- vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
- vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
-
- // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing
- adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]);
- adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]);
- adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]);
- adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]);
-
- // Upper left rule for scissor
- vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
- vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
- }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
-/// specialization. Instantiated when scissor is enabled and conservative rast
-/// is disabled.
-template <typename RT>
-struct ComputeScissorEdges<std::true_type, std::false_type, RT>
-{
- //////////////////////////////////////////////////////////////////////////
- /// @brief Compute scissor edge vectors and evaluate edge equations
- INLINE ComputeScissorEdges(const SWR_RECT &, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
- EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
- {
- const SWR_RECT &scissor = scissorBBox;
- POS topLeft{scissor.xmin, scissor.ymin};
- POS bottomLeft{scissor.xmin, scissor.ymax};
- POS topRight{scissor.xmax, scissor.ymin};
- POS bottomRight{scissor.xmax, scissor.ymax};
-
- // construct 4 scissor edges in ccw direction
- ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
- ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
- ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
- ComputeEdgeData(topRight, topLeft, rastEdges[6]);
-
- vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
- vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
- vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
- vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
-
- // Upper left rule for scissor
- vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
- vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
- }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Primary function template for TrivialRejectTest. Should
-/// never be called, but TemplateUnroller instantiates a few unused values,
-/// so it calls a runtime assert instead of a static_assert.
-template <typename ValidEdgeMaskT>
-INLINE bool TrivialRejectTest(const int, const int, const int)
-{
- SWR_INVALID("Primary templated function should never be called");
- return false;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
-/// and edge 1 for trivial coverage reject
-template <>
-INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int)
-{
- return (!(mask0 && mask1)) ? true : false;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
-/// and edge 2 for trivial coverage reject
-template <>
-INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2)
-{
- return (!(mask0 && mask2)) ? true : false;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
-/// and edge 2 for trivial coverage reject
-template <>
-INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2)
-{
- return (!(mask1 && mask2)) ? true : false;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
-/// primitive edges for trivial coverage reject
-template <>
-INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
-{
- return (!(mask0 && mask1 && mask2)) ? true : false;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
-/// point, so return false and rasterize against conservative BBox
-template <>
-INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int)
-{
- return false;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Primary function template for TrivialAcceptTest. Always returns
-/// false; it is instantiated when scissor edges must be rasterized, in which
-/// case the triangle edge masks alone cannot prove the raster tile is fully covered
-template <typename ScissorEnableT>
-INLINE bool TrivialAcceptTest(const int, const int, const int)
-{
- return false;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief TrivialAcceptTest specialization used when scissor edges don't need
-/// rasterizing. Tests all triangle edge masks for a fully covered raster tile
-template <>
-INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2)
-{
- return ((mask0 & mask1 & mask2) == 0xf);
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Primary function template for GenerateSVInnerCoverage. Results
-/// in an empty function call if SVInnerCoverage isn't requested
-template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
-struct GenerateSVInnerCoverage
-{
- INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*, uint64_t &){};
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Specialization of GenerateSVInnerCoverage where all edges
-/// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated
-/// edge values from OuterConservative to InnerConservative and rasterizes.
-template <typename RT>
-struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
-{
- INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, uint32_t workerId, EDGE* pRastEdges, double* pStartQuadEdges, uint64_t &innerCoverageMask)
- {
- SWR_CONTEXT *pContext = pDC->pContext;
-
- double startQuadEdgesAdj[RT::NumEdgesT::value];
- for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
- {
- startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
- }
-
- // not trivial accept or reject, must rasterize full tile
- AR_BEGIN(BERasterizePartial, pDC->drawId);
- innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges);
- AR_END(BERasterizePartial, 0);
- }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
-/// in an empty function call if SVInnerCoverage isn't requested
-template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
-struct UpdateEdgeMasksInnerConservative
-{
- INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*,
- const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){};
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
-/// are non-degenerate and SVInnerCoverage is requested. Offsets the edges
-/// evaluated at raster tile corners to inner conservative position and
-/// updates edge masks
-template <typename RT>
-struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
-{
- INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
- const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2)
- {
- __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]};
-
- // instead of keeping 2 copies of evaluated edges around, just compensate for the outer
- // conservative evaluated edge when adjusting the edge in for inner conservative tests
- adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[0]);
- adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[1]);
- adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[2]);
-
- UpdateEdgeMasks<typename RT::NumCoverageSamplesT>(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
- }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage
-/// is requested but at least one edge is degenerate. Since a degenerate triangle cannot
-/// cover an entire raster tile, set mask0 to 0 to force it down the
-/// rasterizePartialTile path
-template <typename RT, typename ValidEdgeMaskT>
-struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
-{
- INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*,
- const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &)
- {
- // set one mask to zero to force the triangle down the rasterizePartialTile path
- mask0 = 0;
- }
-};
-
-template <typename RT>
-void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
-{
- SWR_CONTEXT *pContext = pDC->pContext;
- const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
-#if KNOB_ENABLE_TOSS_POINTS
- if (KNOB_TOSS_BIN_TRIS)
- {
- return;
- }
-#endif
- AR_BEGIN(BERasterizeTriangle, pDC->drawId);
- AR_BEGIN(BETriangleSetup, pDC->drawId);
-
- const API_STATE &state = GetApiState(pDC);
- const SWR_RASTSTATE &rastState = state.rastState;
- const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
-
- OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
- triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
-
- __m128 vX, vY, vZ, vRecipW;
-
- // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
- // eg: vX = [x0 x1 x2 dc]
- vX = _mm_load_ps(workDesc.pTriBuffer);
- vY = _mm_load_ps(workDesc.pTriBuffer + 4);
- vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
- vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
-
- // convert to fixed point
- static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, "Rasterizer expects 16.8 fixed point precision");
- __m128i vXi = fpToFixedPoint(vX);
- __m128i vYi = fpToFixedPoint(vY);
-
- // quantize floating point position to fixed point precision
- // to prevent attribute creep around the triangle vertices
- vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
- vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
-
- // triangle setup - A and B edge equation coefs
- __m128 vA, vB;
- triangleSetupAB(vX, vY, vA, vB);
-
- __m128i vAi, vBi;
- triangleSetupABInt(vXi, vYi, vAi, vBi);
-
- // determinant
- float det = calcDeterminantInt(vAi, vBi);
-
- // Verts in Pixel Coordinate Space at this point
- // Det > 0 = CW winding order
- // Convert CW triangles to CCW
- if (det > 0.0)
- {
- vA = _mm_mul_ps(vA, _mm_set1_ps(-1));
- vB = _mm_mul_ps(vB, _mm_set1_ps(-1));
- vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
- vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
- det = -det;
- }
-
- __m128 vC;
- // Finish triangle setup - C edge coef
- triangleSetupC(vX, vY, vA, vB, vC);
-
- if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
- {
- // If we have degenerate edge(s) to rasterize, set I and J coefs
- // to 0 for constant interpolation of attributes
- triDesc.I[0] = 0.0f;
- triDesc.I[1] = 0.0f;
- triDesc.I[2] = 0.0f;
- triDesc.J[0] = 0.0f;
- triDesc.J[1] = 0.0f;
- triDesc.J[2] = 0.0f;
-
- // Degenerate triangles have no area
- triDesc.recipDet = 0.0f;
- }
- else
- {
- // only extract coefs for 2 of the barycentrics; the 3rd can be
- // determined from the barycentric equation:
- // i + j + k = 1 <=> k = 1 - j - i
- _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
- _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
- _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
- _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
- _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
- _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
-
- // compute recipDet, used to calculate barycentric i and j in the backend
- triDesc.recipDet = 1.0f/det;
- }
-
- OSALIGNSIMD(float) oneOverW[4];
- _mm_store_ps(oneOverW, vRecipW);
- triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
- triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
- triDesc.OneOverW[2] = oneOverW[2];
-
- // calculate perspective correct coefs per vertex attrib
- float* pPerspAttribs = perspAttribsTLS;
- float* pAttribs = workDesc.pAttribs;
- triDesc.pPerspAttribs = pPerspAttribs;
- triDesc.pAttribs = pAttribs;
- float *pRecipW = workDesc.pTriBuffer + 12;
- triDesc.pRecipW = pRecipW;
- __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
- __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1);
- __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1);
- for(uint32_t i = 0; i < workDesc.numAttribs; i++)
- {
- __m128 attribA = _mm_load_ps(pAttribs);
- __m128 attribB = _mm_load_ps(pAttribs+=4);
- __m128 attribC = _mm_load_ps(pAttribs+=4);
- pAttribs+=4;
-
- attribA = _mm_mul_ps(attribA, vOneOverWV0);
- attribB = _mm_mul_ps(attribB, vOneOverWV1);
- attribC = _mm_mul_ps(attribC, vOneOverWV2);
-
- _mm_store_ps(pPerspAttribs, attribA);
- _mm_store_ps(pPerspAttribs+=4, attribB);
- _mm_store_ps(pPerspAttribs+=4, attribC);
- pPerspAttribs+=4;
- }
-
- // compute bary Z
- // zInterp = zVert2 + i(zVert0 - zVert2) + j(zVert1 - zVert2)
- OSALIGNSIMD(float) a[4];
- _mm_store_ps(a, vZ);
- triDesc.Z[0] = a[0] - a[2];
- triDesc.Z[1] = a[1] - a[2];
- triDesc.Z[2] = a[2];
-
- // add depth bias
- triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
-
- // Calc bounding box of triangle
- OSALIGNSIMD(SWR_RECT) bbox;
- calcBoundingBoxInt(vXi, vYi, bbox);
-
- const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
-
- if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
- {
- // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
- bbox.xmin--; bbox.xmax++; bbox.ymin--; bbox.ymax++;
- SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
- "Conservative rast degenerate handling requires a valid scissor rect");
- }
-
- // Intersect with scissor/viewport
- OSALIGNSIMD(SWR_RECT) intersect;
- intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
- intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
- intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
- intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);
-
- triDesc.triFlags = workDesc.triFlags;
-
- // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
- uint32_t macroX, macroY;
- MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
- int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
- int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
- int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
- int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
-
- intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
- intersect.ymin = std::max(intersect.ymin, macroBoxTop);
- intersect.xmax = std::min(intersect.xmax, macroBoxRight);
- intersect.ymax = std::min(intersect.ymax, macroBoxBottom);
-
- SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && intersect.ymax >= 0);
-
- AR_END(BETriangleSetup, 0);
-
- // update triangle desc
- uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
- uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
- uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
- uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
- uint32_t numTilesX = maxTileX - minTileX + 1;
- uint32_t numTilesY = maxTileY - minTileY + 1;
-
- if (numTilesX == 0 || numTilesY == 0)
- {
- RDTSC_EVENT(BEEmptyTriangle, 1, 0);
- AR_END(BERasterizeTriangle, 1);
- return;
- }
-
- AR_BEGIN(BEStepSetup, pDC->drawId);
-
- // Step to pixel center of top-left pixel of the triangle bbox
- // Align intersect bbox (top/left) to raster tile's (top/left).
- int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
- int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));
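- // starting from a raster tile corner keeps the per-raster-tile edge stepping below aligned with the hot tile grid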
-
- // convenience typedef
- typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT;
-
- // single sample rasterization evaluates edges at pixel center,
- // multisample evaluates edges at the UL pixel corner and steps to each sample position
- if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
- {
- // Add 0.5, in fixed point, to offset to pixel center
- x += (FIXED_POINT_SCALE / 2);
- y += (FIXED_POINT_SCALE / 2);
- }
-
- __m128i vTopLeftX = _mm_set1_epi32(x);
- __m128i vTopLeftY = _mm_set1_epi32(y);
-
- // evaluate edge equations at top-left pixel using 64bit math
- //
- // line = Ax + By + C
- // solving for C:
- // C = -Ax - By
- // we know x0 and y0 are on the line; plug them in:
- // C = -Ax0 - By0
- // plug C back into line equation:
- // line = Ax + By - Ax0 - By0
- // line = A(x - x0) + B(y - y0)
- // dX = (x-x0), dY = (y-y0)
- // so all this simplifies to
- // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
-
- __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
- __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
-
- // evaluate A(dx) and B(dY) for all points
- __m256d vAipd = _mm256_cvtepi32_pd(vAi);
- __m256d vBipd = _mm256_cvtepi32_pd(vBi);
- __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
- __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
-
- __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
- __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
- __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
-
- // apply any edge adjustments (top-left, conservative rast, etc)
- adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);
-
- // broadcast respective edge results to all lanes
- double* pEdge = (double*)&vEdge;
- __m256d vEdgeFix16[7];
- vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
- vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
- vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);
-
- OSALIGNSIMD(int32_t) aAi[4], aBi[4];
- _mm_store_si128((__m128i*)aAi, vAi);
- _mm_store_si128((__m128i*)aBi, vBi);
- EDGE rastEdges[RT::NumEdgesT::value];
-
- // Compute and store triangle edge data
- ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
- ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
- ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);
-
- // Compute and store scissor edge data, if the scissor edges need to be rasterized
- ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
- (bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
-
- // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
- // used for testing if the entire raster tile is inside a triangle
- for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
- {
- vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
- }
-
- // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
- // step sample positions to the raster tile bbox of multisample points
- // min(xSamples),min(ySamples) ------ max(xSamples),min(ySamples)
- // | |
- // | |
- // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples)
- __m256d vEdgeTileBbox[3];
- if (NumCoverageSamplesT::value > 1)
- {
- const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
- const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX();
- const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY();
-
- __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
- __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
-
- // step edge equation tests from the UL tile corner out to the multisample bounding box corners,
- // used for testing if the entire raster tile is inside a triangle
- for (uint32_t e = 0; e < 3; ++e)
- {
- __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
- __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
- vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
-
- // adjust the msaa tile bbox edges outward for conservative rast, if enabled
- adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdgeTileBbox[e]);
- }
- }
-
- AR_END(BEStepSetup, 0);
-
- uint32_t tY = minTileY;
- uint32_t tX = minTileX;
- uint32_t maxY = maxTileY;
- uint32_t maxX = maxTileX;
-
- RenderOutputBuffers renderBuffers, currentRenderBufferRow;
- GetRenderHotTiles<RT::MT::numSamples>(pDC, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
- currentRenderBufferRow = renderBuffers;
-
- // rasterize and generate coverage masks per sample
- for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
- {
- __m256d vStartOfRowEdge[RT::NumEdgesT::value];
- for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
- {
- vStartOfRowEdge[e] = vEdgeFix16[e];
- }
-
- for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
- {
- triDesc.anyCoveredSamples = 0;
-
- // test each edge at the 4 corners of the raster tile (mask bit set when vEdge < 0)
- int mask0, mask1, mask2;
- UpdateEdgeMasks<NumCoverageSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);
-
- for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++)
- {
- // trivial reject, at least one edge has all 4 corners of raster tile outside
- bool trivialReject = TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);
-
- if (!trivialReject)
- {
- // trivial accept mask
- triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
-
- // Update the raster tile edge masks based on inner conservative edge offsets, if enabled
- UpdateEdgeMasksInnerConservative<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>
- (vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);
-
- // @todo Make this a bit smarter to allow use of trivial accept when:
- // 1) scissor/vp intersection rect is raster tile aligned
- // 2) raster tile is entirely within scissor/vp intersection rect
- if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2))
- {
- // trivial accept, all 4 corners of all 3 edges are negative
- // i.e. raster tile completely inside triangle
- triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
- if(std::is_same<typename RT::InputCoverageT, InnerConservativeCoverageT>::value)
- {
- triDesc.innerCoverageMask = 0xffffffffffffffffULL;
- }
- RDTSC_EVENT(BETrivialAccept, 1, 0);
- }
- else
- {
- __m256d vEdgeAtSample[RT::NumEdgesT::value];
- if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
- {
- // should get optimized out for single sample case (global value numbering or copy propagation)
- for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
- {
- vEdgeAtSample[e] = vEdgeFix16[e];
- }
- }
- else
- {
- const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
- __m128i vSampleOffsetXh = samplePos.vXi(sampleNum);
- __m128i vSampleOffsetYh = samplePos.vYi(sampleNum);
- __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
- __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
-
- // step edge equation tests from UL tile corner to pixel sample position
- for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
- {
- __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
- __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
- vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
- vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
- }
- }
-
- double startQuadEdges[RT::NumEdgesT::value];
- const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
- for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
- {
- _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
- }
-
- // not trivial accept or reject, must rasterize full tile
- AR_BEGIN(BERasterizePartial, pDC->drawId);
- triDesc.coverageMask[sampleNum] = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdges, rastEdges);
- AR_END(BERasterizePartial, 0);
-
- triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];
-
- // Output SV InnerCoverage, if needed
- GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
- }
- }
- else
- {
- // if we're calculating coverage per sample, store off the empty mask; otherwise there are no covered samples and nothing needs to be done
- if(NumCoverageSamplesT::value > 1)
- {
- triDesc.coverageMask[sampleNum] = 0;
- }
- RDTSC_EVENT(BETrivialReject, 1, 0);
- }
- }
-
-#if KNOB_ENABLE_TOSS_POINTS
- if(KNOB_TOSS_RS)
- {
- gToss = triDesc.coverageMask[0];
- }
- else
-#endif
- if(triDesc.anyCoveredSamples)
- {
- // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
- // copy conservative coverage result to all samples
- if(RT::IsConservativeT::value)
- {
- auto copyCoverage = [&](int sample){triDesc.coverageMask[sample] = triDesc.coverageMask[0]; };
- UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
- }
-
- AR_BEGIN(BEPixelBackend, pDC->drawId);
- backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
- AR_END(BEPixelBackend, 0);
- }
-
- // step to the next tile in X
- for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
- {
- vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
- }
- StepRasterTileX<RT>(state.psState.numRenderTargets, renderBuffers);
- }
-
- // step to the next tile in Y
- for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
- {
- vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
- }
- StepRasterTileY<RT>(state.psState.numRenderTargets, renderBuffers, currentRenderBufferRow);
- }
-
- AR_END(BERasterizeTriangle, 1);
-}
-
-void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
-{
- const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
- const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
- const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
-
- bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0;
-
- // load point vertex
- float x = *workDesc.pTriBuffer;
- float y = *(workDesc.pTriBuffer + 1);
- float z = *(workDesc.pTriBuffer + 2);
-
- // create a copy of the triangle buffer to write our adjusted vertices to
- OSALIGNSIMD(float) newTriBuffer[4 * 4];
- TRIANGLE_WORK_DESC newWorkDesc = workDesc;
- newWorkDesc.pTriBuffer = &newTriBuffer[0];
-
- // create a copy of the attrib buffer to write our adjusted attribs to
- OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
- newWorkDesc.pAttribs = &newAttribBuffer[0];
-
- newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
- newWorkDesc.numAttribs = workDesc.numAttribs;
- newWorkDesc.triFlags = workDesc.triFlags;
-
- // construct two tris by bloating point by point size
- float halfPointSize = workDesc.triFlags.pointSize * 0.5f;
- float lowerX = x - halfPointSize;
- float upperX = x + halfPointSize;
- float lowerY = y - halfPointSize;
- float upperY = y + halfPointSize;
-
- // tri 0
- float *pBuf = &newTriBuffer[0];
- *pBuf++ = lowerX;
- *pBuf++ = lowerX;
- *pBuf++ = upperX;
- pBuf++;
- *pBuf++ = lowerY;
- *pBuf++ = upperY;
- *pBuf++ = upperY;
- pBuf++;
- _mm_store_ps(pBuf, _mm_set1_ps(z));
- _mm_store_ps(pBuf+=4, _mm_set1_ps(1.0f));
-
- // setup triangle rasterizer function
- PFN_WORK_FUNC pfnTriRast;
- // conservative rast not supported for points/lines
- pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false,
- SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false));
-
- // overwrite texcoords for point sprites
- if (isPointSpriteTexCoordEnabled)
- {
- // copy original attribs
- memcpy(&newAttribBuffer[0], workDesc.pAttribs, 4 * 3 * workDesc.numAttribs * sizeof(float));
- newWorkDesc.pAttribs = &newAttribBuffer[0];
-
- // overwrite texcoord for point sprites
- uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
- DWORD texCoordAttrib = 0;
-
- while (_BitScanForward(&texCoordAttrib, texCoordMask))
- {
- texCoordMask &= ~(1 << texCoordAttrib);
- __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
- if (rastState.pointSpriteTopOrigin)
- {
- pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
- pTexAttrib[1] = _mm_set_ps(1, 0, 1, 0);
- pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
- }
- else
- {
- pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
- pTexAttrib[1] = _mm_set_ps(1, 0, 0, 0);
- pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
- }
- }
- }
- else
- {
- // no texcoord overwrite, can reuse the attrib buffer from frontend
- newWorkDesc.pAttribs = workDesc.pAttribs;
- }
-
- pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
-
- // tri 1
- pBuf = &newTriBuffer[0];
- *pBuf++ = lowerX;
- *pBuf++ = upperX;
- *pBuf++ = upperX;
- pBuf++;
- *pBuf++ = lowerY;
- *pBuf++ = upperY;
- *pBuf++ = lowerY;
- // z, w unchanged
-
- if (isPointSpriteTexCoordEnabled)
- {
- uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
- DWORD texCoordAttrib = 0;
-
- while (_BitScanForward(&texCoordAttrib, texCoordMask))
- {
- texCoordMask &= ~(1 << texCoordAttrib);
- __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
- if (rastState.pointSpriteTopOrigin)
- {
- pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
- pTexAttrib[1] = _mm_set_ps(1, 0, 1, 1);
- pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
-
- }
- else
- {
- pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
- pTexAttrib[1] = _mm_set_ps(1, 0, 0, 1);
- pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
- }
- }
- }
-
- pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
-}
-
-void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
-{
- SWR_CONTEXT *pContext = pDC->pContext;
-
-#if KNOB_ENABLE_TOSS_POINTS
- if (KNOB_TOSS_BIN_TRIS)
- {
- return;
- }
-#endif
-
- const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
- const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
-
- // map x,y relative offsets from start of raster tile to bit position in
- // coverage mask for the point
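- // (bit index = ((y >> 1) * 4 + (x >> 1)) * 4 + (y & 1) * 2 + (x & 1), matching the
- // quad-swizzled coverage masks produced by rasterizePartialTile)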
- static const uint32_t coverageMap[8][8] = {
- { 0, 1, 4, 5, 8, 9, 12, 13 },
- { 2, 3, 6, 7, 10, 11, 14, 15 },
- { 16, 17, 20, 21, 24, 25, 28, 29 },
- { 18, 19, 22, 23, 26, 27, 30, 31 },
- { 32, 33, 36, 37, 40, 41, 44, 45 },
- { 34, 35, 38, 39, 42, 43, 46, 47 },
- { 48, 49, 52, 53, 56, 57, 60, 61 },
- { 50, 51, 54, 55, 58, 59, 62, 63 }
- };
-
- OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
-
- // pull point information from triangle buffer
- // @todo use structs for readability
- uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer;
- uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1);
- float z = *(workDesc.pTriBuffer + 2);
-
- // construct triangle descriptor for point
- // no interpolation, set up i,j for constant interpolation of z and attribs
- // @todo implement an optimized backend that doesn't require triangle information
-
- // compute the coverage mask bit from the x,y offsets packed into triFlags.coverageMask;
- // mask the offsets to the maximum valid x/y index of coverageMap
- uint32_t tX = workDesc.triFlags.coverageMask & 0x7;
- uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7;
- // todo: multisample points?
- triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX];
-
- // no persp divide needed for points
- triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
- triDesc.triFlags = workDesc.triFlags;
- triDesc.recipDet = 1.0f;
- triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f;
- triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f;
- triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f;
- triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z;
-
- RenderOutputBuffers renderBuffers;
- GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT,
- renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
-
- AR_BEGIN(BEPixelBackend, pDC->drawId);
- backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
- AR_END(BEPixelBackend, 0);
-}
-
-// Get pointers to hot tile memory for color RT, depth, stencil
-template <uint32_t numSamples>
-void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex)
-{
- const API_STATE& state = GetApiState(pDC);
- SWR_CONTEXT *pContext = pDC->pContext;
-
- uint32_t mx, my;
- MacroTileMgr::getTileIndices(macroID, mx, my);
- tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
- tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;
-
- // compute tile offset for active hottile buffers
- const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
- uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
- offset*=numSamples;
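- // each raster tile's samples are stored contiguously in the hot tile, so the per-tile offset scales with the sample count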
-
- unsigned long rtSlot = 0;
- uint32_t colorHottileEnableMask = state.colorHottileEnable;
- while(_BitScanForward(&rtSlot, colorHottileEnableMask))
- {
- HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true,
- numSamples, renderTargetArrayIndex);
- pColor->state = HOTTILE_DIRTY;
- renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
-
- colorHottileEnableMask &= ~(1 << rtSlot);
- }
- if(state.depthHottileEnable)
- {
- const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
- uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
- offset*=numSamples;
- HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true,
- numSamples, renderTargetArrayIndex);
- pDepth->state = HOTTILE_DIRTY;
- SWR_ASSERT(pDepth->pBuffer != nullptr);
- renderBuffers.pDepth = pDepth->pBuffer + offset;
- }
- if(state.stencilHottileEnable)
- {
- const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
- uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
- offset*=numSamples;
- HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true,
- numSamples, renderTargetArrayIndex);
- pStencil->state = HOTTILE_DIRTY;
- SWR_ASSERT(pStencil->pBuffer != nullptr);
- renderBuffers.pStencil = pStencil->pBuffer + offset;
- }
-}
-
-template <typename RT>
-INLINE void StepRasterTileX(uint32_t NumRT, RenderOutputBuffers &buffers)
-{
- for(uint32_t rt = 0; rt < NumRT; ++rt)
- {
- buffers.pColor[rt] += RT::colorRasterTileStep;
- }
-
- buffers.pDepth += RT::depthRasterTileStep;
- buffers.pStencil += RT::stencilRasterTileStep;
-}
-
-template <typename RT>
-INLINE void StepRasterTileY(uint32_t NumRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow)
-{
- for(uint32_t rt = 0; rt < NumRT; ++rt)
- {
- startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
- buffers.pColor[rt] = startBufferRow.pColor[rt];
- }
- startBufferRow.pDepth += RT::depthRasterTileRowStep;
- buffers.pDepth = startBufferRow.pDepth;
-
- startBufferRow.pStencil += RT::stencilRasterTileRowStep;
- buffers.pStencil = startBufferRow.pStencil;
-}
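+// Table of rasterizer function pointers, populated by InitRasterizerFuncs(); indexed as
+// [sampleCount][isCenterPattern][isConservative][inputCoverage][edgeEnable][rasterizeScissorEdges]
+// (see the lookup in GetRasterizerFunc below).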
+PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT][STATE_VALID_TRI_EDGE_COUNT][2];
void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
{
// tri0 needs v0, v0, v1
for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
{
- __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a*12 + 0]);
- __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a*12 + 4]);
+ __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
+ __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
- _mm_store_ps((float*)&newAttribBuffer[a*12 + 0], vAttrib0);
- _mm_store_ps((float*)&newAttribBuffer[a*12 + 4], vAttrib0);
- _mm_store_ps((float*)&newAttribBuffer[a*12 + 8], vAttrib1);
+ _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib0);
+ _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib0);
+ _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib1);
}
// Store user clip distances for triangle 0
// setup triangle rasterizer function
PFN_WORK_FUNC pfnTriRast;
// conservative rast not supported for points/lines
- pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false,
- SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false));
+ pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false,
+ SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false));
// make sure this macrotile intersects the triangle
__m128i vXai = fpToFixedPoint(vXa);
calcBoundingBoxInt(vXai, vYai, bboxA);
if (!(bboxA.xmin > macroBoxRight ||
- bboxA.xmin > scissorInFixedPoint.xmax ||
- bboxA.xmax - 1 < macroBoxLeft ||
- bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
- bboxA.ymin > macroBoxBottom ||
- bboxA.ymin > scissorInFixedPoint.ymax ||
- bboxA.ymax - 1 < macroBoxTop ||
- bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
+ bboxA.xmin > scissorInFixedPoint.xmax ||
+ bboxA.xmax - 1 < macroBoxLeft ||
+ bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
+ bboxA.ymin > macroBoxBottom ||
+ bboxA.ymin > scissorInFixedPoint.ymax ||
+ bboxA.ymax - 1 < macroBoxTop ||
+ bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
// rasterize triangle
pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
}
calcBoundingBoxInt(vXai, vYai, bboxA);
if (!(bboxA.xmin > macroBoxRight ||
- bboxA.xmin > scissorInFixedPoint.xmax ||
- bboxA.xmax - 1 < macroBoxLeft ||
- bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
- bboxA.ymin > macroBoxBottom ||
- bboxA.ymin > scissorInFixedPoint.ymax ||
- bboxA.ymax - 1 < macroBoxTop ||
- bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
+ bboxA.xmin > scissorInFixedPoint.xmax ||
+ bboxA.xmax - 1 < macroBoxLeft ||
+ bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
+ bboxA.ymin > macroBoxBottom ||
+ bboxA.ymin > scissorInFixedPoint.ymax ||
+ bboxA.ymax - 1 < macroBoxTop ||
+ bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
// rasterize triangle
pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
}
AR_END(BERasterizeLine, 1);
}
-struct RasterizerChooser
+void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
+{
+ SWR_CONTEXT *pContext = pDC->pContext;
+
+#if KNOB_ENABLE_TOSS_POINTS
+ if (KNOB_TOSS_BIN_TRIS)
+ {
+ return;
+ }
+#endif
+
+ const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
+ const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
+
+ // map x,y relative offsets from start of raster tile to bit position in
+ // coverage mask for the point
+ static const uint32_t coverageMap[8][8] = {
+ { 0, 1, 4, 5, 8, 9, 12, 13 },
+ { 2, 3, 6, 7, 10, 11, 14, 15 },
+ { 16, 17, 20, 21, 24, 25, 28, 29 },
+ { 18, 19, 22, 23, 26, 27, 30, 31 },
+ { 32, 33, 36, 37, 40, 41, 44, 45 },
+ { 34, 35, 38, 39, 42, 43, 46, 47 },
+ { 48, 49, 52, 53, 56, 57, 60, 61 },
+ { 50, 51, 54, 55, 58, 59, 62, 63 }
+ };
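+    // e.g. a point at local raster-tile position (x=2, y=1) maps to coverageMap[1][2] == 6,
+    // i.e. bit 6 of the 64-bit coverage mask (bits are stored in 2x2 quad-interleaved order)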
+
+ OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
+
+ // pull point information from triangle buffer
+ // @todo use structs for readability
+ uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer;
+ uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1);
+ float z = *(workDesc.pTriBuffer + 2);
+
+ // construct triangle descriptor for point
+ // no interpolation, set up i,j for constant interpolation of z and attribs
+ // @todo implement an optimized backend that doesn't require triangle information
+
+ // compute coverage mask from x,y packed into the coverageMask flag
+ // mask indices by the maximum valid index for x/y of coveragemap.
+ uint32_t tX = workDesc.triFlags.coverageMask & 0x7;
+ uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7;
+ // todo: multisample points?
+ triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX];
+
+ // no persp divide needed for points
+ triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
+ triDesc.triFlags = workDesc.triFlags;
+ triDesc.recipDet = 1.0f;
+ triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f;
+ triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f;
+ triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f;
+ triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z;
+
+ RenderOutputBuffers renderBuffers;
+ GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT,
+ renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
+
+ AR_BEGIN(BEPixelBackend, pDC->drawId);
+ backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
+ AR_END(BEPixelBackend, 0);
+}
+
+void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
{
- typedef PFN_WORK_FUNC FuncType;
+ const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
+ const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
+ const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
+
+ bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0;
+
+ // load point vertex
+ float x = *workDesc.pTriBuffer;
+ float y = *(workDesc.pTriBuffer + 1);
+ float z = *(workDesc.pTriBuffer + 2);
+
+ // create a copy of the triangle buffer to write our adjusted vertices to
+ OSALIGNSIMD(float) newTriBuffer[4 * 4];
+ TRIANGLE_WORK_DESC newWorkDesc = workDesc;
+ newWorkDesc.pTriBuffer = &newTriBuffer[0];
+
+ // create a copy of the attrib buffer to write our adjusted attribs to
+ OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
+ newWorkDesc.pAttribs = &newAttribBuffer[0];
+
+ newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
+ newWorkDesc.numAttribs = workDesc.numAttribs;
+ newWorkDesc.triFlags = workDesc.triFlags;
+
+ // construct two tris by bloating point by point size
+ float halfPointSize = workDesc.triFlags.pointSize * 0.5f;
+ float lowerX = x - halfPointSize;
+ float upperX = x + halfPointSize;
+ float lowerY = y - halfPointSize;
+ float upperY = y + halfPointSize;
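+    // the point sprite quad is split along its diagonal into two triangles:
+    //   tri 0: (lowerX, lowerY), (lowerX, upperY), (upperX, upperY)
+    //   tri 1: (lowerX, lowerY), (upperX, upperY), (upperX, lowerY)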
+
+ // tri 0
+ float *pBuf = &newTriBuffer[0];
+ *pBuf++ = lowerX;
+ *pBuf++ = lowerX;
+ *pBuf++ = upperX;
+ pBuf++;
+ *pBuf++ = lowerY;
+ *pBuf++ = upperY;
+ *pBuf++ = upperY;
+ pBuf++;
+ _mm_store_ps(pBuf, _mm_set1_ps(z));
+ _mm_store_ps(pBuf += 4, _mm_set1_ps(1.0f));
+
+ // setup triangle rasterizer function
+ PFN_WORK_FUNC pfnTriRast;
+ // conservative rast not supported for points/lines
+ pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false,
+ SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false));
+
+ // overwrite texcoords for point sprites
+ if (isPointSpriteTexCoordEnabled)
+ {
+ // copy original attribs
+ memcpy(&newAttribBuffer[0], workDesc.pAttribs, 4 * 3 * workDesc.numAttribs * sizeof(float));
+ newWorkDesc.pAttribs = &newAttribBuffer[0];
+
+ // overwrite texcoord for point sprites
+ uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
+ DWORD texCoordAttrib = 0;
+
+ while (_BitScanForward(&texCoordAttrib, texCoordMask))
+ {
+ texCoordMask &= ~(1 << texCoordAttrib);
+ __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
+ if (rastState.pointSpriteTopOrigin)
+ {
+ pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
+ pTexAttrib[1] = _mm_set_ps(1, 0, 1, 0);
+ pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
+ }
+ else
+ {
+ pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
+ pTexAttrib[1] = _mm_set_ps(1, 0, 0, 0);
+ pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
+ }
+ }
+ }
+ else
+ {
+ // no texcoord overwrite, can reuse the attrib buffer from frontend
+ newWorkDesc.pAttribs = workDesc.pAttribs;
+ }
+
+ pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
+
+ // tri 1
+ pBuf = &newTriBuffer[0];
+ *pBuf++ = lowerX;
+ *pBuf++ = upperX;
+ *pBuf++ = upperX;
+ pBuf++;
+ *pBuf++ = lowerY;
+ *pBuf++ = upperY;
+ *pBuf++ = lowerY;
+ // z, w unchanged
- template <typename... ArgsB>
- static FuncType GetFunc()
+ if (isPointSpriteTexCoordEnabled)
{
- return RasterizeTriangle<RasterizerTraits<ArgsB...>>;
+ uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
+ DWORD texCoordAttrib = 0;
+
+ while (_BitScanForward(&texCoordAttrib, texCoordMask))
+ {
+ texCoordMask &= ~(1 << texCoordAttrib);
+ __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
+ if (rastState.pointSpriteTopOrigin)
+ {
+ pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
+ pTexAttrib[1] = _mm_set_ps(1, 0, 1, 1);
+ pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
+
+ }
+ else
+ {
+ pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
+ pTexAttrib[1] = _mm_set_ps(1, 0, 0, 1);
+ pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
+ }
+ }
}
-};
+
+ pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
+}
+
+void InitRasterizerFunctions()
+{
+ InitRasterizerFuncs();
+}
// Selector for correct templated RasterizeTriangle function
PFN_WORK_FUNC GetRasterizerFunc(
- uint32_t numSamples,
+ SWR_MULTISAMPLE_COUNT numSamples,
bool IsCenter,
bool IsConservative,
- uint32_t InputCoverage,
+ SWR_INPUT_COVERAGE InputCoverage,
uint32_t EdgeEnable,
bool RasterizeScissorEdges
)
{
- return TemplateArgUnroller<RasterizerChooser>::GetFunc(
- IntArg<SWR_MULTISAMPLE_1X,SWR_MULTISAMPLE_TYPE_COUNT-1>{numSamples},
- IsCenter,
- IsConservative,
- IntArg<SWR_INPUT_COVERAGE_NONE, SWR_INPUT_COVERAGE_COUNT-1>{InputCoverage},
- IntArg<0, STATE_VALID_TRI_EDGE_COUNT-1>{EdgeEnable},
- RasterizeScissorEdges);
+ SWR_ASSERT(numSamples >= 0 && numSamples < SWR_MULTISAMPLE_TYPE_COUNT);
+ SWR_ASSERT(InputCoverage >= 0 && InputCoverage < SWR_INPUT_COVERAGE_COUNT);
+ SWR_ASSERT(EdgeEnable < STATE_VALID_TRI_EDGE_COUNT);
+
+ PFN_WORK_FUNC func = gRasterizerFuncs[numSamples][IsCenter][IsConservative][InputCoverage][EdgeEnable][RasterizeScissorEdges];
+ SWR_ASSERT(func);
+
+ return func;
}
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file rasterizer.cpp
+*
+* @brief Implementation for the rasterizer.
+*
+******************************************************************************/
+
+#include <vector>
+#include <algorithm>
+
+#include "rasterizer.h"
+#include "rdtsc_core.h"
+#include "backend.h"
+#include "utils.h"
+#include "frontend.h"
+#include "tilemgr.h"
+#include "memory/tilingtraits.h"
+
+extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT][STATE_VALID_TRI_EDGE_COUNT][2];
+
+template <uint32_t numSamples = 1>
+void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex);
+template <typename RT>
+void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers);
+template <typename RT>
+void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow);
+
+#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
+static const __m256d gMaskToVecpd[] =
+{
+ MASKTOVEC(0, 0, 0, 0),
+ MASKTOVEC(0, 0, 0, 1),
+ MASKTOVEC(0, 0, 1, 0),
+ MASKTOVEC(0, 0, 1, 1),
+ MASKTOVEC(0, 1, 0, 0),
+ MASKTOVEC(0, 1, 0, 1),
+ MASKTOVEC(0, 1, 1, 0),
+ MASKTOVEC(0, 1, 1, 1),
+ MASKTOVEC(1, 0, 0, 0),
+ MASKTOVEC(1, 0, 0, 1),
+ MASKTOVEC(1, 0, 1, 0),
+ MASKTOVEC(1, 0, 1, 1),
+ MASKTOVEC(1, 1, 0, 0),
+ MASKTOVEC(1, 1, 0, 1),
+ MASKTOVEC(1, 1, 1, 0),
+ MASKTOVEC(1, 1, 1, 1),
+};
+
+struct POS
+{
+ int32_t x, y;
+};
+
+struct EDGE
+{
+ double a, b; // a, b edge coefficients in fix8
+ double stepQuadX; // step to adjacent horizontal quad in fix16
+ double stepQuadY; // step to adjacent vertical quad in fix16
+ double stepRasterTileX; // step to adjacent horizontal raster tile in fix16
+ double stepRasterTileY; // step to adjacent vertical raster tile in fix16
+
+ __m256d vQuadOffsets; // offsets for 4 samples of a quad
+ __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief rasterize a raster tile partially covered by the triangle
+/// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster tile
+/// @param vA, vB - A & B coefs for each edge of the triangle (Ax + Bx + C)
+/// @param vStepQuad0-2 - edge equations evaluated at the UL corners of the 2x2 pixel quad.
+/// Used to step between quads when sweeping over the raster tile.
+template<uint32_t NumEdges, typename EdgeMaskT>
+INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges)
+{
+ uint64_t coverageMask = 0;
+
+ __m256d vEdges[NumEdges];
+ __m256d vStepX[NumEdges];
+ __m256d vStepY[NumEdges];
+
+ for (uint32_t e = 0; e < NumEdges; ++e)
+ {
+ // Step to the pixel sample locations of the 1st quad
+ vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);
+
+ // compute step to next quad (mul by 2 in x and y direction)
+ vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
+ vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
+ }
+
+ // fast unrolled version for 8x8 tile
+#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
+ int edgeMask[NumEdges];
+ uint64_t mask;
+
+ auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);};
+ auto update_lambda = [&](int e){mask &= edgeMask[e];};
+ auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);};
+ auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);};
+ auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);};
+
+// evaluate which pixels in the quad are covered
+#define EVAL \
+ UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
+
+ // update coverage mask
+ // if edge 0 is degenerate and will be skipped; init the mask
+#define UPDATE_MASK(bit) \
+ if(std::is_same<EdgeMaskT, E1E2ValidT>::value || std::is_same<EdgeMaskT, NoEdgesValidT>::value){\
+ mask = 0xf;\
+ }\
+ else{\
+ mask = edgeMask[0]; \
+ }\
+ UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
+ coverageMask |= (mask << bit);
+
+ // step in the +x direction to the next quad
+#define INCX \
+ UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
+
+ // step in the +y direction to the next quad
+#define INCY \
+ UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
+
+ // step in the -x direction to the next quad
+#define DECX \
+ UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
+
+ // sweep 2x2 quad back and forth through the raster tile,
+ // computing coverage masks for the entire tile
+
+ // raster tile
+ // 0 1 2 3 4 5 6 7
+ // x x
+ // x x ------------------>
+ // x x |
+ // <-----------------x x V
+ // ..
+
+ // row 0
+ EVAL;
+ UPDATE_MASK(0);
+ INCX;
+ EVAL;
+ UPDATE_MASK(4);
+ INCX;
+ EVAL;
+ UPDATE_MASK(8);
+ INCX;
+ EVAL;
+ UPDATE_MASK(12);
+ INCY;
+
+ //row 1
+ EVAL;
+ UPDATE_MASK(28);
+ DECX;
+ EVAL;
+ UPDATE_MASK(24);
+ DECX;
+ EVAL;
+ UPDATE_MASK(20);
+ DECX;
+ EVAL;
+ UPDATE_MASK(16);
+ INCY;
+
+ // row 2
+ EVAL;
+ UPDATE_MASK(32);
+ INCX;
+ EVAL;
+ UPDATE_MASK(36);
+ INCX;
+ EVAL;
+ UPDATE_MASK(40);
+ INCX;
+ EVAL;
+ UPDATE_MASK(44);
+ INCY;
+
+ // row 3
+ EVAL;
+ UPDATE_MASK(60);
+ DECX;
+ EVAL;
+ UPDATE_MASK(56);
+ DECX;
+ EVAL;
+ UPDATE_MASK(52);
+ DECX;
+ EVAL;
+ UPDATE_MASK(48);
+#else
+ uint32_t bit = 0;
+ for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y)
+ {
+ __m256d vStartOfRowEdge[NumEdges];
+ for (uint32_t e = 0; e < NumEdges; ++e)
+ {
+ vStartOfRowEdge[e] = vEdges[e];
+ }
+
+ for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x)
+ {
+ int edgeMask[NumEdges];
+ for (uint32_t e = 0; e < NumEdges; ++e)
+ {
+ edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
+ }
+
+ uint64_t mask = edgeMask[0];
+ for (uint32_t e = 1; e < NumEdges; ++e)
+ {
+ mask &= edgeMask[e];
+ }
+ coverageMask |= (mask << bit);
+
+ // step to the next pixel in the x
+ for (uint32_t e = 0; e < NumEdges; ++e)
+ {
+ vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
+ }
+ bit+=4;
+ }
+
+ // step to the next row
+ for (uint32_t e = 0; e < NumEdges; ++e)
+ {
+ vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
+ }
+ }
+#endif
+ return coverageMask;
+
+}
+// Top left rule:
+// Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
+// Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge
+// Top left: a sample lying exactly on a top or left edge counts as covered.
+// Out: !(horizontal && above) = !horizontal || below
+// Out: !(!horizontal && left) = horizontal || right
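+// The adjustment below nudges edge values for samples lying exactly on an edge (edge value == 0)
+// by one fixed-point ULP, so an edge shared by two adjacent triangles is covered exactly once.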
+INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge)
+{
+ // if vA < 0, vC--
+ // if vA == 0 && vB < 0, vC--
+
+ __m256d vEdgeOut = vEdge;
+ __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));
+
+ // if vA < 0 (line is not horizontal and below)
+ int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));
+
+ // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
+ __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
+ int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
+ msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));
+
+ // if either of these are true and we're on the line (edge == 0), bump it outside the line
+ vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief calculates difference in precision between the result of manh
+/// calculation and the edge precision, based on compile time trait values
+template<typename RT>
+constexpr int64_t ManhToEdgePrecisionAdjust()
+{
+ static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
+ "Inadequate precision of result of manh calculation ");
+ return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) - RT::EdgePrecisionT::BitsT::value);
+}
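+// e.g. if vertex positions are 16.8 fixed point, the conservative offset is also expressed with
+// 8 fractional bits, and edges are x.16, the adjustment is (8 + 8) - 16 = 0 bits
+// (the actual values depend on the rasterizer traits in use).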
+
+//////////////////////////////////////////////////////////////////////////
+/// @struct adjustEdgeConservative
+/// @brief Primary template definition used for partially specializing
+/// the adjustEdgeConservative function. This version applies a non-zero
+/// conservative edge offset; the zero-offset case is specialized below.
+/// @tparam RT: rasterizer traits
+/// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
+template <typename RT, typename ConservativeEdgeOffsetT>
+struct adjustEdgeConservative
+{
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Performs calculations to adjust each edge of a triangle away
+ /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
+ /// direction.
+ ///
+ /// Uncertainty regions arise from fixed point rounding, which
+ /// can snap a vertex +/- by min fixed point value.
+ /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
+ /// This allows the rasterizer to test for coverage only at the pixel center,
+ /// instead of having to test individual pixel corners for conservative coverage
+ INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
+ {
+ // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away
+ // from the pixel center (in the direction of the edge normal A/B)
+
+        // edge = Ax + By + C - (manh/e)
+ // manh = manhattan distance = abs(A) + abs(B)
+ // e = absolute rounding error from snapping from float to fixed point precision
+
+ // 'fixed point' multiply (in double to be avx1 friendly)
+ // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
+ __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
+ __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
+ _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));
+
+ static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
+ "Inadequate precision of result of manh calculation ");
+
+ // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision
+ // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right
+ manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5));
+
+ // move the edge away from the pixel center by the required conservative precision + 1/2 pixel
+ // this allows the rasterizer to do a single conservative coverage test to see if the primitive
+ // intersects the pixel at all
+ vEdge = _mm256_sub_pd(vEdge, manh);
+ };
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief adjustEdgeConservative specialization where no edge offset is needed
+template <typename RT>
+struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>>
+{
+ INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {};
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief calculates the distance a degenerate BBox needs to be adjusted
+/// for conservative rast based on compile time trait values
+template<typename RT>
+constexpr int64_t ConservativeScissorOffset()
+{
+ static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, "Rasterizer precision > conservative precision");
+ // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges
+ typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1> DegenerateEdgeOffsetT;
+ // 1/2 pixel edge offset + conservative offset - degenerateTriangle
+ return RT::ConservativeEdgeOffsetT::value - (DegenerateEdgeOffsetT::value << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Performs calculations to adjust a vector of evaluated edges out
+/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
+/// direction.
+template <typename RT>
+INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge)
+{
+ int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
+ int64_t manh = ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> ManhToEdgePrecisionAdjust<RT>();
+ vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh));
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Performs calculations to adjust a scalar evaluated edge out
+/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
+/// direction.
+template <typename RT, typename OffsetT>
+INLINE double adjustScalarEdge(const double a, const double b, const double Edge)
+{
+ int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
+ int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>();
+ return (Edge - manh);
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Perform any needed adjustments to evaluated triangle edges
+template <typename RT, typename EdgeOffsetT>
+struct adjustEdgesFix16
+{
+ INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
+ {
+ static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
+ "Edge equation expected to be in x.16 fixed point");
+
+ static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled");
+
+ // need to apply any edge offsets before applying the top-left rule
+ adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge);
+
+ adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Perform top left adjustments to evaluated triangle edges
+template <typename RT>
+struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>>
+{
+ INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
+ {
+ adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
+ }
+};
+
+// max(abs(dz/dx), abs(dz/dy))
+INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
+{
+ /*
+ // evaluate i,j at (0,0)
+ float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
+ float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
+
+ // evaluate i,j at (1,0)
+ float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
+ float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
+
+ // compute dz/dx
+ float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
+ float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
+ float dzdx = abs(d10 - d00);
+
+ // evaluate i,j at (0,1)
+ float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
+ float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
+
+ float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
+ float dzdy = abs(d01 - d00);
+ */
+
+ // optimized version of above
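+    // the two deltas collapse to d10 - d00 = Z0*I[0] + Z1*J[0] and d01 - d00 = Z0*I[1] + Z1*J[1];
+    // recipDet normalizes the stored (unnormalized) barycentric coefficients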
+ float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0]));
+ float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1]));
+
+ return std::max(dzdx, dzdy);
+}
+
+INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
+{
+ if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
+ {
+ return (1.0f / (1 << 24));
+ }
+ else if (pState->depthFormat == R16_UNORM)
+ {
+ return (1.0f / (1 << 16));
+ }
+ else
+ {
+ SWR_ASSERT(pState->depthFormat == R32_FLOAT);
+
+        // for f32 depth, factor = 2^(exponent(max(abs(z))) - 23)
+ float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
+ uint32_t zMaxInt = *(uint32_t*)&zMax;
+ zMaxInt &= 0x7f800000;
+ zMax = *(float*)&zMaxInt;
+
+ return zMax * (1.0f / (1 << 23));
+ }
+}
+
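+// Computes bias = depthBias * biasFactor + slopeScaledDepthBias * maxDepthSlope, clamped to
+// depthBiasClamp; biasFactor is skipped when depthBiasPreAdjusted is set.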
+INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
+{
+ if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0)
+ {
+ return 0.0f;
+ }
+
+ float scale = pState->slopeScaledDepthBias;
+ if (scale != 0.0f)
+ {
+ scale *= ComputeMaxDepthSlope(pTri);
+ }
+
+ float bias = pState->depthBias;
+ if (!pState->depthBiasPreAdjusted)
+ {
+ bias *= ComputeBiasFactor(pState, pTri, z);
+ }
+ bias += scale;
+
+ if (pState->depthBiasClamp > 0.0f)
+ {
+ bias = std::min(bias, pState->depthBiasClamp);
+ }
+ else if (pState->depthBiasClamp < 0.0f)
+ {
+ bias = std::max(bias, pState->depthBiasClamp);
+ }
+
+ return bias;
+}
+
+// Prevent DCE by writing coverage mask from rasterizer to volatile
+#if KNOB_ENABLE_TOSS_POINTS
+__declspec(thread) volatile uint64_t gToss;
+#endif
+
+static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
+// try to avoid _chkstk insertions; make this thread local
+static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib];
+
+INLINE
+void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
+{
+ edge.a = a;
+ edge.b = b;
+
+ // compute constant steps to adjacent quads
+ edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE));
+ edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE));
+
+ // compute constant steps to adjacent raster tiles
+ edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE));
+ edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE));
+
+ // compute quad offsets
+ const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
+ const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);
+
+ __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8);
+ __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8);
+ edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16);
+
+ // compute raster tile offsets
+ const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0);
+ const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0);
+
+ __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8);
+ __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8);
+ edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16);
+}
+
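+// Build edge data from two fixed-point endpoints: for the directed edge p0 -> p1 the line
+// equation coefficients are A = p0.y - p1.y and B = p1.x - p0.x.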
+INLINE
+void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
+{
+ ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Primary template definition used for partially specializing
+/// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel
+/// corner to sample position, and test for coverage
+/// @tparam sampleCount: multisample count
+template <typename NumSamplesT>
+INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
+ int32_t &mask0, int32_t &mask1, int32_t &mask2)
+{
+ __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
+ // evaluate edge equations at the tile multisample bounding box
+ vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
+ vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
+ vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
+ mask0 = _mm256_movemask_pd(vSampleBboxTest0);
+ mask1 = _mm256_movemask_pd(vSampleBboxTest1);
+ mask2 = _mm256_movemask_pd(vSampleBboxTest2);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
+/// when only rasterizing a single coverage test point
+template <>
+INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* vEdgeFix16,
+ int32_t &mask0, int32_t &mask1, int32_t &mask2)
+{
+ mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
+ mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
+ mask2 = _mm256_movemask_pd(vEdgeFix16[2]);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @struct ComputeScissorEdges
+/// @brief Primary template definition. Allows the function to be generically
+/// called. When paired with below specializations, will result in an empty
+/// inlined function if scissor is not enabled
+/// @tparam RasterScissorEdgesT: is scissor enabled?
+/// @tparam IsConservativeT: is conservative rast enabled?
+/// @tparam RT: rasterizer traits
+template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT>
+struct ComputeScissorEdges
+{
+ INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
+ EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){};
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
+/// specialization. Instantiated when conservative rast and scissor are enabled
+template <typename RT>
+struct ComputeScissorEdges<std::true_type, std::true_type, RT>
+{
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
+ /// evaluate edge equations and offset them away from pixel center.
+ INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
+ EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
+ {
+ // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
+ SWR_RECT scissor;
+ scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin);
+ scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax);
+ scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin);
+ scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax);
+
+ POS topLeft{scissor.xmin, scissor.ymin};
+ POS bottomLeft{scissor.xmin, scissor.ymax};
+ POS topRight{scissor.xmax, scissor.ymin};
+ POS bottomRight{scissor.xmax, scissor.ymax};
+
+ // construct 4 scissor edges in ccw direction
+ ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
+ ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
+ ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
+ ComputeEdgeData(topRight, topLeft, rastEdges[6]);
+
+ vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
+ vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
+ vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
+ vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
+
+ // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing
+ adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]);
+ adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]);
+ adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]);
+ adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]);
+
+ // Upper left rule for scissor
+ vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
+ vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
+/// specialization. Instantiated when scissor is enabled and conservative rast
+/// is disabled.
+template <typename RT>
+struct ComputeScissorEdges<std::true_type, std::false_type, RT>
+{
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Compute scissor edge vectors and evaluate edge equations
+ INLINE ComputeScissorEdges(const SWR_RECT &, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
+ EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
+ {
+ const SWR_RECT &scissor = scissorBBox;
+ POS topLeft{scissor.xmin, scissor.ymin};
+ POS bottomLeft{scissor.xmin, scissor.ymax};
+ POS topRight{scissor.xmax, scissor.ymin};
+ POS bottomRight{scissor.xmax, scissor.ymax};
+
+ // construct 4 scissor edges in ccw direction
+ ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
+ ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
+ ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
+ ComputeEdgeData(topRight, topLeft, rastEdges[6]);
+
+ vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
+ vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
+ vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
+ vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
+
+ // Upper left rule for scissor
+ vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
+ vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Primary function template for TrivialRejectTest. Should
+/// never be called, but TemplateUnroller instantiates a few unused values,
+/// so it calls a runtime assert instead of a static_assert.
+template <typename ValidEdgeMaskT>
+INLINE bool TrivialRejectTest(const int, const int, const int)
+{
+ SWR_INVALID("Primary templated function should never be called");
+ return false;
+};
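+// For the tests below, each maskN holds one sign bit per raster tile corner for edge N;
+// a zero mask means no corner passes that edge's test, so the tile can be trivially rejected.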
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
+/// and edge 1 for trivial coverage reject
+template <>
+INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int)
+{
+ return (!(mask0 && mask1)) ? true : false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
+/// and edge 2 for trivial coverage reject
+template <>
+INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2)
+{
+ return (!(mask0 && mask2)) ? true : false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
+/// and edge 2 for trivial coverage reject
+template <>
+INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2)
+{
+ return (!(mask1 && mask2)) ? true : false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
+/// primitive edges for trivial coverage reject
+template <>
+INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
+{
+    return (!(mask0 && mask1 && mask2)) ? true : false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
+/// point, so return false and rasterize against conservative BBox
+template <>
+INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int)
+{
+ return false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Primary function template for TrivialAcceptTest. Always returns
+/// false, since it will only be called for degenerate tris, and as such
+/// will never cover the entire raster tile
+template <typename ScissorEnableT>
+INLINE bool TrivialAcceptTest(const int, const int, const int)
+{
+ return false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Specialization of TrivialAcceptTest for when scissor edges are not
+/// rasterized. Tests all edge masks for a fully covered raster tile
+template <>
+INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2)
+{
+ return ((mask0 & mask1 & mask2) == 0xf);
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Primary function template for GenerateSVInnerCoverage. Results
+/// in an empty function call if SVInnerCoverage isn't requested
+template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
+struct GenerateSVInnerCoverage
+{
+ INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*, uint64_t &){};
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Specialization of GenerateSVInnerCoverage where all edges
+/// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated
+/// edge values from OuterConservative to InnerConservative and rasterizes.
+template <typename RT>
+struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
+{
+ INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, uint32_t workerId, EDGE* pRastEdges, double* pStartQuadEdges, uint64_t &innerCoverageMask)
+ {
+ SWR_CONTEXT *pContext = pDC->pContext;
+
+ double startQuadEdgesAdj[RT::NumEdgesT::value];
+ for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+ {
+ startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
+ }
+
+ // not trivial accept or reject, must rasterize full tile
+ AR_BEGIN(BERasterizePartial, pDC->drawId);
+ innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges);
+ AR_END(BERasterizePartial, 0);
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
+/// in an empty function call if SVInnerCoverage isn't requested
+template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
+struct UpdateEdgeMasksInnerConservative
+{
+ INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*,
+ const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){};
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
+/// are non-degenerate and SVInnerCoverage is requested. Offsets the edges
+/// evaluated at raster tile corners to inner conservative position and
+/// updates edge masks
+template <typename RT>
+struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
+{
+ INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
+ const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2)
+ {
+ __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]};
+
+ // instead of keeping 2 copies of evaluated edges around, just compensate for the outer
+ // conservative evaluated edge when adjusting the edge in for inner conservative tests
+ adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[0]);
+ adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[1]);
+ adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[2]);
+
+ UpdateEdgeMasks<typename RT::NumCoverageSamplesT>(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage
+/// is requested but at least one edge is degenerate. Since a degenerate triangle cannot
+/// cover an entire raster tile, set mask0 to 0 to force it down the
+/// rasterizePartialTile path
+template <typename RT, typename ValidEdgeMaskT>
+struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
+{
+ INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*,
+ const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &)
+ {
+        // set one mask to zero to force the triangle down the rasterizePartialTile path
+ mask0 = 0;
+ }
+};
+
+template <typename RT>
+void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
+{
+ SWR_CONTEXT *pContext = pDC->pContext;
+ const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
+#if KNOB_ENABLE_TOSS_POINTS
+ if (KNOB_TOSS_BIN_TRIS)
+ {
+ return;
+ }
+#endif
+ AR_BEGIN(BERasterizeTriangle, pDC->drawId);
+ AR_BEGIN(BETriangleSetup, pDC->drawId);
+
+ const API_STATE &state = GetApiState(pDC);
+ const SWR_RASTSTATE &rastState = state.rastState;
+ const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
+
+ OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
+ triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
+
+ __m128 vX, vY, vZ, vRecipW;
+
+ // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
+ // eg: vX = [x0 x1 x2 dc]
+ vX = _mm_load_ps(workDesc.pTriBuffer);
+ vY = _mm_load_ps(workDesc.pTriBuffer + 4);
+ vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
+ vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
+
+ // convert to fixed point
+ static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, "Rasterizer expects 16.8 fixed point precision");
+ __m128i vXi = fpToFixedPoint(vX);
+ __m128i vYi = fpToFixedPoint(vY);
+
+ // quantize floating point position to fixed point precision
+ // to prevent attribute creep around the triangle vertices
+ vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
+ vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
+
+ // triangle setup - A and B edge equation coefs
+ __m128 vA, vB;
+ triangleSetupAB(vX, vY, vA, vB);
+
+ __m128i vAi, vBi;
+ triangleSetupABInt(vXi, vYi, vAi, vBi);
+
+ // determinant
+ float det = calcDeterminantInt(vAi, vBi);
+
+ // Verts in Pixel Coordinate Space at this point
+ // Det > 0 = CW winding order
+ // Convert CW triangles to CCW
+ if (det > 0.0)
+ {
+ vA = _mm_mul_ps(vA, _mm_set1_ps(-1));
+ vB = _mm_mul_ps(vB, _mm_set1_ps(-1));
+ vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
+ vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
+ det = -det;
+ }
+
+ __m128 vC;
+ // Finish triangle setup - C edge coef
+ triangleSetupC(vX, vY, vA, vB, vC);
+
+ if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
+ {
+ // If we have degenerate edge(s) to rasterize, set I and J coefs
+ // to 0 for constant interpolation of attributes
+ triDesc.I[0] = 0.0f;
+ triDesc.I[1] = 0.0f;
+ triDesc.I[2] = 0.0f;
+ triDesc.J[0] = 0.0f;
+ triDesc.J[1] = 0.0f;
+ triDesc.J[2] = 0.0f;
+
+ // Degenerate triangles have no area
+ triDesc.recipDet = 0.0f;
+ }
+ else
+ {
+ // only extract coefs for 2 of the barycentrics; the 3rd can be
+ // determined from the barycentric equation:
+ // i + j + k = 1 <=> k = 1 - j - i
+ _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
+ _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
+ _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
+ _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
+ _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
+ _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
+
+ // compute recipDet, used to calculate barycentric i and j in the backend
+ triDesc.recipDet = 1.0f/det;
+ }
+
+ OSALIGNSIMD(float) oneOverW[4];
+ _mm_store_ps(oneOverW, vRecipW);
+ triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
+ triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
+ triDesc.OneOverW[2] = oneOverW[2];
+
+ // calculate perspective correct coefs per vertex attrib
+ float* pPerspAttribs = perspAttribsTLS;
+ float* pAttribs = workDesc.pAttribs;
+ triDesc.pPerspAttribs = pPerspAttribs;
+ triDesc.pAttribs = pAttribs;
+ float *pRecipW = workDesc.pTriBuffer + 12;
+ triDesc.pRecipW = pRecipW;
+ __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
+ __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1);
+ __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1);
+ for(uint32_t i = 0; i < workDesc.numAttribs; i++)
+ {
+ __m128 attribA = _mm_load_ps(pAttribs);
+ __m128 attribB = _mm_load_ps(pAttribs+=4);
+ __m128 attribC = _mm_load_ps(pAttribs+=4);
+ pAttribs+=4;
+
+ attribA = _mm_mul_ps(attribA, vOneOverWV0);
+ attribB = _mm_mul_ps(attribB, vOneOverWV1);
+ attribC = _mm_mul_ps(attribC, vOneOverWV2);
+
+ _mm_store_ps(pPerspAttribs, attribA);
+ _mm_store_ps(pPerspAttribs+=4, attribB);
+ _mm_store_ps(pPerspAttribs+=4, attribC);
+ pPerspAttribs+=4;
+ }
+
+ // compute bary Z
+    // deltas are stored relative to vertex 2: Z[0] = z0 - z2, Z[1] = z1 - z2, Z[2] = z2,
+    // so zInterp = Z[2] + i*Z[0] + j*Z[1]
+ OSALIGNSIMD(float) a[4];
+ _mm_store_ps(a, vZ);
+ triDesc.Z[0] = a[0] - a[2];
+ triDesc.Z[1] = a[1] - a[2];
+ triDesc.Z[2] = a[2];
+
+ // add depth bias
+ triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
+
+ // Calc bounding box of triangle
+ OSALIGNSIMD(SWR_RECT) bbox;
+ calcBoundingBoxInt(vXi, vYi, bbox);
+
+ const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
+
+ if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
+ {
+ // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
+ bbox.xmin--; bbox.xmax++; bbox.ymin--; bbox.ymax++;
+ SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
+ "Conservative rast degenerate handling requires a valid scissor rect");
+ }
+
+ // Intersect with scissor/viewport
+ OSALIGNSIMD(SWR_RECT) intersect;
+ intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
+ intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
+ intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
+ intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);
+
+ triDesc.triFlags = workDesc.triFlags;
+
+ // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
+ uint32_t macroX, macroY;
+ MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
+ int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
+ int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
+ int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
+ int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
+
+ intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
+ intersect.ymin = std::max(intersect.ymin, macroBoxTop);
+ intersect.xmax = std::min(intersect.xmax, macroBoxRight);
+ intersect.ymax = std::min(intersect.ymax, macroBoxBottom);
+
+ SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && intersect.ymax >= 0);
+
+ AR_END(BETriangleSetup, 0);
+
+ // update triangle desc
+ uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
+ uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
+ uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
+ uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
+ uint32_t numTilesX = maxTileX - minTileX + 1;
+ uint32_t numTilesY = maxTileY - minTileY + 1;
+
+ if (numTilesX == 0 || numTilesY == 0)
+ {
+ RDTSC_EVENT(BEEmptyTriangle, 1, 0);
+ AR_END(BERasterizeTriangle, 1);
+ return;
+ }
+
+ AR_BEGIN(BEStepSetup, pDC->drawId);
+
+ // Step to pixel center of top-left pixel of the triangle bbox
+ // Align intersect bbox (top/left) to raster tile's (top/left).
+ int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
+ int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));
+
+ // convenience typedef
+ typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT;
+
+ // single sample rasterization evaluates edges at pixel center,
+    // multisample evaluates edges at the UL pixel corner and steps to each sample position
+ if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
+ {
+ // Add 0.5, in fixed point, to offset to pixel center
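+        // (in 16.8 fixed point, FIXED_POINT_SCALE / 2 == 128, i.e. half a pixel)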
+ x += (FIXED_POINT_SCALE / 2);
+ y += (FIXED_POINT_SCALE / 2);
+ }
+
+ __m128i vTopLeftX = _mm_set1_epi32(x);
+ __m128i vTopLeftY = _mm_set1_epi32(y);
+
+ // evaluate edge equations at top-left pixel using 64bit math
+ //
+ // line = Ax + By + C
+ // solving for C:
+ // C = -Ax - By
+ // we know x0 and y0 are on the line; plug them in:
+ // C = -Ax0 - By0
+ // plug C back into line equation:
+ // line = Ax - By - Ax0 - By0
+ // line = A(x - x0) + B(y - y0)
+ // dX = (x-x0), dY = (y-y0)
+ // so all this simplifies to
+ // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
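+    // the products A*dX and B*dY are 16.8 * 16.8 fixed-point terms that can exceed 32 bits,
+    // which is why the evaluation below is carried out in double precision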
+
+ __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
+ __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
+
+ // evaluate A(dx) and B(dY) for all points
+ __m256d vAipd = _mm256_cvtepi32_pd(vAi);
+ __m256d vBipd = _mm256_cvtepi32_pd(vBi);
+ __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
+ __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
+
+ __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
+ __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
+ __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
+
+    // apply any edge adjustments (top-left rule, conservative rast, etc.)
+ adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);
+
+ // broadcast respective edge results to all lanes
+ double* pEdge = (double*)&vEdge;
+ __m256d vEdgeFix16[7];
+ vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
+ vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
+ vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);
+
+ OSALIGNSIMD(int32_t) aAi[4], aBi[4];
+ _mm_store_si128((__m128i*)aAi, vAi);
+ _mm_store_si128((__m128i*)aBi, vBi);
+ EDGE rastEdges[RT::NumEdgesT::value];
+
+ // Compute and store triangle edge data
+ ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
+ ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
+ ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);
+
+    // Compute and store scissor edge data if the scissor rect needs to be rasterized
+ ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
+ (bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
+
+ // Evaluate edge equations at the sample positions of each of the 4 corners of a raster tile
+ // used for testing whether the entire raster tile is inside the triangle
+ for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+ {
+ vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
+ }
+
+ // at this point vEdge has been evaluated at the UL pixel corners of the raster tile bbox
+ // step the edge values to the bounding box of the multisample positions within the raster tile
+ // min(xSamples),min(ySamples) ------ max(xSamples),min(ySamples)
+ // | |
+ // | |
+ // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples)
+ __m256d vEdgeTileBbox[3];
+ if (NumCoverageSamplesT::value > 1)
+ {
+ const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
+ const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX();
+ const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY();
+
+ __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
+ __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
+
+ // step the edge equation tests from the tile corner to the extreme sample positions
+ // used for testing whether the entire raster tile is inside the triangle
+ for (uint32_t e = 0; e < 3; ++e)
+ {
+ __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
+ __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
+ vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+
+ // adjust the msaa tile bbox edges outward for conservative rast, if enabled
+ adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdgeTileBbox[e]);
+ }
+ }
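+ // vEdgeTileBbox now holds, per edge, the offsets from the tile corner evaluation to the sample
+ // bounding box corners; it is used together with vEdgeFix16 by UpdateEdgeMasks below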
+
+ AR_END(BEStepSetup, 0);
+
+ uint32_t tY = minTileY;
+ uint32_t tX = minTileX;
+ uint32_t maxY = maxTileY;
+ uint32_t maxX = maxTileX;
+
+ RenderOutputBuffers renderBuffers, currentRenderBufferRow;
+ GetRenderHotTiles<RT::MT::numSamples>(pDC, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
+ currentRenderBufferRow = renderBuffers;
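+ // currentRenderBufferRow tracks the start-of-row pointers so StepRasterTileY can reset to the next row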
+
+ // rasterize and generate coverage masks per sample
+ for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
+ {
+ __m256d vStartOfRowEdge[RT::NumEdgesT::value];
+ for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+ {
+ vStartOfRowEdge[e] = vEdgeFix16[e];
+ }
+
+ for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
+ {
+ triDesc.anyCoveredSamples = 0;
+
+ // compute edge sign masks at the 4 corners of the raster tile (vEdge < 0 means the corner is on the interior side of that edge)
+ int mask0, mask1, mask2;
+ UpdateEdgeMasks<NumCoverageSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);
+
+ for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++)
+ {
+ // trivial reject, at least one edge has all 4 corners of raster tile outside
+ bool trivialReject = TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);
+
+ if (!trivialReject)
+ {
+ // trivial accept mask
+ triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
+
+ // Update the raster tile edge masks based on inner conservative edge offsets, if enabled
+ UpdateEdgeMasksInnerConservative<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>
+ (vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);
+
+ // @todo Make this a bit smarter to allow use of trivial accept when:
+ // 1) scissor/vp intersection rect is raster tile aligned
+ // 2) raster tile is entirely within scissor/vp intersection rect
+ if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2))
+ {
+ // trivial accept, all 4 corners of all 3 edges are negative
+ // i.e. raster tile completely inside triangle
+ triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
+ if(std::is_same<typename RT::InputCoverageT, InnerConservativeCoverageT>::value)
+ {
+ triDesc.innerCoverageMask = 0xffffffffffffffffULL;
+ }
+ RDTSC_EVENT(BETrivialAccept, 1, 0);
+ }
+ else
+ {
+ __m256d vEdgeAtSample[RT::NumEdgesT::value];
+ if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
+ {
+ // should get optimized out for single sample case (global value numbering or copy propagation)
+ for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+ {
+ vEdgeAtSample[e] = vEdgeFix16[e];
+ }
+ }
+ else
+ {
+ const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
+ __m128i vSampleOffsetXh = samplePos.vXi(sampleNum);
+ __m128i vSampleOffsetYh = samplePos.vYi(sampleNum);
+ __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
+ __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
+
+ // step edge equation tests from UL tile corner to pixel sample position
+ for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+ {
+ __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
+ __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
+ vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+ vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
+ }
+ }
+
+ double startQuadEdges[RT::NumEdgesT::value];
+ const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
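+ // the mask selects only the low 64-bit lane, so each maskstore extracts lane 0: the edge value at the tile's starting (UL) position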
+ for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+ {
+ _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
+ }
+
+ // not trivial accept or reject, must rasterize full tile
+ AR_BEGIN(BERasterizePartial, pDC->drawId);
+ triDesc.coverageMask[sampleNum] = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdges, rastEdges);
+ AR_END(BERasterizePartial, 0);
+
+ triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];
+
+ // Output SV InnerCoverage, if needed
+ GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
+ }
+ }
+ else
+ {
+ // if we're calculating coverage per sample, store the empty mask; otherwise there are no covered samples and nothing needs to be done
+ if(NumCoverageSamplesT::value > 1)
+ {
+ triDesc.coverageMask[sampleNum] = 0;
+ }
+ RDTSC_EVENT(BETrivialReject, 1, 0);
+ }
+ }
+
+#if KNOB_ENABLE_TOSS_POINTS
+ if(KNOB_TOSS_RS)
+ {
+ gToss = triDesc.coverageMask[0];
+ }
+ else
+#endif
+ if(triDesc.anyCoveredSamples)
+ {
+ // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
+ // copy conservative coverage result to all samples
+ if(RT::IsConservativeT::value)
+ {
+ auto copyCoverage = [&](int sample){triDesc.coverageMask[sample] = triDesc.coverageMask[0]; };
+ UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
+ }
+
+ AR_BEGIN(BEPixelBackend, pDC->drawId);
+ backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
+ AR_END(BEPixelBackend, 0);
+ }
+
+ // step to the next tile in X
+ for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+ {
+ vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
+ }
+ StepRasterTileX<RT>(state.psState.numRenderTargets, renderBuffers);
+ }
+
+ // step to the next tile in Y
+ for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+ {
+ vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
+ }
+ StepRasterTileY<RT>(state.psState.numRenderTargets, renderBuffers, currentRenderBufferRow);
+ }
+
+ AR_END(BERasterizeTriangle, 1);
+}
+
+// Get pointers to hot tile memory for color RT, depth, stencil
+template <uint32_t numSamples>
+void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex)
+{
+ const API_STATE& state = GetApiState(pDC);
+ SWR_CONTEXT *pContext = pDC->pContext;
+
+ uint32_t mx, my;
+ MacroTileMgr::getTileIndices(macroID, mx, my);
+ tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
+ tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;
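+ // tileX/tileY are now relative to the macrotile origin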
+
+ // compute tile offset for active hottile buffers
+ const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
+ uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
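+ // scale the single-sample tile offset by the sample count to account for per-sample hot tile storage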
+ offset *= numSamples;
+
+ unsigned long rtSlot = 0;
+ uint32_t colorHottileEnableMask = state.colorHottileEnable;
+ while(_BitScanForward(&rtSlot, colorHottileEnableMask))
+ {
+ HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true,
+ numSamples, renderTargetArrayIndex);
+ pColor->state = HOTTILE_DIRTY;
+ renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
+
+ colorHottileEnableMask &= ~(1 << rtSlot);
+ }
+ if(state.depthHottileEnable)
+ {
+ const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
+ uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
+ offset *= numSamples;
+ HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true,
+ numSamples, renderTargetArrayIndex);
+ pDepth->state = HOTTILE_DIRTY;
+ SWR_ASSERT(pDepth->pBuffer != nullptr);
+ renderBuffers.pDepth = pDepth->pBuffer + offset;
+ }
+ if(state.stencilHottileEnable)
+ {
+ const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
+ uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
+ offset *= numSamples;
+ HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true,
+ numSamples, renderTargetArrayIndex);
+ pStencil->state = HOTTILE_DIRTY;
+ SWR_ASSERT(pStencil->pBuffer != nullptr);
+ renderBuffers.pStencil = pStencil->pBuffer + offset;
+ }
+}
+
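+//////////////////////////////////////////////////////////////////////////
+/// @brief Step the hot tile pointers to the adjacent raster tile in X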
+template <typename RT>
+INLINE void StepRasterTileX(uint32_t NumRT, RenderOutputBuffers &buffers)
+{
+ for(uint32_t rt = 0; rt < NumRT; ++rt)
+ {
+ buffers.pColor[rt] += RT::colorRasterTileStep;
+ }
+
+ buffers.pDepth += RT::depthRasterTileStep;
+ buffers.pStencil += RT::stencilRasterTileStep;
+}
+
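+//////////////////////////////////////////////////////////////////////////
+/// @brief Step the start-of-row hot tile pointers to the next raster tile row in Y
+///        and reset the current pointers to the start of that row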
+template <typename RT>
+INLINE void StepRasterTileY(uint32_t NumRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow)
+{
+ for(uint32_t rt = 0; rt < NumRT; ++rt)
+ {
+ startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
+ buffers.pColor[rt] = startBufferRow.pColor[rt];
+ }
+ startBufferRow.pDepth += RT::depthRasterTileRowStep;
+ buffers.pDepth = startBufferRow.pDepth;
+
+ startBufferRow.pStencil += RT::stencilRasterTileRowStep;
+ buffers.pStencil = startBufferRow.pStencil;
+}
+