/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file rasterizer.cpp
 *
 * @brief Implementation for the rasterizer.
 *
 ******************************************************************************/
#include "rasterizer.h"
#include "rdtsc_core.h"
#include "memory/tilingtraits.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <type_traits>
40 extern PFN_WORK_FUNC gRasterizerFuncs
[SWR_MULTISAMPLE_TYPE_COUNT
][2][2][SWR_INPUT_COVERAGE_COUNT
]
41 [STATE_VALID_TRI_EDGE_COUNT
][2];
43 template <uint32_t numSamples
= 1>
44 void GetRenderHotTiles(DRAW_CONTEXT
* pDC
,
49 RenderOutputBuffers
& renderBuffers
,
50 uint32_t renderTargetArrayIndex
);
51 template <typename RT
>
52 void StepRasterTileX(uint32_t colorHotTileMask
, RenderOutputBuffers
& buffers
);
53 template <typename RT
>
54 void StepRasterTileY(uint32_t colorHotTileMask
,
55 RenderOutputBuffers
& buffers
,
56 RenderOutputBuffers
& startBufferRow
);
// Expand a 4-bit movemask index into a 4-lane double blend mask.
// _mm256_blendv_pd only inspects the sign bit of each lane, and -1.0 has the
// sign bit set while 0.0 does not, so negating each bit value produces a
// per-lane select mask.  Lane 0 corresponds to mask bit 0 (i0).
#define MASKTOVEC(i3, i2, i1, i0) \
    {                             \
        -i0, -i1, -i2, -i3        \
    }
// Lookup table: gMaskToVecpd[m] is the vector blend mask for 4-bit mask m.
static const __m256d gMaskToVecpd[] = {
    MASKTOVEC(0, 0, 0, 0),
    MASKTOVEC(0, 0, 0, 1),
    MASKTOVEC(0, 0, 1, 0),
    MASKTOVEC(0, 0, 1, 1),
    MASKTOVEC(0, 1, 0, 0),
    MASKTOVEC(0, 1, 0, 1),
    MASKTOVEC(0, 1, 1, 0),
    MASKTOVEC(0, 1, 1, 1),
    MASKTOVEC(1, 0, 0, 0),
    MASKTOVEC(1, 0, 0, 1),
    MASKTOVEC(1, 0, 1, 0),
    MASKTOVEC(1, 0, 1, 1),
    MASKTOVEC(1, 1, 0, 0),
    MASKTOVEC(1, 1, 0, 1),
    MASKTOVEC(1, 1, 1, 0),
    MASKTOVEC(1, 1, 1, 1),
};
//////////////////////////////////////////////////////////////////////////
/// @struct EDGE
/// @brief Per-edge data precomputed once per triangle edge and reused while
/// stepping the edge equation across quads and raster tiles.
struct EDGE
{
    double a, b;            // a, b edge coefficients in fix8
    double stepQuadX;       // step to adjacent horizontal quad in fix16
    double stepQuadY;       // step to adjacent vertical quad in fix16
    double stepRasterTileX; // step to adjacent horizontal raster tile in fix16
    double stepRasterTileY; // step to adjacent vertical raster tile in fix16

    __m256d vQuadOffsets;       // offsets for 4 samples of a quad
    __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
};
98 //////////////////////////////////////////////////////////////////////////
99 /// @brief rasterize a raster tile partially covered by the triangle
100 /// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster
102 /// @param vA, vB - A & B coefs for each edge of the triangle (Ax + Bx + C)
103 /// @param vStepQuad0-2 - edge equations evaluated at the UL corners of the 2x2 pixel quad.
104 /// Used to step between quads when sweeping over the raster tile.
105 template <uint32_t NumEdges
, typename EdgeMaskT
>
106 INLINE
uint64_t rasterizePartialTile(DRAW_CONTEXT
* pDC
,
107 double startEdges
[NumEdges
],
110 uint64_t coverageMask
= 0;
112 __m256d vEdges
[NumEdges
];
113 __m256d vStepX
[NumEdges
];
114 __m256d vStepY
[NumEdges
];
116 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
118 // Step to the pixel sample locations of the 1st quad
119 vEdges
[e
] = _mm256_add_pd(_mm256_set1_pd(startEdges
[e
]), pRastEdges
[e
].vQuadOffsets
);
121 // compute step to next quad (mul by 2 in x and y direction)
122 vStepX
[e
] = _mm256_set1_pd(pRastEdges
[e
].stepQuadX
);
123 vStepY
[e
] = _mm256_set1_pd(pRastEdges
[e
].stepQuadY
);
126 // fast unrolled version for 8x8 tile
127 #if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
128 int edgeMask
[NumEdges
];
131 auto eval_lambda
= [&](int e
) { edgeMask
[e
] = _mm256_movemask_pd(vEdges
[e
]); };
132 auto update_lambda
= [&](int e
) { mask
&= edgeMask
[e
]; };
133 auto incx_lambda
= [&](int e
) { vEdges
[e
] = _mm256_add_pd(vEdges
[e
], vStepX
[e
]); };
134 auto incy_lambda
= [&](int e
) { vEdges
[e
] = _mm256_add_pd(vEdges
[e
], vStepY
[e
]); };
135 auto decx_lambda
= [&](int e
) { vEdges
[e
] = _mm256_sub_pd(vEdges
[e
], vStepX
[e
]); };
137 // evaluate which pixels in the quad are covered
138 #define EVAL UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
140 // update coverage mask
141 // if edge 0 is degenerate and will be skipped; init the mask
142 #define UPDATE_MASK(bit) \
143 if (std::is_same<EdgeMaskT, E1E2ValidT>::value || \
144 std::is_same<EdgeMaskT, NoEdgesValidT>::value) \
150 mask = edgeMask[0]; \
152 UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
153 coverageMask |= (mask << bit);
155 // step in the +x direction to the next quad
156 #define INCX UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
158 // step in the +y direction to the next quad
159 #define INCY UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
161 // step in the -x direction to the next quad
162 #define DECX UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
164 // sweep 2x2 quad back and forth through the raster tile,
165 // computing coverage masks for the entire tile
170 // x x ------------------>
172 // <-----------------x x V
231 for (uint32_t y
= 0; y
< KNOB_TILE_Y_DIM
/ 2; ++y
)
233 __m256d vStartOfRowEdge
[NumEdges
];
234 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
236 vStartOfRowEdge
[e
] = vEdges
[e
];
239 for (uint32_t x
= 0; x
< KNOB_TILE_X_DIM
/ 2; ++x
)
241 int edgeMask
[NumEdges
];
242 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
244 edgeMask
[e
] = _mm256_movemask_pd(vEdges
[e
]);
247 uint64_t mask
= edgeMask
[0];
248 for (uint32_t e
= 1; e
< NumEdges
; ++e
)
252 coverageMask
|= (mask
<< bit
);
254 // step to the next pixel in the x
255 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
257 vEdges
[e
] = _mm256_add_pd(vEdges
[e
], vStepX
[e
]);
262 // step to the next row
263 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
265 vEdges
[e
] = _mm256_add_pd(vStartOfRowEdge
[e
], vStepY
[e
]);
272 // Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
273 // Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it
274 // is a 'left' edge Top left: a sample is in if it is a top or left edge. Out: !(horizontal &&
275 // above) = !horizontal && below Out: !horizontal && left = !(!horizontal && left) = horizontal and
277 INLINE
void adjustTopLeftRuleIntFix16(const __m128i vA
, const __m128i vB
, __m256d
& vEdge
)
280 // if vA == 0 && vB < 0, vC--
282 __m256d vEdgeOut
= vEdge
;
283 __m256d vEdgeAdjust
= _mm256_sub_pd(vEdge
, _mm256_set1_pd(1.0));
285 // if vA < 0 (line is not horizontal and below)
286 int msk
= _mm_movemask_ps(_mm_castsi128_ps(vA
));
288 // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
289 __m128i vCmp
= _mm_cmpeq_epi32(vA
, _mm_setzero_si128());
290 int msk2
= _mm_movemask_ps(_mm_castsi128_ps(vCmp
));
291 msk2
&= _mm_movemask_ps(_mm_castsi128_ps(vB
));
293 // if either of these are true and we're on the line (edge == 0), bump it outside the line
294 vEdge
= _mm256_blendv_pd(vEdgeOut
, vEdgeAdjust
, gMaskToVecpd
[msk
| msk2
]);
//////////////////////////////////////////////////////////////////////////
/// @brief calculates difference in precision between the result of manh
/// calculation and the edge precision, based on compile time trait values
/// @tparam RT: rasterizer traits providing PrecisionT / ConservativePrecisionT
///         / EdgePrecisionT, each with a nested BitsT::value bit count
/// @return number of bits the manhattan-distance result must be shifted right
///         to land in edge precision
template <typename RT>
constexpr int64_t ManhToEdgePrecisionAdjust()
{
    static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >=
                      RT::EdgePrecisionT::BitsT::value,
                  "Inadequate precision of result of manh calculation ");
    return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) -
            RT::EdgePrecisionT::BitsT::value);
}
310 //////////////////////////////////////////////////////////////////////////
311 /// @struct adjustEdgeConservative
312 /// @brief Primary template definition used for partially specializing
313 /// the adjustEdgeConservative function. This struct should never
315 /// @tparam RT: rasterizer traits
316 /// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
317 template <typename RT
, typename ConservativeEdgeOffsetT
>
318 struct adjustEdgeConservative
320 //////////////////////////////////////////////////////////////////////////
321 /// @brief Performs calculations to adjust each edge of a triangle away
322 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
325 /// Uncertainty regions arise from fixed point rounding, which
326 /// can snap a vertex +/- by min fixed point value.
327 /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
328 /// This allows the rasterizer to test for coverage only at the pixel center,
329 /// instead of having to test individual pixel corners for conservative coverage
330 INLINE
adjustEdgeConservative(const __m128i
& vAi
, const __m128i
& vBi
, __m256d
& vEdge
)
332 // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge
333 // away from the pixel center (in the direction of the edge normal A/B)
335 // edge = Ax + Bx + C - (manh/e)
336 // manh = manhattan distance = abs(A) + abs(B)
337 // e = absolute rounding error from snapping from float to fixed point precision
339 // 'fixed point' multiply (in double to be avx1 friendly)
340 // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
341 __m256d vAai
= _mm256_cvtepi32_pd(_mm_abs_epi32(vAi
)),
342 vBai
= _mm256_cvtepi32_pd(_mm_abs_epi32(vBi
));
344 _mm256_add_pd(_mm256_mul_pd(vAai
, _mm256_set1_pd(ConservativeEdgeOffsetT::value
)),
345 _mm256_mul_pd(vBai
, _mm256_set1_pd(ConservativeEdgeOffsetT::value
)));
347 static_assert(RT::PrecisionT::BitsT::value
+ RT::ConservativePrecisionT::BitsT::value
>=
348 RT::EdgePrecisionT::BitsT::value
,
349 "Inadequate precision of result of manh calculation ");
351 // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the
352 // same precision since we're doing fixed math in double format, multiply by multiples of
353 // 1/2 instead of a bit shift right
354 manh
= _mm256_mul_pd(manh
, _mm256_set1_pd(ManhToEdgePrecisionAdjust
<RT
>() * 0.5));
356 // move the edge away from the pixel center by the required conservative precision + 1/2
357 // pixel this allows the rasterizer to do a single conservative coverage test to see if the
358 // primitive intersects the pixel at all
359 vEdge
= _mm256_sub_pd(vEdge
, manh
);
363 //////////////////////////////////////////////////////////////////////////
364 /// @brief adjustEdgeConservative specialization where no edge offset is needed
365 template <typename RT
>
366 struct adjustEdgeConservative
<RT
, std::integral_constant
<int32_t, 0>>
368 INLINE
adjustEdgeConservative(const __m128i
& vAi
, const __m128i
& vBi
, __m256d
& vEdge
){};
371 //////////////////////////////////////////////////////////////////////////
372 /// @brief calculates the distance a degenerate BBox needs to be adjusted
373 /// for conservative rast based on compile time trait values
374 template <typename RT
>
375 constexpr int64_t ConservativeScissorOffset()
377 static_assert(RT::ConservativePrecisionT::BitsT::value
- RT::PrecisionT::BitsT::value
>= 0,
378 "Rasterizer precision > conservative precision");
379 // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox
380 // when calculating scissor edges
381 typedef std::integral_constant
<int32_t, (RT::ValidEdgeMaskT::value
== ALL_EDGES_VALID
) ? 0 : 1>
382 DegenerateEdgeOffsetT
;
383 // 1/2 pixel edge offset + conservative offset - degenerateTriangle
384 return RT::ConservativeEdgeOffsetT::value
-
385 (DegenerateEdgeOffsetT::value
386 << (RT::ConservativePrecisionT::BitsT::value
- RT::PrecisionT::BitsT::value
));
389 //////////////////////////////////////////////////////////////////////////
390 /// @brief Performs calculations to adjust each a vector of evaluated edges out
391 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
393 template <typename RT
>
394 INLINE
void adjustScissorEdge(const double a
, const double b
, __m256d
& vEdge
)
396 int64_t aabs
= std::abs(static_cast<int64_t>(a
)), babs
= std::abs(static_cast<int64_t>(b
));
398 ((aabs
* ConservativeScissorOffset
<RT
>()) + (babs
* ConservativeScissorOffset
<RT
>())) >>
399 ManhToEdgePrecisionAdjust
<RT
>();
400 vEdge
= _mm256_sub_pd(vEdge
, _mm256_set1_pd(manh
));
403 //////////////////////////////////////////////////////////////////////////
404 /// @brief Performs calculations to adjust each a scalar evaluated edge out
405 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
407 template <typename RT
, typename OffsetT
>
408 INLINE
double adjustScalarEdge(const double a
, const double b
, const double Edge
)
410 int64_t aabs
= std::abs(static_cast<int64_t>(a
)), babs
= std::abs(static_cast<int64_t>(b
));
412 ((aabs
* OffsetT::value
) + (babs
* OffsetT::value
)) >> ManhToEdgePrecisionAdjust
<RT
>();
413 return (Edge
- manh
);
416 //////////////////////////////////////////////////////////////////////////
417 /// @brief Perform any needed adjustments to evaluated triangle edges
418 template <typename RT
, typename EdgeOffsetT
>
419 struct adjustEdgesFix16
421 INLINE
adjustEdgesFix16(const __m128i
& vAi
, const __m128i
& vBi
, __m256d
& vEdge
)
424 std::is_same
<typename
RT::EdgePrecisionT
, FixedPointTraits
<Fixed_X_16
>>::value
,
425 "Edge equation expected to be in x.16 fixed point");
427 static_assert(RT::IsConservativeT::value
,
428 "Edge offset assumes conservative rasterization is enabled");
430 // need to apply any edge offsets before applying the top-left rule
431 adjustEdgeConservative
<RT
, EdgeOffsetT
>(vAi
, vBi
, vEdge
);
433 adjustTopLeftRuleIntFix16(vAi
, vBi
, vEdge
);
437 //////////////////////////////////////////////////////////////////////////
438 /// @brief Perform top left adjustments to evaluated triangle edges
439 template <typename RT
>
440 struct adjustEdgesFix16
<RT
, std::integral_constant
<int32_t, 0>>
442 INLINE
adjustEdgesFix16(const __m128i
& vAi
, const __m128i
& vBi
, __m256d
& vEdge
)
444 adjustTopLeftRuleIntFix16(vAi
, vBi
, vEdge
);
448 // max(abs(dz/dx), abs(dz,dy)
449 INLINE
float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC
* pDesc
)
452 // evaluate i,j at (0,0)
453 float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
454 float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
456 // evaluate i,j at (1,0)
457 float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
458 float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
461 float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
462 float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
463 float dzdx = abs(d10 - d00);
465 // evaluate i,j at (0,1)
466 float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
467 float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
469 float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
470 float dzdy = abs(d01 - d00);
473 // optimized version of above
474 float dzdx
= fabsf(pDesc
->recipDet
* (pDesc
->Z
[0] * pDesc
->I
[0] + pDesc
->Z
[1] * pDesc
->J
[0]));
475 float dzdy
= fabsf(pDesc
->recipDet
* (pDesc
->Z
[0] * pDesc
->I
[1] + pDesc
->Z
[1] * pDesc
->J
[1]));
477 return std::max(dzdx
, dzdy
);
481 ComputeBiasFactor(const SWR_RASTSTATE
* pState
, const SWR_TRIANGLE_DESC
* pDesc
, const float* z
)
483 if (pState
->depthFormat
== R24_UNORM_X8_TYPELESS
)
485 return (1.0f
/ (1 << 24));
487 else if (pState
->depthFormat
== R16_UNORM
)
489 return (1.0f
/ (1 << 16));
493 SWR_ASSERT(pState
->depthFormat
== R32_FLOAT
);
495 // for f32 depth, factor = 2^(exponent(max(abs(z) - 23)
496 float zMax
= std::max(fabsf(z
[0]), std::max(fabsf(z
[1]), fabsf(z
[2])));
497 uint32_t zMaxInt
= *(uint32_t*)&zMax
;
498 zMaxInt
&= 0x7f800000;
499 zMax
= *(float*)&zMaxInt
;
501 return zMax
* (1.0f
/ (1 << 23));
506 ComputeDepthBias(const SWR_RASTSTATE
* pState
, const SWR_TRIANGLE_DESC
* pTri
, const float* z
)
508 if (pState
->depthBias
== 0 && pState
->slopeScaledDepthBias
== 0)
513 float scale
= pState
->slopeScaledDepthBias
;
516 scale
*= ComputeMaxDepthSlope(pTri
);
519 float bias
= pState
->depthBias
;
520 if (!pState
->depthBiasPreAdjusted
)
522 bias
*= ComputeBiasFactor(pState
, pTri
, z
);
526 if (pState
->depthBiasClamp
> 0.0f
)
528 bias
= std::min(bias
, pState
->depthBiasClamp
);
530 else if (pState
->depthBiasClamp
< 0.0f
)
532 bias
= std::max(bias
, pState
->depthBiasClamp
);
538 // Prevent DCE by writing coverage mask from rasterizer to volatile
539 #if KNOB_ENABLE_TOSS_POINTS
540 __declspec(thread
) volatile uint64_t gToss
;
543 static const uint32_t vertsPerTri
= 3, componentsPerAttrib
= 4;
544 // try to avoid _chkstk insertions; make this thread local
546 OSALIGNLINE(float) perspAttribsTLS
[vertsPerTri
* SWR_VTX_NUM_SLOTS
* componentsPerAttrib
];
549 void ComputeEdgeData(int32_t a
, int32_t b
, EDGE
& edge
)
554 // compute constant steps to adjacent quads
555 edge
.stepQuadX
= (double)((int64_t)a
* (int64_t)(2 * FIXED_POINT_SCALE
));
556 edge
.stepQuadY
= (double)((int64_t)b
* (int64_t)(2 * FIXED_POINT_SCALE
));
558 // compute constant steps to adjacent raster tiles
559 edge
.stepRasterTileX
= (double)((int64_t)a
* (int64_t)(KNOB_TILE_X_DIM
* FIXED_POINT_SCALE
));
560 edge
.stepRasterTileY
= (double)((int64_t)b
* (int64_t)(KNOB_TILE_Y_DIM
* FIXED_POINT_SCALE
));
562 // compute quad offsets
563 const __m256d vQuadOffsetsXIntFix8
= _mm256_set_pd(FIXED_POINT_SCALE
, 0, FIXED_POINT_SCALE
, 0);
564 const __m256d vQuadOffsetsYIntFix8
= _mm256_set_pd(FIXED_POINT_SCALE
, FIXED_POINT_SCALE
, 0, 0);
566 __m256d vQuadStepXFix16
= _mm256_mul_pd(_mm256_set1_pd(edge
.a
), vQuadOffsetsXIntFix8
);
567 __m256d vQuadStepYFix16
= _mm256_mul_pd(_mm256_set1_pd(edge
.b
), vQuadOffsetsYIntFix8
);
568 edge
.vQuadOffsets
= _mm256_add_pd(vQuadStepXFix16
, vQuadStepYFix16
);
570 // compute raster tile offsets
571 const __m256d vTileOffsetsXIntFix8
= _mm256_set_pd(
572 (KNOB_TILE_X_DIM
- 1) * FIXED_POINT_SCALE
, 0, (KNOB_TILE_X_DIM
- 1) * FIXED_POINT_SCALE
, 0);
573 const __m256d vTileOffsetsYIntFix8
= _mm256_set_pd(
574 (KNOB_TILE_Y_DIM
- 1) * FIXED_POINT_SCALE
, (KNOB_TILE_Y_DIM
- 1) * FIXED_POINT_SCALE
, 0, 0);
576 __m256d vTileStepXFix16
= _mm256_mul_pd(_mm256_set1_pd(edge
.a
), vTileOffsetsXIntFix8
);
577 __m256d vTileStepYFix16
= _mm256_mul_pd(_mm256_set1_pd(edge
.b
), vTileOffsetsYIntFix8
);
578 edge
.vRasterTileOffsets
= _mm256_add_pd(vTileStepXFix16
, vTileStepYFix16
);
582 void ComputeEdgeData(const POS
& p0
, const POS
& p1
, EDGE
& edge
)
584 ComputeEdgeData(p0
.y
- p1
.y
, p1
.x
- p0
.x
, edge
);
587 //////////////////////////////////////////////////////////////////////////
588 /// @brief Primary template definition used for partially specializing
589 /// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel
590 /// corner to sample position, and test for coverage
591 /// @tparam sampleCount: multisample count
592 template <typename NumSamplesT
>
593 INLINE
void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox
)[3],
594 const __m256d
* vEdgeFix16
,
599 __m256d vSampleBboxTest0
, vSampleBboxTest1
, vSampleBboxTest2
;
600 // evaluate edge equations at the tile multisample bounding box
601 vSampleBboxTest0
= _mm256_add_pd(vEdgeTileBbox
[0], vEdgeFix16
[0]);
602 vSampleBboxTest1
= _mm256_add_pd(vEdgeTileBbox
[1], vEdgeFix16
[1]);
603 vSampleBboxTest2
= _mm256_add_pd(vEdgeTileBbox
[2], vEdgeFix16
[2]);
604 mask0
= _mm256_movemask_pd(vSampleBboxTest0
);
605 mask1
= _mm256_movemask_pd(vSampleBboxTest1
);
606 mask2
= _mm256_movemask_pd(vSampleBboxTest2
);
609 //////////////////////////////////////////////////////////////////////////
610 /// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
611 /// when only rasterizing a single coverage test point
613 INLINE
void UpdateEdgeMasks
<SingleSampleT
>(
614 const __m256d (&)[3], const __m256d
* vEdgeFix16
, int32_t& mask0
, int32_t& mask1
, int32_t& mask2
)
616 mask0
= _mm256_movemask_pd(vEdgeFix16
[0]);
617 mask1
= _mm256_movemask_pd(vEdgeFix16
[1]);
618 mask2
= _mm256_movemask_pd(vEdgeFix16
[2]);
621 //////////////////////////////////////////////////////////////////////////
622 /// @struct ComputeScissorEdges
623 /// @brief Primary template definition. Allows the function to be generically
624 /// called. When paired with below specializations, will result in an empty
625 /// inlined function if scissor is not enabled
626 /// @tparam RasterScissorEdgesT: is scissor enabled?
627 /// @tparam IsConservativeT: is conservative rast enabled?
628 /// @tparam RT: rasterizer traits
629 template <typename RasterScissorEdgesT
, typename IsConservativeT
, typename RT
>
630 struct ComputeScissorEdges
632 INLINE
ComputeScissorEdges(const SWR_RECT
& triBBox
,
633 const SWR_RECT
& scissorBBox
,
636 EDGE (&rastEdges
)[RT::NumEdgesT::value
],
637 __m256d (&vEdgeFix16
)[7]){};
640 //////////////////////////////////////////////////////////////////////////
641 /// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
642 /// specialization. Instantiated when conservative rast and scissor are enabled
643 template <typename RT
>
644 struct ComputeScissorEdges
<std::true_type
, std::true_type
, RT
>
646 //////////////////////////////////////////////////////////////////////////
647 /// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
648 /// evaluate edge equations and offset them away from pixel center.
649 INLINE
ComputeScissorEdges(const SWR_RECT
& triBBox
,
650 const SWR_RECT
& scissorBBox
,
653 EDGE (&rastEdges
)[RT::NumEdgesT::value
],
654 __m256d (&vEdgeFix16
)[7])
656 // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
658 scissor
.xmin
= std::max(triBBox
.xmin
, scissorBBox
.xmin
);
659 scissor
.xmax
= std::min(triBBox
.xmax
, scissorBBox
.xmax
);
660 scissor
.ymin
= std::max(triBBox
.ymin
, scissorBBox
.ymin
);
661 scissor
.ymax
= std::min(triBBox
.ymax
, scissorBBox
.ymax
);
663 POS topLeft
{scissor
.xmin
, scissor
.ymin
};
664 POS bottomLeft
{scissor
.xmin
, scissor
.ymax
};
665 POS topRight
{scissor
.xmax
, scissor
.ymin
};
666 POS bottomRight
{scissor
.xmax
, scissor
.ymax
};
668 // construct 4 scissor edges in ccw direction
669 ComputeEdgeData(topLeft
, bottomLeft
, rastEdges
[3]);
670 ComputeEdgeData(bottomLeft
, bottomRight
, rastEdges
[4]);
671 ComputeEdgeData(bottomRight
, topRight
, rastEdges
[5]);
672 ComputeEdgeData(topRight
, topLeft
, rastEdges
[6]);
674 vEdgeFix16
[3] = _mm256_set1_pd((rastEdges
[3].a
* (x
- scissor
.xmin
)) +
675 (rastEdges
[3].b
* (y
- scissor
.ymin
)));
676 vEdgeFix16
[4] = _mm256_set1_pd((rastEdges
[4].a
* (x
- scissor
.xmin
)) +
677 (rastEdges
[4].b
* (y
- scissor
.ymax
)));
678 vEdgeFix16
[5] = _mm256_set1_pd((rastEdges
[5].a
* (x
- scissor
.xmax
)) +
679 (rastEdges
[5].b
* (y
- scissor
.ymax
)));
680 vEdgeFix16
[6] = _mm256_set1_pd((rastEdges
[6].a
* (x
- scissor
.xmax
)) +
681 (rastEdges
[6].b
* (y
- scissor
.ymin
)));
683 // if conservative rasterizing, need to bump the scissor edges out by the conservative
684 // uncertainty distance, else do nothing
685 adjustScissorEdge
<RT
>(rastEdges
[3].a
, rastEdges
[3].b
, vEdgeFix16
[3]);
686 adjustScissorEdge
<RT
>(rastEdges
[4].a
, rastEdges
[4].b
, vEdgeFix16
[4]);
687 adjustScissorEdge
<RT
>(rastEdges
[5].a
, rastEdges
[5].b
, vEdgeFix16
[5]);
688 adjustScissorEdge
<RT
>(rastEdges
[6].a
, rastEdges
[6].b
, vEdgeFix16
[6]);
690 // Upper left rule for scissor
691 vEdgeFix16
[3] = _mm256_sub_pd(vEdgeFix16
[3], _mm256_set1_pd(1.0));
692 vEdgeFix16
[6] = _mm256_sub_pd(vEdgeFix16
[6], _mm256_set1_pd(1.0));
696 //////////////////////////////////////////////////////////////////////////
697 /// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
698 /// specialization. Instantiated when scissor is enabled and conservative rast
700 template <typename RT
>
701 struct ComputeScissorEdges
<std::true_type
, std::false_type
, RT
>
703 //////////////////////////////////////////////////////////////////////////
704 /// @brief Compute scissor edge vectors and evaluate edge equations
705 INLINE
ComputeScissorEdges(const SWR_RECT
&,
706 const SWR_RECT
& scissorBBox
,
709 EDGE (&rastEdges
)[RT::NumEdgesT::value
],
710 __m256d (&vEdgeFix16
)[7])
712 const SWR_RECT
& scissor
= scissorBBox
;
713 POS topLeft
{scissor
.xmin
, scissor
.ymin
};
714 POS bottomLeft
{scissor
.xmin
, scissor
.ymax
};
715 POS topRight
{scissor
.xmax
, scissor
.ymin
};
716 POS bottomRight
{scissor
.xmax
, scissor
.ymax
};
718 // construct 4 scissor edges in ccw direction
719 ComputeEdgeData(topLeft
, bottomLeft
, rastEdges
[3]);
720 ComputeEdgeData(bottomLeft
, bottomRight
, rastEdges
[4]);
721 ComputeEdgeData(bottomRight
, topRight
, rastEdges
[5]);
722 ComputeEdgeData(topRight
, topLeft
, rastEdges
[6]);
724 vEdgeFix16
[3] = _mm256_set1_pd((rastEdges
[3].a
* (x
- scissor
.xmin
)) +
725 (rastEdges
[3].b
* (y
- scissor
.ymin
)));
726 vEdgeFix16
[4] = _mm256_set1_pd((rastEdges
[4].a
* (x
- scissor
.xmin
)) +
727 (rastEdges
[4].b
* (y
- scissor
.ymax
)));
728 vEdgeFix16
[5] = _mm256_set1_pd((rastEdges
[5].a
* (x
- scissor
.xmax
)) +
729 (rastEdges
[5].b
* (y
- scissor
.ymax
)));
730 vEdgeFix16
[6] = _mm256_set1_pd((rastEdges
[6].a
* (x
- scissor
.xmax
)) +
731 (rastEdges
[6].b
* (y
- scissor
.ymin
)));
733 // Upper left rule for scissor
734 vEdgeFix16
[3] = _mm256_sub_pd(vEdgeFix16
[3], _mm256_set1_pd(1.0));
735 vEdgeFix16
[6] = _mm256_sub_pd(vEdgeFix16
[6], _mm256_set1_pd(1.0));
739 //////////////////////////////////////////////////////////////////////////
740 /// @brief Primary function template for TrivialRejectTest. Should
741 /// never be called, but TemplateUnroller instantiates a few unused values,
742 /// so it calls a runtime assert instead of a static_assert.
743 template <typename ValidEdgeMaskT
>
744 INLINE
bool TrivialRejectTest(const int, const int, const int)
746 SWR_INVALID("Primary templated function should never be called");
750 //////////////////////////////////////////////////////////////////////////
751 /// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
752 /// and edge 1 for trivial coverage reject
754 INLINE
bool TrivialRejectTest
<E0E1ValidT
>(const int mask0
, const int mask1
, const int)
756 return (!(mask0
&& mask1
)) ? true : false;
759 //////////////////////////////////////////////////////////////////////////
760 /// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
761 /// and edge 2 for trivial coverage reject
763 INLINE
bool TrivialRejectTest
<E0E2ValidT
>(const int mask0
, const int, const int mask2
)
765 return (!(mask0
&& mask2
)) ? true : false;
768 //////////////////////////////////////////////////////////////////////////
769 /// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
770 /// and edge 2 for trivial coverage reject
772 INLINE
bool TrivialRejectTest
<E1E2ValidT
>(const int, const int mask1
, const int mask2
)
774 return (!(mask1
&& mask2
)) ? true : false;
777 //////////////////////////////////////////////////////////////////////////
778 /// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
779 /// primitive edges for trivial coverage reject
781 INLINE
bool TrivialRejectTest
<AllEdgesValidT
>(const int mask0
, const int mask1
, const int mask2
)
783 return (!(mask0
&& mask1
&& mask2
)) ? true : false;
787 //////////////////////////////////////////////////////////////////////////
788 /// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
789 /// point, so return false and rasterize against conservative BBox
791 INLINE
bool TrivialRejectTest
<NoEdgesValidT
>(const int, const int, const int)
796 //////////////////////////////////////////////////////////////////////////
797 /// @brief Primary function template for TrivialAcceptTest. Always returns
798 /// false, since it will only be called for degenerate tris, and as such
799 /// will never cover the entire raster tile
800 template <typename ScissorEnableT
>
801 INLINE
bool TrivialAcceptTest(const int, const int, const int)
806 //////////////////////////////////////////////////////////////////////////
807 /// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all
808 /// edge masks for a fully covered raster tile
810 INLINE
bool TrivialAcceptTest
<std::false_type
>(const int mask0
, const int mask1
, const int mask2
)
812 return ((mask0
& mask1
& mask2
) == 0xf);
//////////////////////////////////////////////////////////////////////////
/// @brief Primary function template for GenerateSVInnerCoverage. Results
///        in an empty function call if SVInnerCoverage isn't requested
template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
struct GenerateSVInnerCoverage
{
    // No-op: inner-conservative coverage was not requested for this
    // rasterizer template instantiation.
    INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*, uint64_t&){};
};
//////////////////////////////////////////////////////////////////////////
/// @brief Specialization of GenerateSVInnerCoverage where all edges
/// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated
/// edge values from OuterConservative to InnerConservative and rasterizes.
/// @param pDC - pointer to draw context
/// @param pRastEdges - edge equation data for the triangle's edges
/// @param pStartQuadEdges - edge equations evaluated at the tile start quad
/// @param innerCoverageMask - out: rasterized inner-conservative coverage
template <typename RT>
struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
{
    // NOTE(review): the workerId / pRastEdges parameter lines were lost in
    // extraction; reconstructed to match the primary template's signature
    // (DRAW_CONTEXT*, uint32_t, EDGE*, double*, uint64_t&) - confirm.
    INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC,
                                   uint32_t      workerId,
                                   EDGE*         pRastEdges,
                                   double*       pStartQuadEdges,
                                   uint64_t&     innerCoverageMask)
    {
        // Shift each edge value from its outer-conservative position to the
        // inner-conservative position before re-rasterizing.
        double startQuadEdgesAdj[RT::NumEdgesT::value];
        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
        {
            startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(
                pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
        }

        // not trivial accept or reject, must rasterize full tile
        RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizePartial, pDC->drawId);
        innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(
            pDC, startQuadEdgesAdj, pRastEdges);
        RDTSC_END(pDC->pContext->pBucketMgr, BERasterizePartial, 0);
    }
};
//////////////////////////////////////////////////////////////////////////
/// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
/// in an empty function call if SVInnerCoverage isn't requested
template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
struct UpdateEdgeMasksInnerConservative
{
    // No-op when inner-conservative coverage isn't requested.
    // NOTE(review): trailing parameter lines were lost in extraction;
    // reconstructed to mirror the InnerConservativeCoverageT
    // specializations that follow - confirm against upstream.
    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3],
                                            const __m256d*,
                                            const __m128i,
                                            const __m128i,
                                            int32_t&,
                                            int32_t&,
                                            int32_t&){};
};
//////////////////////////////////////////////////////////////////////////
/// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
/// are non-degenerate and SVInnerCoverage is requested. Offsets the edges
/// evaluated at raster tile corners to inner conservative position and
/// updates edge masks
/// @param vEdgeTileBbox - edge equations stepped to the tile sample bbox
/// @param vEdgeFix16 - edge equations evaluated at the tile corners
/// @param vAi, vBi - integer A/B edge coefficients for all 3 edges
/// @param mask0..mask2 - in/out per-edge corner coverage masks
template <typename RT>
struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
{
    // NOTE(review): the vAi/vBi/mask parameter lines were lost in
    // extraction; reconstructed from the call site in RasterizeTriangle.
    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3],
                                            const __m256d* vEdgeFix16,
                                            const __m128i  vAi,
                                            const __m128i  vBi,
                                            int32_t&       mask0,
                                            int32_t&       mask1,
                                            int32_t&       mask2)
    {
        __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]};

        // instead of keeping 2 copies of evaluated edges around, just compensate for the outer
        // conservative evaluated edge when adjusting the edge in for inner conservative tests
        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(
            vAi, vBi, vTempEdge[0]);
        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(
            vAi, vBi, vTempEdge[1]);
        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(
            vAi, vBi, vTempEdge[2]);

        // re-test the tile corners against the inner-conservative edges
        UpdateEdgeMasks<typename RT::NumCoverageSamplesT>(
            vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
    }
};
//////////////////////////////////////////////////////////////////////////
/// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage
/// is requested but at least one edge is degenerate. Since a degenerate triangle cannot
/// cover an entire raster tile, set mask0 to 0 to force it down the
/// rasterizePartialTile path
template <typename RT, typename ValidEdgeMaskT>
struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
{
    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3],
                                            const __m256d*,
                                            const __m128i,
                                            const __m128i,
                                            int32_t& mask0,
                                            int32_t&,
                                            int32_t&)
    {
        // set one mask to zero to force the triangle down the rasterizePartialTile path
        mask0 = 0;
    }
};
//////////////////////////////////////////////////////////////////////////
/// @brief Fixed-point rasterization of one binned triangle into the raster
///        tiles of a single macro tile, invoking the pixel backend for each
///        covered tile. Template parameter RT bundles the rasterizer traits
///        (sample count, edge validity, conservative-rast mode, coverage).
/// @param pDC - pointer to draw context
/// @param workerId - id of the backend worker thread executing this work
/// @param macroTile - macro tile id being rasterized
/// @param pDesc - pointer to a TRIANGLE_WORK_DESC for the triangle
///
/// NOTE(review): this function was reconstructed from a line-mangled
/// extraction; lines not present in the extraction (braces, returns, a few
/// argument lists) were filled from the surrounding tokens - verify against
/// the upstream OpenSWR rasterizer before relying on exact statement text.
template <typename RT>
void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
{
    const TRIANGLE_WORK_DESC& workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_BIN_TRIS)
    {
        return;
    }
#endif
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizeTriangle, pDC->drawId);
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BETriangleSetup, pDC->drawId);

    const API_STATE&     state        = GetApiState(pDC);
    const SWR_RASTSTATE& rastState    = state.rastState;
    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;

    OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
    triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;

    __m128 vX, vY, vZ, vRecipW;

    // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
    // eg: vX = [x0 x1 x2 dc]
    vX      = _mm_load_ps(workDesc.pTriBuffer);
    vY      = _mm_load_ps(workDesc.pTriBuffer + 4);
    vZ      = _mm_load_ps(workDesc.pTriBuffer + 8);
    vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);

    // convert to fixed point
    static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value,
                  "Rasterizer expects 16.8 fixed point precision");
    __m128i vXi = fpToFixedPoint(vX);
    __m128i vYi = fpToFixedPoint(vY);

    // quantize floating point position to fixed point precision
    // to prevent attribute creep around the triangle vertices
    vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
    vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));

    // triangle setup - A and B edge equation coefs
    __m128 vA, vB;
    triangleSetupAB(vX, vY, vA, vB);

    __m128i vAi, vBi;
    triangleSetupABInt(vXi, vYi, vAi, vBi);

    // determinant
    float det = calcDeterminantInt(vAi, vBi);

    // Verts in Pixel Coordinate Space at this point
    // Det > 0 = CW winding order
    // Convert CW triangles to CCW
    if (det > 0.0)
    {
        vA  = _mm_mul_ps(vA, _mm_set1_ps(-1));
        vB  = _mm_mul_ps(vB, _mm_set1_ps(-1));
        vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
        vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
        det = -det;
    }

    __m128 vC;
    // Finish triangle setup - C edge coef
    triangleSetupC(vX, vY, vA, vB, vC);

    if (RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
    {
        // If we have degenerate edge(s) to rasterize, set I and J coefs
        // to 0 for constant interpolation of attributes
        triDesc.I[0] = 0.0f;
        triDesc.I[1] = 0.0f;
        triDesc.I[2] = 0.0f;
        triDesc.J[0] = 0.0f;
        triDesc.J[1] = 0.0f;
        triDesc.J[2] = 0.0f;

        // Degenerate triangles have no area
        triDesc.recipDet = 0.0f;
    }
    else
    {
        // only extract coefs for 2 of the barycentrics; the 3rd can be
        // determined from the barycentric equation:
        // i + j + k = 1 <=> k = 1 - j - i
        _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
        _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
        _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
        _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
        _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
        _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);

        // compute recipDet, used to calculate barycentric i and j in the backend
        triDesc.recipDet = 1.0f / det;
    }

    OSALIGNSIMD(float) oneOverW[4];
    _mm_store_ps(oneOverW, vRecipW);
    triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
    triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
    triDesc.OneOverW[2] = oneOverW[2];

    // calculate perspective correct coefs per vertex attrib
    float* pPerspAttribs  = perspAttribsTLS;
    float* pAttribs       = workDesc.pAttribs;
    triDesc.pPerspAttribs = pPerspAttribs;
    triDesc.pAttribs      = pAttribs;
    float* pRecipW        = workDesc.pTriBuffer + 12;
    triDesc.pRecipW       = pRecipW;
    __m128 vOneOverWV0    = _mm_broadcast_ss(pRecipW);
    __m128 vOneOverWV1    = _mm_broadcast_ss(pRecipW += 1);
    __m128 vOneOverWV2    = _mm_broadcast_ss(pRecipW += 1);
    for (uint32_t i = 0; i < workDesc.numAttribs; i++)
    {
        __m128 attribA = _mm_load_ps(pAttribs);
        __m128 attribB = _mm_load_ps(pAttribs += 4);
        __m128 attribC = _mm_load_ps(pAttribs += 4);
        pAttribs += 4;

        attribA = _mm_mul_ps(attribA, vOneOverWV0);
        attribB = _mm_mul_ps(attribB, vOneOverWV1);
        attribC = _mm_mul_ps(attribC, vOneOverWV2);

        _mm_store_ps(pPerspAttribs, attribA);
        _mm_store_ps(pPerspAttribs += 4, attribB);
        _mm_store_ps(pPerspAttribs += 4, attribC);
        pPerspAttribs += 4;
    }

    // compute bary Z
    // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
    OSALIGNSIMD(float) a[4];
    _mm_store_ps(a, vZ);
    triDesc.Z[0] = a[0] - a[2];
    triDesc.Z[1] = a[1] - a[2];
    triDesc.Z[2] = a[2];

    // add depth bias
    triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);

    // Calc bounding box of triangle
    OSALIGNSIMD(SWR_RECT) bbox;
    calcBoundingBoxInt(vXi, vYi, bbox);

    const SWR_RECT& scissorInFixedPoint =
        state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];

    if (RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
    {
        // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is
        // valid
        bbox.xmin--;
        bbox.xmax++;
        bbox.ymin--;
        bbox.ymax++;
        SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
                   "Conservative rast degenerate handling requires a valid scissor rect");
    }

    // Intersect with scissor/viewport
    OSALIGNSIMD(SWR_RECT) intersect;
    intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
    intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
    intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
    intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);

    triDesc.triFlags = workDesc.triFlags;

    // further constrain backend to intersecting bounding box of macro tile and scissored triangle
    uint32_t macroX, macroY;
    MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
    int32_t macroBoxLeft   = macroX * KNOB_MACROTILE_X_DIM_FIXED;
    int32_t macroBoxRight  = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
    int32_t macroBoxTop    = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
    int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;

    intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
    intersect.ymin = std::max(intersect.ymin, macroBoxTop);
    intersect.xmax = std::min(intersect.xmax, macroBoxRight);
    intersect.ymax = std::min(intersect.ymax, macroBoxBottom);

    SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax &&
               intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 &&
               intersect.ymax >= 0);

    RDTSC_END(pDC->pContext->pBucketMgr, BETriangleSetup, 0);

    // update triangle desc
    uint32_t minTileX  = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
    uint32_t minTileY  = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
    uint32_t maxTileX  = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
    uint32_t maxTileY  = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
    uint32_t numTilesX = maxTileX - minTileX + 1;
    uint32_t numTilesY = maxTileY - minTileY + 1;

    if (numTilesX == 0 || numTilesY == 0)
    {
        RDTSC_EVENT(pDC->pContext->pBucketMgr, BEEmptyTriangle, 1, 0);
        RDTSC_END(pDC->pContext->pBucketMgr, BERasterizeTriangle, 1);
        return;
    }

    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEStepSetup, pDC->drawId);

    // Step to pixel center of top-left pixel of the triangle bbox
    // Align intersect bbox (top/left) to raster tile's (top/left).
    int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
    int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));

    // convenience typedef
    typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT;

    // single sample rasterization evaluates edges at pixel center,
    // multisample evaluates edges UL pixel corner and steps to each sample position
    if (std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
    {
        // Add 0.5, in fixed point, to offset to pixel center
        x += (FIXED_POINT_SCALE / 2);
        y += (FIXED_POINT_SCALE / 2);
    }

    __m128i vTopLeftX = _mm_set1_epi32(x);
    __m128i vTopLeftY = _mm_set1_epi32(y);

    // evaluate edge equations at top-left pixel using 64bit math
    //
    // line = Ax + By + C
    // solving for C:
    // C = -Ax - By
    // we know x0 and y0 are on the line; plug them in:
    // C = -Ax0 - By0
    // plug C back into line equation:
    // line = Ax - By - Ax0 - By0
    // line = A(x - x0) + B(y - y0)
    // dX = (x-x0), dY = (y-y0)
    // so all this simplifies to
    // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within

    __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
    __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);

    // evaluate A(dx) and B(dY) for all points
    __m256d vAipd     = _mm256_cvtepi32_pd(vAi);
    __m256d vBipd     = _mm256_cvtepi32_pd(vBi);
    __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
    __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);

    __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
    __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
    __m256d vEdge          = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);

    // apply any edge adjustments(top-left, crast, etc)
    adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);

    // broadcast respective edge results to all lanes
    double* pEdge = (double*)&vEdge;
    __m256d vEdgeFix16[7];
    vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
    vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
    vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);

    OSALIGNSIMD(int32_t) aAi[4], aBi[4];
    _mm_store_si128((__m128i*)aAi, vAi);
    _mm_store_si128((__m128i*)aBi, vBi);
    EDGE rastEdges[RT::NumEdgesT::value];

    // Compute and store triangle edge data
    ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
    ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
    ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);

    // Compute and store triangle edge data if scissor needs to rasterized
    ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>(
        bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);

    // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
    // used to for testing if entire raster tile is inside a triangle
    for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
    {
        vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
    }

    // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
    // step sample positions to the raster tile bbox of multisample points
    // min(xSamples),min(ySamples)  ------  max(xSamples),min(ySamples)
    //                             |      |
    //                             |      |
    // min(xSamples),max(ySamples)  ------  max(xSamples),max(ySamples)
    __m256d vEdgeTileBbox[3];
    if (NumCoverageSamplesT::value > 1)
    {
        const SWR_MULTISAMPLE_POS& samplePos = rastState.samplePositions;
        const __m128i vTileSampleBBoxXh      = samplePos.TileSampleOffsetsX();
        const __m128i vTileSampleBBoxYh      = samplePos.TileSampleOffsetsY();

        __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
        __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);

        // step edge equation tests from Tile
        // used to for testing if entire raster tile is inside a triangle
        for (uint32_t e = 0; e < 3; ++e)
        {
            __m256d vResultAxFix16 =
                _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
            __m256d vResultByFix16 =
                _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
            vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);

            // adjust for msaa tile bbox edges outward for conservative rast, if enabled
            adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(
                vAi, vBi, vEdgeTileBbox[e]);
        }
    }

    RDTSC_END(pDC->pContext->pBucketMgr, BEStepSetup, 0);

    uint32_t tY   = minTileY;
    uint32_t tX   = minTileX;
    uint32_t maxY = maxTileY;
    uint32_t maxX = maxTileX;

    RenderOutputBuffers renderBuffers, currentRenderBufferRow;
    GetRenderHotTiles<RT::MT::numSamples>(pDC,
                                          workerId,
                                          macroTile,
                                          minTileX,
                                          minTileY,
                                          renderBuffers,
                                          triDesc.triFlags.renderTargetArrayIndex);
    currentRenderBufferRow = renderBuffers;

    // rasterize and generate coverage masks per sample
    for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
    {
        __m256d vStartOfRowEdge[RT::NumEdgesT::value];
        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
        {
            vStartOfRowEdge[e] = vEdgeFix16[e];
        }

        for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
        {
            triDesc.anyCoveredSamples = 0;

            // is the corner of the edge outside of the raster tile? (vEdge < 0)
            int mask0, mask1, mask2;
            UpdateEdgeMasks<NumCoverageSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);

            for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++)
            {
                // trivial reject, at least one edge has all 4 corners of raster tile outside
                bool trivialReject =
                    TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);

                if (!trivialReject)
                {
                    // trivial accept mask
                    triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;

                    // Update the raster tile edge masks based on inner conservative edge offsets,
                    // if enabled
                    UpdateEdgeMasksInnerConservative<RT,
                                                     typename RT::ValidEdgeMaskT,
                                                     typename RT::InputCoverageT>(
                        vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);

                    // @todo Make this a bit smarter to allow use of trivial accept when:
                    //   1) scissor/vp intersection rect is raster tile aligned
                    //   2) raster tile is entirely within scissor/vp intersection rect
                    if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(
                            mask0, mask1, mask2))
                    {
                        // trivial accept, all 4 corners of all 3 edges are negative
                        // i.e. raster tile completely inside triangle
                        triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
                        if (std::is_same<typename RT::InputCoverageT,
                                         InnerConservativeCoverageT>::value)
                        {
                            triDesc.innerCoverageMask = 0xffffffffffffffffULL;
                        }
                        RDTSC_EVENT(pDC->pContext->pBucketMgr, BETrivialAccept, 1, 0);
                    }
                    else
                    {
                        __m256d vEdgeAtSample[RT::NumEdgesT::value];
                        if (std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
                        {
                            // should get optimized out for single sample case (global value
                            // numbering or copy propagation)
                            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
                            {
                                vEdgeAtSample[e] = vEdgeFix16[e];
                            }
                        }
                        else
                        {
                            const SWR_MULTISAMPLE_POS& samplePos = rastState.samplePositions;
                            __m128i vSampleOffsetXh = samplePos.vXi(sampleNum);
                            __m128i vSampleOffsetYh = samplePos.vYi(sampleNum);
                            __m256d vSampleOffsetX  = _mm256_cvtepi32_pd(vSampleOffsetXh);
                            __m256d vSampleOffsetY  = _mm256_cvtepi32_pd(vSampleOffsetYh);

                            // step edge equation tests from UL tile corner to pixel sample position
                            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
                            {
                                __m256d vResultAxFix16 =
                                    _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
                                __m256d vResultByFix16 =
                                    _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
                                vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
                                vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
                            }
                        }

                        double startQuadEdges[RT::NumEdgesT::value];
                        const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
                        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
                        {
                            _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
                        }

                        // not trivial accept or reject, must rasterize full tile
                        RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizePartial, pDC->drawId);
                        triDesc.coverageMask[sampleNum] =
                            rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(
                                pDC, startQuadEdges, rastEdges);
                        RDTSC_END(pDC->pContext->pBucketMgr, BERasterizePartial, 0);

                        triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];

                        // Output SV InnerCoverage, if needed
                        GenerateSVInnerCoverage<RT,
                                                typename RT::ValidEdgeMaskT,
                                                typename RT::InputCoverageT>(
                            pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
                    }
                }
                else
                {
                    // if we're calculating coverage per sample, need to store it off. otherwise no
                    // covered samples, don't need to do anything
                    if (NumCoverageSamplesT::value > 1)
                    {
                        triDesc.coverageMask[sampleNum] = 0;
                    }
                    RDTSC_EVENT(pDC->pContext->pBucketMgr, BETrivialReject, 1, 0);
                }
            }

#if KNOB_ENABLE_TOSS_POINTS
            if (KNOB_TOSS_RS)
            {
                gToss = triDesc.coverageMask[0];
            }
            else
#endif
                if (triDesc.anyCoveredSamples)
            {
                // if conservative rast and MSAA are enabled, conservative coverage for a pixel
                // means all samples in that pixel are covered copy conservative coverage result to
                // all samples
                if (RT::IsConservativeT::value)
                {
                    auto copyCoverage = [&](int sample) {
                        triDesc.coverageMask[sample] = triDesc.coverageMask[0];
                    };
                    UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
                }

                // Track rasterized subspans
                AR_EVENT(RasterTileCount(pDC->drawId, 1));

                RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelBackend, pDC->drawId);
                backendFuncs.pfnBackend(pDC,
                                        workerId,
                                        tileX << KNOB_TILE_X_DIM_SHIFT,
                                        tileY << KNOB_TILE_Y_DIM_SHIFT,
                                        triDesc,
                                        renderBuffers);
                RDTSC_END(pDC->pContext->pBucketMgr, BEPixelBackend, 0);
            }

            // step to the next tile in X
            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
            {
                vEdgeFix16[e] =
                    _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
            }
            StepRasterTileX<RT>(state.colorHottileEnable, renderBuffers);
        }

        // step to the next tile in Y
        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
        {
            vEdgeFix16[e] =
                _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
        }
        StepRasterTileY<RT>(state.colorHottileEnable, renderBuffers, currentRenderBufferRow);
    }

    RDTSC_END(pDC->pContext->pBucketMgr, BERasterizeTriangle, 1);
}
// Get pointers to hot tile memory for color RT, depth, stencil
//////////////////////////////////////////////////////////////////////////
/// @brief Resolves hot-tile buffer pointers (color RTs, depth, stencil) for
///        one raster tile within a macro tile and stores them, offset to the
///        raster tile, into renderBuffers.
/// @param pDC - pointer to draw context
/// @param workerId - worker thread id, used to fetch per-worker private data
/// @param macroID - macro tile id containing the raster tile
/// @param tileX, tileY - raster tile coords (absolute; converted to
///        macro-tile-relative below)
/// @param renderBuffers - out: per-target buffer pointers for this tile
/// @param renderTargetArrayIndex - render target array slice to fetch
template <uint32_t numSamples>
void GetRenderHotTiles(DRAW_CONTEXT*        pDC,
                       uint32_t             workerId,
                       uint32_t             macroID,
                       uint32_t             tileX,
                       uint32_t             tileY,
                       RenderOutputBuffers& renderBuffers,
                       uint32_t             renderTargetArrayIndex)
{
    const API_STATE& state    = GetApiState(pDC);
    SWR_CONTEXT*     pContext = pDC->pContext;
    HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    uint32_t mx, my;
    MacroTileMgr::getTileIndices(macroID, mx, my);
    // convert absolute raster tile coords to macro-tile-relative coords
    tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
    tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;

    // compute tile offset for active hottile buffers
    const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
    uint32_t offset = ComputeTileOffset2D<
        TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp>>(
        pitch, tileX, tileY);
    offset *= numSamples;

    unsigned long rtSlot                 = 0;
    uint32_t      colorHottileEnableMask = state.colorHottileEnable;
    while (_BitScanForward(&rtSlot, colorHottileEnableMask))
    {
        // NOTE(review): interior GetHotTile argument lines were lost in
        // extraction; reconstructed to match the depth/stencil calls below -
        // confirm argument order against HotTileMgr::GetHotTile.
        HOTTILE* pColor = pContext->pHotTileMgr->GetHotTile(
            pContext,
            pDC,
            hWorkerPrivateData,
            macroID,
            (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot),
            true,
            numSamples,
            renderTargetArrayIndex);
        renderBuffers.pColor[rtSlot]        = pColor->pBuffer + offset;
        renderBuffers.pColorHotTile[rtSlot] = pColor;

        colorHottileEnableMask &= ~(1 << rtSlot);
    }
    if (state.depthHottileEnable)
    {
        const uint32_t pitch =
            KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
        uint32_t offset = ComputeTileOffset2D<
            TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp>>(
            pitch, tileX, tileY);
        offset *= numSamples;
        HOTTILE* pDepth = pContext->pHotTileMgr->GetHotTile(pContext,
                                                            pDC,
                                                            hWorkerPrivateData,
                                                            macroID,
                                                            SWR_ATTACHMENT_DEPTH,
                                                            true,
                                                            numSamples,
                                                            renderTargetArrayIndex);
        pDepth->state = HOTTILE_DIRTY;
        SWR_ASSERT(pDepth->pBuffer != nullptr);
        renderBuffers.pDepth        = pDepth->pBuffer + offset;
        renderBuffers.pDepthHotTile = pDepth;
    }
    if (state.stencilHottileEnable)
    {
        const uint32_t pitch =
            KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
        uint32_t offset = ComputeTileOffset2D<
            TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp>>(
            pitch, tileX, tileY);
        offset *= numSamples;
        HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext,
                                                              pDC,
                                                              hWorkerPrivateData,
                                                              macroID,
                                                              SWR_ATTACHMENT_STENCIL,
                                                              true,
                                                              numSamples,
                                                              renderTargetArrayIndex);
        pStencil->state = HOTTILE_DIRTY;
        SWR_ASSERT(pStencil->pBuffer != nullptr);
        renderBuffers.pStencil        = pStencil->pBuffer + offset;
        renderBuffers.pStencilHotTile = pStencil;
    }
}
//////////////////////////////////////////////////////////////////////////
/// @brief Advances the render-output buffer pointers one raster tile in X.
/// @param colorHotTileMask - bitmask of color render targets with an active
///        hot tile
/// @param buffers - buffer pointers advanced in place by the per-RT /
///        depth / stencil raster-tile step constants from RT
template <typename RT>
INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers)
{
    // NOTE(review): the declaration of 'rt' was lost in extraction;
    // reconstructed to match the _BitScanForward usage in GetRenderHotTiles.
    unsigned long rt = 0;
    while (_BitScanForward(&rt, colorHotTileMask))
    {
        colorHotTileMask &= ~(1 << rt);
        buffers.pColor[rt] += RT::colorRasterTileStep;
    }

    buffers.pDepth += RT::depthRasterTileStep;
    buffers.pStencil += RT::stencilRasterTileStep;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Advances the render-output buffer pointers one raster tile row in
///        Y: steps the saved start-of-row pointers down a row, then resets
///        the working pointers to the new row start.
/// @param colorHotTileMask - bitmask of color render targets with an active
///        hot tile
/// @param buffers - working buffer pointers, reset to the new row start
/// @param startBufferRow - start-of-row pointers, advanced in place by the
///        raster-tile row step constants from RT
template <typename RT>
INLINE void StepRasterTileY(uint32_t             colorHotTileMask,
                            RenderOutputBuffers& buffers,
                            RenderOutputBuffers& startBufferRow)
{
    // NOTE(review): the declaration of 'rt' was lost in extraction;
    // reconstructed to match the _BitScanForward usage in GetRenderHotTiles.
    unsigned long rt = 0;
    while (_BitScanForward(&rt, colorHotTileMask))
    {
        colorHotTileMask &= ~(1 << rt);
        startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
        buffers.pColor[rt] = startBufferRow.pColor[rt];
    }

    startBufferRow.pDepth += RT::depthRasterTileRowStep;
    buffers.pDepth = startBufferRow.pDepth;

    startBufferRow.pStencil += RT::stencilRasterTileRowStep;
    buffers.pStencil = startBufferRow.pStencil;
}