1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * @file rasterizer.cpp
25 * @brief Implementation for the rasterizer.
27 ******************************************************************************/
#include <cstring>

#include "rasterizer.h"
#include "rdtsc_core.h"
#include "memory/tilingtraits.h"
40 extern PFN_WORK_FUNC gRasterizerFuncs
[SWR_MULTISAMPLE_TYPE_COUNT
][2][2][SWR_INPUT_COVERAGE_COUNT
][STATE_VALID_TRI_EDGE_COUNT
][2];
42 template <uint32_t numSamples
= 1>
43 void GetRenderHotTiles(DRAW_CONTEXT
*pDC
, uint32_t macroID
, uint32_t x
, uint32_t y
, RenderOutputBuffers
&renderBuffers
, uint32_t renderTargetArrayIndex
);
44 template <typename RT
>
45 void StepRasterTileX(uint32_t colorHotTileMask
, RenderOutputBuffers
&buffers
);
46 template <typename RT
>
47 void StepRasterTileY(uint32_t colorHotTileMask
, RenderOutputBuffers
&buffers
, RenderOutputBuffers
&startBufferRow
);
49 #define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
50 static const __m256d gMaskToVecpd
[] =
52 MASKTOVEC(0, 0, 0, 0),
53 MASKTOVEC(0, 0, 0, 1),
54 MASKTOVEC(0, 0, 1, 0),
55 MASKTOVEC(0, 0, 1, 1),
56 MASKTOVEC(0, 1, 0, 0),
57 MASKTOVEC(0, 1, 0, 1),
58 MASKTOVEC(0, 1, 1, 0),
59 MASKTOVEC(0, 1, 1, 1),
60 MASKTOVEC(1, 0, 0, 0),
61 MASKTOVEC(1, 0, 0, 1),
62 MASKTOVEC(1, 0, 1, 0),
63 MASKTOVEC(1, 0, 1, 1),
64 MASKTOVEC(1, 1, 0, 0),
65 MASKTOVEC(1, 1, 0, 1),
66 MASKTOVEC(1, 1, 1, 0),
67 MASKTOVEC(1, 1, 1, 1),
77 double a
, b
; // a, b edge coefficients in fix8
78 double stepQuadX
; // step to adjacent horizontal quad in fix16
79 double stepQuadY
; // step to adjacent vertical quad in fix16
80 double stepRasterTileX
; // step to adjacent horizontal raster tile in fix16
81 double stepRasterTileY
; // step to adjacent vertical raster tile in fix16
83 __m256d vQuadOffsets
; // offsets for 4 samples of a quad
84 __m256d vRasterTileOffsets
; // offsets for the 4 corners of a raster tile
87 //////////////////////////////////////////////////////////////////////////
88 /// @brief rasterize a raster tile partially covered by the triangle
89 /// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster tile
90 /// @param vA, vB - A & B coefs for each edge of the triangle (Ax + By + C)
91 /// @param vStepQuad0-2 - edge equations evaluated at the UL corners of the 2x2 pixel quad.
92 /// Used to step between quads when sweeping over the raster tile.
/// @tparam NumEdges - number of edge equations evaluated for every quad
/// @tparam EdgeMaskT - edge validity trait; its ::value is forwarded to UnrollerLMask and the
///         type is compared against E1E2ValidT / NoEdgesValidT when the mask is seeded
/// @param pDC - draw context (not referenced in the lines visible here)
/// @param startEdges - one scalar start value per edge; broadcast and offset by vQuadOffsets
/// @param pRastEdges - per-edge constants (vQuadOffsets, stepQuadX, stepQuadY are read)
/// NOTE(review): the @param names vEdge0-2, vA, vB and vStepQuad0-2 above do not match this
/// signature - presumably left over from an earlier version; confirm and update.
/// NOTE(review): presumably returns coverageMask, built from one 4-bit mask per 2x2 quad;
/// the return statement is not visible in this excerpt - confirm.
93 template<uint32_t NumEdges
, typename EdgeMaskT
>
94 INLINE
uint64_t rasterizePartialTile(DRAW_CONTEXT
*pDC
, double startEdges
[NumEdges
], EDGE
*pRastEdges
)
// accumulates the per-quad masks of the whole tile
96 uint64_t coverageMask
= 0;
// per-edge state: current values at the 4 samples of the quad, and the x / y quad steps
98 __m256d vEdges
[NumEdges
];
99 __m256d vStepX
[NumEdges
];
100 __m256d vStepY
[NumEdges
];
102 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
104 // Step to the pixel sample locations of the 1st quad
105 vEdges
[e
] = _mm256_add_pd(_mm256_set1_pd(startEdges
[e
]), pRastEdges
[e
].vQuadOffsets
);
107 // compute step to next quad (mul by 2 in x and y direction)
108 vStepX
[e
] = _mm256_set1_pd(pRastEdges
[e
].stepQuadX
);
109 vStepY
[e
] = _mm256_set1_pd(pRastEdges
[e
].stepQuadY
);
112 // fast unrolled version for 8x8 tile
113 #if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
114 int edgeMask
[NumEdges
];
// NOTE(review): the declaration of `mask` used by the lambdas and by UPDATE_MASK is not
// visible in this excerpt.
// Per-edge helpers, each run for the selected edges through UnrollerLMask:
//  eval   - gather the sign bits of the 4 lanes of an edge into edgeMask[e]
//  update - AND edgeMask[e] into the running mask
//  incx / incy / decx - move the edge values by one quad step in +x / +y / -x
117 auto eval_lambda
= [&](int e
){edgeMask
[e
] = _mm256_movemask_pd(vEdges
[e
]);};
118 auto update_lambda
= [&](int e
){mask
&= edgeMask
[e
];};
119 auto incx_lambda
= [&](int e
){vEdges
[e
] = _mm256_add_pd(vEdges
[e
], vStepX
[e
]);};
120 auto incy_lambda
= [&](int e
){vEdges
[e
] = _mm256_add_pd(vEdges
[e
], vStepY
[e
]);};
121 auto decx_lambda
= [&](int e
){vEdges
[e
] = _mm256_sub_pd(vEdges
[e
], vStepX
[e
]);};
123 // evaluate which pixels in the quad are covered
125 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
// UPDATE_MASK(bit): combine the edge masks of the current quad and OR the result into
// coverageMask at bit offset `bit`
127 // update coverage mask
128 // if edge 0 is degenerate and will be skipped; init the mask
129 #define UPDATE_MASK(bit) \
130 if(std::is_same<EdgeMaskT, E1E2ValidT>::value || std::is_same<EdgeMaskT, NoEdgesValidT>::value){\
134 mask = edgeMask[0]; \
136 UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
137 coverageMask |= (mask << bit);
139 // step in the +x direction to the next quad
141 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
143 // step in the +y direction to the next quad
145 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
147 // step in the -x direction to the next quad
149 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
151 // sweep 2x2 quad back and forth through the raster tile,
152 // computing coverage masks for the entire tile
157 // x x ------------------>
159 // <-----------------x x V
// Loop-based sweep: KNOB_TILE_Y_DIM/2 rows of KNOB_TILE_X_DIM/2 quads each.
// NOTE(review): the declaration and update of `bit` are not visible in this excerpt.
218 for (uint32_t y
= 0; y
< KNOB_TILE_Y_DIM
/2; ++y
)
// remember the edge values at the start of the row; the next row is stepped from them
220 __m256d vStartOfRowEdge
[NumEdges
];
221 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
223 vStartOfRowEdge
[e
] = vEdges
[e
];
226 for (uint32_t x
= 0; x
< KNOB_TILE_X_DIM
/2; ++x
)
// sign bits of the 4 lanes of every edge for the current quad
228 int edgeMask
[NumEdges
];
229 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
231 edgeMask
[e
] = _mm256_movemask_pd(vEdges
[e
]);
// start from edge 0 and fold in the remaining edges
// NOTE(review): the body of the following loop is not visible here; presumably it ANDs
// edgeMask[e] into mask, as update_lambda does - confirm.
234 uint64_t mask
= edgeMask
[0];
235 for (uint32_t e
= 1; e
< NumEdges
; ++e
)
239 coverageMask
|= (mask
<< bit
);
241 // step to the next pixel in the x
242 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
244 vEdges
[e
] = _mm256_add_pd(vEdges
[e
], vStepX
[e
]);
249 // step to the next row
250 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
252 vEdges
[e
] = _mm256_add_pd(vStartOfRowEdge
[e
], vStepY
[e
]);
260 // Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
261 // Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge
262 // Top left: a sample is in if it is a top or left edge.
263 // Out: !(horizontal && above) = !horizontal && below
264 // Out: !horizontal && left = !(!horizontal && left) = horizontal and right
265 INLINE
void adjustTopLeftRuleIntFix16(const __m128i vA
, const __m128i vB
, __m256d
&vEdge
)
268 // if vA == 0 && vB < 0, vC--
270 __m256d vEdgeOut
= vEdge
;
271 __m256d vEdgeAdjust
= _mm256_sub_pd(vEdge
, _mm256_set1_pd(1.0));
273 // if vA < 0 (line is not horizontal and below)
274 int msk
= _mm_movemask_ps(_mm_castsi128_ps(vA
));
276 // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
277 __m128i vCmp
= _mm_cmpeq_epi32(vA
, _mm_setzero_si128());
278 int msk2
= _mm_movemask_ps(_mm_castsi128_ps(vCmp
));
279 msk2
&= _mm_movemask_ps(_mm_castsi128_ps(vB
));
281 // if either of these are true and we're on the line (edge == 0), bump it outside the line
282 vEdge
= _mm256_blendv_pd(vEdgeOut
, vEdgeAdjust
, gMaskToVecpd
[msk
| msk2
]);
//////////////////////////////////////////////////////////////////////////
/// @brief Number of fractional bits by which the result of the manhattan
///        distance calculation exceeds the edge precision, derived from
///        compile time trait values.
/// @tparam RT - rasterizer traits providing PrecisionT,
///         ConservativePrecisionT and EdgePrecisionT
/// @return (PrecisionT bits + ConservativePrecisionT bits) - EdgePrecisionT bits
template<typename RT>
constexpr int64_t ManhToEdgePrecisionAdjust()
{
    typedef typename RT::PrecisionT::BitsT             RasterBitsT;
    typedef typename RT::ConservativePrecisionT::BitsT ConservativeBitsT;
    typedef typename RT::EdgePrecisionT::BitsT         EdgeBitsT;

    // the product precision must not be coarser than the edge precision
    static_assert(RasterBitsT::value + ConservativeBitsT::value >= EdgeBitsT::value,
                  "Inadequate precision of result of manh calculation ");

    return ((RasterBitsT::value + ConservativeBitsT::value) - EdgeBitsT::value);
}
296 //////////////////////////////////////////////////////////////////////////
297 /// @struct adjustEdgeConservative
298 /// @brief Primary template definition used for partially specializing
299 /// the adjustEdgeConservative function. This struct should never
301 /// @tparam RT: rasterizer traits
302 /// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
303 template <typename RT
, typename ConservativeEdgeOffsetT
>
304 struct adjustEdgeConservative
306 //////////////////////////////////////////////////////////////////////////
307 /// @brief Performs calculations to adjust each edge of a triangle away
308 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
311 /// Uncertainty regions arise from fixed point rounding, which
312 /// can snap a vertex +/- by min fixed point value.
313 /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
314 /// This allows the rasterizer to test for coverage only at the pixel center,
315 /// instead of having to test individual pixel corners for conservative coverage
316 INLINE
adjustEdgeConservative(const __m128i
&vAi
, const __m128i
&vBi
, __m256d
&vEdge
)
318 // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away
319 // from the pixel center (in the direction of the edge normal A/B)
321 // edge = Ax + Bx + C - (manh/e)
322 // manh = manhattan distance = abs(A) + abs(B)
323 // e = absolute rounding error from snapping from float to fixed point precision
325 // 'fixed point' multiply (in double to be avx1 friendly)
326 // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
327 __m256d vAai
= _mm256_cvtepi32_pd(_mm_abs_epi32(vAi
)), vBai
= _mm256_cvtepi32_pd(_mm_abs_epi32(vBi
));
328 __m256d manh
= _mm256_add_pd(_mm256_mul_pd(vAai
, _mm256_set1_pd(ConservativeEdgeOffsetT::value
)),
329 _mm256_mul_pd(vBai
, _mm256_set1_pd(ConservativeEdgeOffsetT::value
)));
331 static_assert(RT::PrecisionT::BitsT::value
+ RT::ConservativePrecisionT::BitsT::value
>= RT::EdgePrecisionT::BitsT::value
,
332 "Inadequate precision of result of manh calculation ");
334 // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision
335 // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right
336 manh
= _mm256_mul_pd(manh
, _mm256_set1_pd(ManhToEdgePrecisionAdjust
<RT
>() * 0.5));
338 // move the edge away from the pixel center by the required conservative precision + 1/2 pixel
339 // this allows the rasterizer to do a single conservative coverage test to see if the primitive
340 // intersects the pixel at all
341 vEdge
= _mm256_sub_pd(vEdge
, manh
);
345 //////////////////////////////////////////////////////////////////////////
346 /// @brief adjustEdgeConservative specialization where no edge offset is needed
347 template <typename RT
>
348 struct adjustEdgeConservative
<RT
, std::integral_constant
<int32_t, 0>>
350 INLINE
adjustEdgeConservative(const __m128i
&vAi
, const __m128i
&vBi
, __m256d
&vEdge
) {};
353 //////////////////////////////////////////////////////////////////////////
354 /// @brief calculates the distance a degenerate BBox needs to be adjusted
355 /// for conservative rast based on compile time trait values
356 template<typename RT
>
357 constexpr int64_t ConservativeScissorOffset()
359 static_assert(RT::ConservativePrecisionT::BitsT::value
- RT::PrecisionT::BitsT::value
>= 0, "Rasterizer precision > conservative precision");
360 // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges
361 typedef std::integral_constant
<int32_t, (RT::ValidEdgeMaskT::value
== ALL_EDGES_VALID
) ? 0 : 1> DegenerateEdgeOffsetT
;
362 // 1/2 pixel edge offset + conservative offset - degenerateTriangle
363 return RT::ConservativeEdgeOffsetT::value
- (DegenerateEdgeOffsetT::value
<< (RT::ConservativePrecisionT::BitsT::value
- RT::PrecisionT::BitsT::value
));
366 //////////////////////////////////////////////////////////////////////////
367 /// @brief Performs calculations to adjust each a vector of evaluated edges out
368 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
370 template <typename RT
>
371 INLINE
void adjustScissorEdge(const double a
, const double b
, __m256d
&vEdge
)
373 int64_t aabs
= std::abs(static_cast<int64_t>(a
)), babs
= std::abs(static_cast<int64_t>(b
));
374 int64_t manh
= ((aabs
* ConservativeScissorOffset
<RT
>()) + (babs
* ConservativeScissorOffset
<RT
>())) >> ManhToEdgePrecisionAdjust
<RT
>();
375 vEdge
= _mm256_sub_pd(vEdge
, _mm256_set1_pd(manh
));
378 //////////////////////////////////////////////////////////////////////////
379 /// @brief Performs calculations to adjust each a scalar evaluated edge out
380 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
382 template <typename RT
, typename OffsetT
>
383 INLINE
double adjustScalarEdge(const double a
, const double b
, const double Edge
)
385 int64_t aabs
= std::abs(static_cast<int64_t>(a
)), babs
= std::abs(static_cast<int64_t>(b
));
386 int64_t manh
= ((aabs
* OffsetT::value
) + (babs
* OffsetT::value
)) >> ManhToEdgePrecisionAdjust
<RT
>();
387 return (Edge
- manh
);
390 //////////////////////////////////////////////////////////////////////////
391 /// @brief Perform any needed adjustments to evaluated triangle edges
392 template <typename RT
, typename EdgeOffsetT
>
393 struct adjustEdgesFix16
395 INLINE
adjustEdgesFix16(const __m128i
&vAi
, const __m128i
&vBi
, __m256d
&vEdge
)
397 static_assert(std::is_same
<typename
RT::EdgePrecisionT
, FixedPointTraits
<Fixed_X_16
>>::value
,
398 "Edge equation expected to be in x.16 fixed point");
400 static_assert(RT::IsConservativeT::value
, "Edge offset assumes conservative rasterization is enabled");
402 // need to apply any edge offsets before applying the top-left rule
403 adjustEdgeConservative
<RT
, EdgeOffsetT
>(vAi
, vBi
, vEdge
);
405 adjustTopLeftRuleIntFix16(vAi
, vBi
, vEdge
);
409 //////////////////////////////////////////////////////////////////////////
410 /// @brief Perform top left adjustments to evaluated triangle edges
411 template <typename RT
>
412 struct adjustEdgesFix16
<RT
, std::integral_constant
<int32_t, 0>>
414 INLINE
adjustEdgesFix16(const __m128i
&vAi
, const __m128i
&vBi
, __m256d
&vEdge
)
416 adjustTopLeftRuleIntFix16(vAi
, vBi
, vEdge
);
420 // max(abs(dz/dx), abs(dz,dy)
421 INLINE
float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC
* pDesc
)
424 // evaluate i,j at (0,0)
425 float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
426 float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
428 // evaluate i,j at (1,0)
429 float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
430 float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
433 float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
434 float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
435 float dzdx = abs(d10 - d00);
437 // evaluate i,j at (0,1)
438 float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
439 float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
441 float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
442 float dzdy = abs(d01 - d00);
445 // optimized version of above
446 float dzdx
= fabsf(pDesc
->recipDet
* (pDesc
->Z
[0] * pDesc
->I
[0] + pDesc
->Z
[1] * pDesc
->J
[0]));
447 float dzdy
= fabsf(pDesc
->recipDet
* (pDesc
->Z
[0] * pDesc
->I
[1] + pDesc
->Z
[1] * pDesc
->J
[1]));
449 return std::max(dzdx
, dzdy
);
452 INLINE
float ComputeBiasFactor(const SWR_RASTSTATE
* pState
, const SWR_TRIANGLE_DESC
* pDesc
, const float* z
)
454 if (pState
->depthFormat
== R24_UNORM_X8_TYPELESS
)
456 return (1.0f
/ (1 << 24));
458 else if (pState
->depthFormat
== R16_UNORM
)
460 return (1.0f
/ (1 << 16));
464 SWR_ASSERT(pState
->depthFormat
== R32_FLOAT
);
466 // for f32 depth, factor = 2^(exponent(max(abs(z) - 23)
467 float zMax
= std::max(fabsf(z
[0]), std::max(fabsf(z
[1]), fabsf(z
[2])));
468 uint32_t zMaxInt
= *(uint32_t*)&zMax
;
469 zMaxInt
&= 0x7f800000;
470 zMax
= *(float*)&zMaxInt
;
472 return zMax
* (1.0f
/ (1 << 23));
476 INLINE
float ComputeDepthBias(const SWR_RASTSTATE
* pState
, const SWR_TRIANGLE_DESC
* pTri
, const float* z
)
478 if (pState
->depthBias
== 0 && pState
->slopeScaledDepthBias
== 0)
483 float scale
= pState
->slopeScaledDepthBias
;
486 scale
*= ComputeMaxDepthSlope(pTri
);
489 float bias
= pState
->depthBias
;
490 if (!pState
->depthBiasPreAdjusted
)
492 bias
*= ComputeBiasFactor(pState
, pTri
, z
);
496 if (pState
->depthBiasClamp
> 0.0f
)
498 bias
= std::min(bias
, pState
->depthBiasClamp
);
500 else if (pState
->depthBiasClamp
< 0.0f
)
502 bias
= std::max(bias
, pState
->depthBiasClamp
);
508 // Prevent DCE by writing coverage mask from rasterizer to volatile
509 #if KNOB_ENABLE_TOSS_POINTS
510 __declspec(thread
) volatile uint64_t gToss
;
513 static const uint32_t vertsPerTri
= 3, componentsPerAttrib
= 4;
514 // try to avoid _chkstk insertions; make this thread local
515 static THREAD
OSALIGNLINE(float) perspAttribsTLS
[vertsPerTri
* SWR_VTX_NUM_SLOTS
* componentsPerAttrib
];
518 void ComputeEdgeData(int32_t a
, int32_t b
, EDGE
& edge
)
523 // compute constant steps to adjacent quads
524 edge
.stepQuadX
= (double)((int64_t)a
* (int64_t)(2 * FIXED_POINT_SCALE
));
525 edge
.stepQuadY
= (double)((int64_t)b
* (int64_t)(2 * FIXED_POINT_SCALE
));
527 // compute constant steps to adjacent raster tiles
528 edge
.stepRasterTileX
= (double)((int64_t)a
* (int64_t)(KNOB_TILE_X_DIM
* FIXED_POINT_SCALE
));
529 edge
.stepRasterTileY
= (double)((int64_t)b
* (int64_t)(KNOB_TILE_Y_DIM
* FIXED_POINT_SCALE
));
531 // compute quad offsets
532 const __m256d vQuadOffsetsXIntFix8
= _mm256_set_pd(FIXED_POINT_SCALE
, 0, FIXED_POINT_SCALE
, 0);
533 const __m256d vQuadOffsetsYIntFix8
= _mm256_set_pd(FIXED_POINT_SCALE
, FIXED_POINT_SCALE
, 0, 0);
535 __m256d vQuadStepXFix16
= _mm256_mul_pd(_mm256_set1_pd(edge
.a
), vQuadOffsetsXIntFix8
);
536 __m256d vQuadStepYFix16
= _mm256_mul_pd(_mm256_set1_pd(edge
.b
), vQuadOffsetsYIntFix8
);
537 edge
.vQuadOffsets
= _mm256_add_pd(vQuadStepXFix16
, vQuadStepYFix16
);
539 // compute raster tile offsets
540 const __m256d vTileOffsetsXIntFix8
= _mm256_set_pd((KNOB_TILE_X_DIM
- 1)*FIXED_POINT_SCALE
, 0, (KNOB_TILE_X_DIM
- 1)*FIXED_POINT_SCALE
, 0);
541 const __m256d vTileOffsetsYIntFix8
= _mm256_set_pd((KNOB_TILE_Y_DIM
- 1)*FIXED_POINT_SCALE
, (KNOB_TILE_Y_DIM
- 1)*FIXED_POINT_SCALE
, 0, 0);
543 __m256d vTileStepXFix16
= _mm256_mul_pd(_mm256_set1_pd(edge
.a
), vTileOffsetsXIntFix8
);
544 __m256d vTileStepYFix16
= _mm256_mul_pd(_mm256_set1_pd(edge
.b
), vTileOffsetsYIntFix8
);
545 edge
.vRasterTileOffsets
= _mm256_add_pd(vTileStepXFix16
, vTileStepYFix16
);
549 void ComputeEdgeData(const POS
& p0
, const POS
& p1
, EDGE
& edge
)
551 ComputeEdgeData(p0
.y
- p1
.y
, p1
.x
- p0
.x
, edge
);
554 //////////////////////////////////////////////////////////////////////////
555 /// @brief Primary template definition used for partially specializing
556 /// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel
557 /// corner to sample position, and test for coverage
558 /// @tparam sampleCount: multisample count
559 template <typename NumSamplesT
>
560 INLINE
void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox
)[3], const __m256d
* vEdgeFix16
,
561 int32_t &mask0
, int32_t &mask1
, int32_t &mask2
)
563 __m256d vSampleBboxTest0
, vSampleBboxTest1
, vSampleBboxTest2
;
564 // evaluate edge equations at the tile multisample bounding box
565 vSampleBboxTest0
= _mm256_add_pd(vEdgeTileBbox
[0], vEdgeFix16
[0]);
566 vSampleBboxTest1
= _mm256_add_pd(vEdgeTileBbox
[1], vEdgeFix16
[1]);
567 vSampleBboxTest2
= _mm256_add_pd(vEdgeTileBbox
[2], vEdgeFix16
[2]);
568 mask0
= _mm256_movemask_pd(vSampleBboxTest0
);
569 mask1
= _mm256_movemask_pd(vSampleBboxTest1
);
570 mask2
= _mm256_movemask_pd(vSampleBboxTest2
);
573 //////////////////////////////////////////////////////////////////////////
574 /// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
575 /// when only rasterizing a single coverage test point
577 INLINE
void UpdateEdgeMasks
<SingleSampleT
>(const __m256d(&)[3], const __m256d
* vEdgeFix16
,
578 int32_t &mask0
, int32_t &mask1
, int32_t &mask2
)
580 mask0
= _mm256_movemask_pd(vEdgeFix16
[0]);
581 mask1
= _mm256_movemask_pd(vEdgeFix16
[1]);
582 mask2
= _mm256_movemask_pd(vEdgeFix16
[2]);
585 //////////////////////////////////////////////////////////////////////////
586 /// @struct ComputeScissorEdges
587 /// @brief Primary template definition. Allows the function to be generically
588 /// called. When paired with below specializations, will result in an empty
589 /// inlined function if scissor is not enabled
590 /// @tparam RasterScissorEdgesT: is scissor enabled?
591 /// @tparam IsConservativeT: is conservative rast enabled?
592 /// @tparam RT: rasterizer traits
593 template <typename RasterScissorEdgesT
, typename IsConservativeT
, typename RT
>
594 struct ComputeScissorEdges
596 INLINE
ComputeScissorEdges(const SWR_RECT
&triBBox
, const SWR_RECT
&scissorBBox
, const int32_t x
, const int32_t y
,
597 EDGE (&rastEdges
)[RT::NumEdgesT::value
], __m256d (&vEdgeFix16
)[7]){};
600 //////////////////////////////////////////////////////////////////////////
601 /// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
602 /// specialization. Instantiated when conservative rast and scissor are enabled
603 template <typename RT
>
604 struct ComputeScissorEdges
<std::true_type
, std::true_type
, RT
>
606 //////////////////////////////////////////////////////////////////////////
607 /// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
608 /// evaluate edge equations and offset them away from pixel center.
609 INLINE
ComputeScissorEdges(const SWR_RECT
&triBBox
, const SWR_RECT
&scissorBBox
, const int32_t x
, const int32_t y
,
610 EDGE (&rastEdges
)[RT::NumEdgesT::value
], __m256d (&vEdgeFix16
)[7])
612 // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
614 scissor
.xmin
= std::max(triBBox
.xmin
, scissorBBox
.xmin
);
615 scissor
.xmax
= std::min(triBBox
.xmax
, scissorBBox
.xmax
);
616 scissor
.ymin
= std::max(triBBox
.ymin
, scissorBBox
.ymin
);
617 scissor
.ymax
= std::min(triBBox
.ymax
, scissorBBox
.ymax
);
619 POS topLeft
{scissor
.xmin
, scissor
.ymin
};
620 POS bottomLeft
{scissor
.xmin
, scissor
.ymax
};
621 POS topRight
{scissor
.xmax
, scissor
.ymin
};
622 POS bottomRight
{scissor
.xmax
, scissor
.ymax
};
624 // construct 4 scissor edges in ccw direction
625 ComputeEdgeData(topLeft
, bottomLeft
, rastEdges
[3]);
626 ComputeEdgeData(bottomLeft
, bottomRight
, rastEdges
[4]);
627 ComputeEdgeData(bottomRight
, topRight
, rastEdges
[5]);
628 ComputeEdgeData(topRight
, topLeft
, rastEdges
[6]);
630 vEdgeFix16
[3] = _mm256_set1_pd((rastEdges
[3].a
* (x
- scissor
.xmin
)) + (rastEdges
[3].b
* (y
- scissor
.ymin
)));
631 vEdgeFix16
[4] = _mm256_set1_pd((rastEdges
[4].a
* (x
- scissor
.xmin
)) + (rastEdges
[4].b
* (y
- scissor
.ymax
)));
632 vEdgeFix16
[5] = _mm256_set1_pd((rastEdges
[5].a
* (x
- scissor
.xmax
)) + (rastEdges
[5].b
* (y
- scissor
.ymax
)));
633 vEdgeFix16
[6] = _mm256_set1_pd((rastEdges
[6].a
* (x
- scissor
.xmax
)) + (rastEdges
[6].b
* (y
- scissor
.ymin
)));
635 // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing
636 adjustScissorEdge
<RT
>(rastEdges
[3].a
, rastEdges
[3].b
, vEdgeFix16
[3]);
637 adjustScissorEdge
<RT
>(rastEdges
[4].a
, rastEdges
[4].b
, vEdgeFix16
[4]);
638 adjustScissorEdge
<RT
>(rastEdges
[5].a
, rastEdges
[5].b
, vEdgeFix16
[5]);
639 adjustScissorEdge
<RT
>(rastEdges
[6].a
, rastEdges
[6].b
, vEdgeFix16
[6]);
641 // Upper left rule for scissor
642 vEdgeFix16
[3] = _mm256_sub_pd(vEdgeFix16
[3], _mm256_set1_pd(1.0));
643 vEdgeFix16
[6] = _mm256_sub_pd(vEdgeFix16
[6], _mm256_set1_pd(1.0));
647 //////////////////////////////////////////////////////////////////////////
648 /// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
649 /// specialization. Instantiated when scissor is enabled and conservative rast
651 template <typename RT
>
652 struct ComputeScissorEdges
<std::true_type
, std::false_type
, RT
>
654 //////////////////////////////////////////////////////////////////////////
655 /// @brief Compute scissor edge vectors and evaluate edge equations
656 INLINE
ComputeScissorEdges(const SWR_RECT
&, const SWR_RECT
&scissorBBox
, const int32_t x
, const int32_t y
,
657 EDGE (&rastEdges
)[RT::NumEdgesT::value
], __m256d (&vEdgeFix16
)[7])
659 const SWR_RECT
&scissor
= scissorBBox
;
660 POS topLeft
{scissor
.xmin
, scissor
.ymin
};
661 POS bottomLeft
{scissor
.xmin
, scissor
.ymax
};
662 POS topRight
{scissor
.xmax
, scissor
.ymin
};
663 POS bottomRight
{scissor
.xmax
, scissor
.ymax
};
665 // construct 4 scissor edges in ccw direction
666 ComputeEdgeData(topLeft
, bottomLeft
, rastEdges
[3]);
667 ComputeEdgeData(bottomLeft
, bottomRight
, rastEdges
[4]);
668 ComputeEdgeData(bottomRight
, topRight
, rastEdges
[5]);
669 ComputeEdgeData(topRight
, topLeft
, rastEdges
[6]);
671 vEdgeFix16
[3] = _mm256_set1_pd((rastEdges
[3].a
* (x
- scissor
.xmin
)) + (rastEdges
[3].b
* (y
- scissor
.ymin
)));
672 vEdgeFix16
[4] = _mm256_set1_pd((rastEdges
[4].a
* (x
- scissor
.xmin
)) + (rastEdges
[4].b
* (y
- scissor
.ymax
)));
673 vEdgeFix16
[5] = _mm256_set1_pd((rastEdges
[5].a
* (x
- scissor
.xmax
)) + (rastEdges
[5].b
* (y
- scissor
.ymax
)));
674 vEdgeFix16
[6] = _mm256_set1_pd((rastEdges
[6].a
* (x
- scissor
.xmax
)) + (rastEdges
[6].b
* (y
- scissor
.ymin
)));
676 // Upper left rule for scissor
677 vEdgeFix16
[3] = _mm256_sub_pd(vEdgeFix16
[3], _mm256_set1_pd(1.0));
678 vEdgeFix16
[6] = _mm256_sub_pd(vEdgeFix16
[6], _mm256_set1_pd(1.0));
682 //////////////////////////////////////////////////////////////////////////
683 /// @brief Primary function template for TrivialRejectTest. Should
684 /// never be called, but TemplateUnroller instantiates a few unused values,
685 /// so it calls a runtime assert instead of a static_assert.
686 template <typename ValidEdgeMaskT
>
687 INLINE
bool TrivialRejectTest(const int, const int, const int)
689 SWR_INVALID("Primary templated function should never be called");
693 //////////////////////////////////////////////////////////////////////////
694 /// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
695 /// and edge 1 for trivial coverage reject
697 INLINE
bool TrivialRejectTest
<E0E1ValidT
>(const int mask0
, const int mask1
, const int)
699 return (!(mask0
&& mask1
)) ? true : false;
702 //////////////////////////////////////////////////////////////////////////
703 /// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
704 /// and edge 2 for trivial coverage reject
706 INLINE
bool TrivialRejectTest
<E0E2ValidT
>(const int mask0
, const int, const int mask2
)
708 return (!(mask0
&& mask2
)) ? true : false;
711 //////////////////////////////////////////////////////////////////////////
712 /// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
713 /// and edge 2 for trivial coverage reject
715 INLINE
bool TrivialRejectTest
<E1E2ValidT
>(const int, const int mask1
, const int mask2
)
717 return (!(mask1
&& mask2
)) ? true : false;
720 //////////////////////////////////////////////////////////////////////////
721 /// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
722 /// primitive edges for trivial coverage reject
724 INLINE
bool TrivialRejectTest
<AllEdgesValidT
>(const int mask0
, const int mask1
, const int mask2
)
726 return (!(mask0
&& mask1
&& mask2
)) ? true : false;;
729 //////////////////////////////////////////////////////////////////////////
730 /// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
731 /// point, so return false and rasterize against conservative BBox
733 INLINE
bool TrivialRejectTest
<NoEdgesValidT
>(const int, const int, const int)
738 //////////////////////////////////////////////////////////////////////////
739 /// @brief Primary function template for TrivialAcceptTest. Always returns
740 /// false, since it will only be called for degenerate tris, and as such
741 /// will never cover the entire raster tile
742 template <typename ScissorEnableT
>
743 INLINE
bool TrivialAcceptTest(const int, const int, const int)
748 //////////////////////////////////////////////////////////////////////////
749 /// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all
750 /// edge masks for a fully covered raster tile
752 INLINE
bool TrivialAcceptTest
<std::false_type
>(const int mask0
, const int mask1
, const int mask2
)
754 return ((mask0
& mask1
& mask2
) == 0xf);
757 //////////////////////////////////////////////////////////////////////////
758 /// @brief Primary function template for GenerateSVInnerCoverage. Results
759 /// in an empty function call if SVInnerCoverage isn't requested
760 template <typename RT
, typename ValidEdgeMaskT
, typename InputCoverageT
>
761 struct GenerateSVInnerCoverage
763 INLINE
GenerateSVInnerCoverage(DRAW_CONTEXT
*, uint32_t, EDGE
*, double*, uint64_t &){};
766 //////////////////////////////////////////////////////////////////////////
767 /// @brief Specialization of GenerateSVInnerCoverage where all edges
768 /// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated
769 /// edge values from OuterConservative to InnerConservative and rasterizes.
770 template <typename RT
>
771 struct GenerateSVInnerCoverage
<RT
, AllEdgesValidT
, InnerConservativeCoverageT
>
773 INLINE
GenerateSVInnerCoverage(DRAW_CONTEXT
* pDC
, uint32_t workerId
, EDGE
* pRastEdges
, double* pStartQuadEdges
, uint64_t &innerCoverageMask
)
775 SWR_CONTEXT
*pContext
= pDC
->pContext
;
777 double startQuadEdgesAdj
[RT::NumEdgesT::value
];
778 for(uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
780 startQuadEdgesAdj
[e
] = adjustScalarEdge
<RT
, typename
RT::InnerConservativeEdgeOffsetT
>(pRastEdges
[e
].a
, pRastEdges
[e
].b
, pStartQuadEdges
[e
]);
783 // not trivial accept or reject, must rasterize full tile
784 AR_BEGIN(BERasterizePartial
, pDC
->drawId
);
785 innerCoverageMask
= rasterizePartialTile
<RT::NumEdgesT::value
, typename
RT::ValidEdgeMaskT
>(pDC
, startQuadEdgesAdj
, pRastEdges
);
786 AR_END(BERasterizePartial
, 0);
790 //////////////////////////////////////////////////////////////////////////
791 /// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
792 /// in an empty function call if SVInnerCoverage isn't requested
793 template <typename RT
, typename ValidEdgeMaskT
, typename InputCoverageT
>
794 struct UpdateEdgeMasksInnerConservative
796 INLINE
UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox
)[3], const __m256d
*,
797 const __m128i
, const __m128i
, int32_t &, int32_t &, int32_t &){};
800 //////////////////////////////////////////////////////////////////////////
801 /// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
802 /// are non-degenerate and SVInnerCoverage is requested. Offsets the edges
803 /// evaluated at raster tile corners to inner conservative position and
804 /// updates edge masks
805 template <typename RT
>
806 struct UpdateEdgeMasksInnerConservative
<RT
, AllEdgesValidT
, InnerConservativeCoverageT
>
808 INLINE
UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox
)[3], const __m256d
* vEdgeFix16
,
809 const __m128i vAi
, const __m128i vBi
, int32_t &mask0
, int32_t &mask1
, int32_t &mask2
)
811 __m256d vTempEdge
[3]{vEdgeFix16
[0], vEdgeFix16
[1], vEdgeFix16
[2]};
813 // instead of keeping 2 copies of evaluated edges around, just compensate for the outer
814 // conservative evaluated edge when adjusting the edge in for inner conservative tests
815 adjustEdgeConservative
<RT
, typename
RT::InnerConservativeEdgeOffsetT
>(vAi
, vBi
, vTempEdge
[0]);
816 adjustEdgeConservative
<RT
, typename
RT::InnerConservativeEdgeOffsetT
>(vAi
, vBi
, vTempEdge
[1]);
817 adjustEdgeConservative
<RT
, typename
RT::InnerConservativeEdgeOffsetT
>(vAi
, vBi
, vTempEdge
[2]);
819 UpdateEdgeMasks
<typename
RT::NumCoverageSamplesT
>(vEdgeTileBbox
, vTempEdge
, mask0
, mask1
, mask2
);
823 //////////////////////////////////////////////////////////////////////////
824 /// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage
825 /// is requested but at least one edge is degenerate. Since a degenerate triangle cannot
826 /// cover an entire raster tile, set mask0 to 0 to force it down the
827 /// rastierizePartialTile path
828 template <typename RT
, typename ValidEdgeMaskT
>
829 struct UpdateEdgeMasksInnerConservative
<RT
, ValidEdgeMaskT
, InnerConservativeCoverageT
>
831 INLINE
UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d
*,
832 const __m128i
, const __m128i
, int32_t &mask0
, int32_t &, int32_t &)
834 // set one mask to zero to force the triangle down the rastierizePartialTile path
839 template <typename RT
>
840 void RasterizeTriangle(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t macroTile
, void* pDesc
)
842 SWR_CONTEXT
*pContext
= pDC
->pContext
;
843 const TRIANGLE_WORK_DESC
&workDesc
= *((TRIANGLE_WORK_DESC
*)pDesc
);
844 #if KNOB_ENABLE_TOSS_POINTS
845 if (KNOB_TOSS_BIN_TRIS
)
850 AR_BEGIN(BERasterizeTriangle
, pDC
->drawId
);
851 AR_BEGIN(BETriangleSetup
, pDC
->drawId
);
853 const API_STATE
&state
= GetApiState(pDC
);
854 const SWR_RASTSTATE
&rastState
= state
.rastState
;
855 const BACKEND_FUNCS
& backendFuncs
= pDC
->pState
->backendFuncs
;
857 OSALIGNSIMD(SWR_TRIANGLE_DESC
) triDesc
;
858 triDesc
.pUserClipBuffer
= workDesc
.pUserClipBuffer
;
860 __m128 vX
, vY
, vZ
, vRecipW
;
862 // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
863 // eg: vX = [x0 x1 x2 dc]
864 vX
= _mm_load_ps(workDesc
.pTriBuffer
);
865 vY
= _mm_load_ps(workDesc
.pTriBuffer
+ 4);
866 vZ
= _mm_load_ps(workDesc
.pTriBuffer
+ 8);
867 vRecipW
= _mm_load_ps(workDesc
.pTriBuffer
+ 12);
869 // convert to fixed point
870 static_assert(std::is_same
<typename
RT::PrecisionT
, FixedPointTraits
<Fixed_16_8
>>::value
, "Rasterizer expects 16.8 fixed point precision");
871 __m128i vXi
= fpToFixedPoint(vX
);
872 __m128i vYi
= fpToFixedPoint(vY
);
874 // quantize floating point position to fixed point precision
875 // to prevent attribute creep around the triangle vertices
876 vX
= _mm_mul_ps(_mm_cvtepi32_ps(vXi
), _mm_set1_ps(1.0f
/ FIXED_POINT_SCALE
));
877 vY
= _mm_mul_ps(_mm_cvtepi32_ps(vYi
), _mm_set1_ps(1.0f
/ FIXED_POINT_SCALE
));
879 // triangle setup - A and B edge equation coefs
881 triangleSetupAB(vX
, vY
, vA
, vB
);
884 triangleSetupABInt(vXi
, vYi
, vAi
, vBi
);
887 float det
= calcDeterminantInt(vAi
, vBi
);
889 // Verts in Pixel Coordinate Space at this point
890 // Det > 0 = CW winding order
891 // Convert CW triangles to CCW
894 vA
= _mm_mul_ps(vA
, _mm_set1_ps(-1));
895 vB
= _mm_mul_ps(vB
, _mm_set1_ps(-1));
896 vAi
= _mm_mullo_epi32(vAi
, _mm_set1_epi32(-1));
897 vBi
= _mm_mullo_epi32(vBi
, _mm_set1_epi32(-1));
902 // Finish triangle setup - C edge coef
903 triangleSetupC(vX
, vY
, vA
, vB
, vC
);
905 if(RT::ValidEdgeMaskT::value
!= ALL_EDGES_VALID
)
907 // If we have degenerate edge(s) to rasterize, set I and J coefs
908 // to 0 for constant interpolation of attributes
916 // Degenerate triangles have no area
917 triDesc
.recipDet
= 0.0f
;
921 // only extract coefs for 2 of the barycentrics; the 3rd can be
922 // determined from the barycentric equation:
923 // i + j + k = 1 <=> k = 1 - j - i
924 _MM_EXTRACT_FLOAT(triDesc
.I
[0], vA
, 1);
925 _MM_EXTRACT_FLOAT(triDesc
.I
[1], vB
, 1);
926 _MM_EXTRACT_FLOAT(triDesc
.I
[2], vC
, 1);
927 _MM_EXTRACT_FLOAT(triDesc
.J
[0], vA
, 2);
928 _MM_EXTRACT_FLOAT(triDesc
.J
[1], vB
, 2);
929 _MM_EXTRACT_FLOAT(triDesc
.J
[2], vC
, 2);
931 // compute recipDet, used to calculate barycentric i and j in the backend
932 triDesc
.recipDet
= 1.0f
/det
;
935 OSALIGNSIMD(float) oneOverW
[4];
936 _mm_store_ps(oneOverW
, vRecipW
);
937 triDesc
.OneOverW
[0] = oneOverW
[0] - oneOverW
[2];
938 triDesc
.OneOverW
[1] = oneOverW
[1] - oneOverW
[2];
939 triDesc
.OneOverW
[2] = oneOverW
[2];
941 // calculate perspective correct coefs per vertex attrib
942 float* pPerspAttribs
= perspAttribsTLS
;
943 float* pAttribs
= workDesc
.pAttribs
;
944 triDesc
.pPerspAttribs
= pPerspAttribs
;
945 triDesc
.pAttribs
= pAttribs
;
946 float *pRecipW
= workDesc
.pTriBuffer
+ 12;
947 triDesc
.pRecipW
= pRecipW
;
948 __m128 vOneOverWV0
= _mm_broadcast_ss(pRecipW
);
949 __m128 vOneOverWV1
= _mm_broadcast_ss(pRecipW
+=1);
950 __m128 vOneOverWV2
= _mm_broadcast_ss(pRecipW
+=1);
951 for(uint32_t i
= 0; i
< workDesc
.numAttribs
; i
++)
953 __m128 attribA
= _mm_load_ps(pAttribs
);
954 __m128 attribB
= _mm_load_ps(pAttribs
+=4);
955 __m128 attribC
= _mm_load_ps(pAttribs
+=4);
958 attribA
= _mm_mul_ps(attribA
, vOneOverWV0
);
959 attribB
= _mm_mul_ps(attribB
, vOneOverWV1
);
960 attribC
= _mm_mul_ps(attribC
, vOneOverWV2
);
962 _mm_store_ps(pPerspAttribs
, attribA
);
963 _mm_store_ps(pPerspAttribs
+=4, attribB
);
964 _mm_store_ps(pPerspAttribs
+=4, attribC
);
969 // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
970 OSALIGNSIMD(float) a
[4];
972 triDesc
.Z
[0] = a
[0] - a
[2];
973 triDesc
.Z
[1] = a
[1] - a
[2];
977 triDesc
.Z
[2] += ComputeDepthBias(&rastState
, &triDesc
, workDesc
.pTriBuffer
+ 8);
979 // Calc bounding box of triangle
980 OSALIGNSIMD(SWR_RECT
) bbox
;
981 calcBoundingBoxInt(vXi
, vYi
, bbox
);
983 const SWR_RECT
&scissorInFixedPoint
= state
.scissorsInFixedPoint
[workDesc
.triFlags
.viewportIndex
];
985 if(RT::ValidEdgeMaskT::value
!= ALL_EDGES_VALID
)
987 // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
988 bbox
.xmin
--; bbox
.xmax
++; bbox
.ymin
--; bbox
.ymax
++;
989 SWR_ASSERT(scissorInFixedPoint
.xmin
>= 0 && scissorInFixedPoint
.ymin
>= 0,
990 "Conservative rast degenerate handling requires a valid scissor rect");
993 // Intersect with scissor/viewport
994 OSALIGNSIMD(SWR_RECT
) intersect
;
995 intersect
.xmin
= std::max(bbox
.xmin
, scissorInFixedPoint
.xmin
);
996 intersect
.xmax
= std::min(bbox
.xmax
- 1, scissorInFixedPoint
.xmax
);
997 intersect
.ymin
= std::max(bbox
.ymin
, scissorInFixedPoint
.ymin
);
998 intersect
.ymax
= std::min(bbox
.ymax
- 1, scissorInFixedPoint
.ymax
);
1000 triDesc
.triFlags
= workDesc
.triFlags
;
1002 // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
1003 uint32_t macroX
, macroY
;
1004 MacroTileMgr::getTileIndices(macroTile
, macroX
, macroY
);
1005 int32_t macroBoxLeft
= macroX
* KNOB_MACROTILE_X_DIM_FIXED
;
1006 int32_t macroBoxRight
= macroBoxLeft
+ KNOB_MACROTILE_X_DIM_FIXED
- 1;
1007 int32_t macroBoxTop
= macroY
* KNOB_MACROTILE_Y_DIM_FIXED
;
1008 int32_t macroBoxBottom
= macroBoxTop
+ KNOB_MACROTILE_Y_DIM_FIXED
- 1;
1010 intersect
.xmin
= std::max(intersect
.xmin
, macroBoxLeft
);
1011 intersect
.ymin
= std::max(intersect
.ymin
, macroBoxTop
);
1012 intersect
.xmax
= std::min(intersect
.xmax
, macroBoxRight
);
1013 intersect
.ymax
= std::min(intersect
.ymax
, macroBoxBottom
);
1015 SWR_ASSERT(intersect
.xmin
<= intersect
.xmax
&& intersect
.ymin
<= intersect
.ymax
&& intersect
.xmin
>= 0 && intersect
.xmax
>= 0 && intersect
.ymin
>= 0 && intersect
.ymax
>= 0);
1017 AR_END(BETriangleSetup
, 0);
1019 // update triangle desc
1020 uint32_t minTileX
= intersect
.xmin
>> (KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1021 uint32_t minTileY
= intersect
.ymin
>> (KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1022 uint32_t maxTileX
= intersect
.xmax
>> (KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1023 uint32_t maxTileY
= intersect
.ymax
>> (KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1024 uint32_t numTilesX
= maxTileX
- minTileX
+ 1;
1025 uint32_t numTilesY
= maxTileY
- minTileY
+ 1;
1027 if (numTilesX
== 0 || numTilesY
== 0)
1029 RDTSC_EVENT(BEEmptyTriangle
, 1, 0);
1030 AR_END(BERasterizeTriangle
, 1);
1034 AR_BEGIN(BEStepSetup
, pDC
->drawId
);
1036 // Step to pixel center of top-left pixel of the triangle bbox
1037 // Align intersect bbox (top/left) to raster tile's (top/left).
1038 int32_t x
= AlignDown(intersect
.xmin
, (FIXED_POINT_SCALE
* KNOB_TILE_X_DIM
));
1039 int32_t y
= AlignDown(intersect
.ymin
, (FIXED_POINT_SCALE
* KNOB_TILE_Y_DIM
));
1041 // convenience typedef
1042 typedef typename
RT::NumCoverageSamplesT NumCoverageSamplesT
;
1044 // single sample rasterization evaluates edges at pixel center,
1045 // multisample evaluates edges UL pixel corner and steps to each sample position
1046 if(std::is_same
<NumCoverageSamplesT
, SingleSampleT
>::value
)
1048 // Add 0.5, in fixed point, to offset to pixel center
1049 x
+= (FIXED_POINT_SCALE
/ 2);
1050 y
+= (FIXED_POINT_SCALE
/ 2);
1053 __m128i vTopLeftX
= _mm_set1_epi32(x
);
1054 __m128i vTopLeftY
= _mm_set1_epi32(y
);
1056 // evaluate edge equations at top-left pixel using 64bit math
1058 // line = Ax + By + C
1061 // we know x0 and y0 are on the line; plug them in:
1063 // plug C back into line equation:
1064 // line = Ax - By - Ax0 - By0
1065 // line = A(x - x0) + B(y - y0)
1066 // dX = (x-x0), dY = (y-y0)
1067 // so all this simplifies to
1068 // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
1070 __m128i vDeltaX
= _mm_sub_epi32(vTopLeftX
, vXi
);
1071 __m128i vDeltaY
= _mm_sub_epi32(vTopLeftY
, vYi
);
1073 // evaluate A(dx) and B(dY) for all points
1074 __m256d vAipd
= _mm256_cvtepi32_pd(vAi
);
1075 __m256d vBipd
= _mm256_cvtepi32_pd(vBi
);
1076 __m256d vDeltaXpd
= _mm256_cvtepi32_pd(vDeltaX
);
1077 __m256d vDeltaYpd
= _mm256_cvtepi32_pd(vDeltaY
);
1079 __m256d vAiDeltaXFix16
= _mm256_mul_pd(vAipd
, vDeltaXpd
);
1080 __m256d vBiDeltaYFix16
= _mm256_mul_pd(vBipd
, vDeltaYpd
);
1081 __m256d vEdge
= _mm256_add_pd(vAiDeltaXFix16
, vBiDeltaYFix16
);
1083 // apply any edge adjustments(top-left, crast, etc)
1084 adjustEdgesFix16
<RT
, typename
RT::ConservativeEdgeOffsetT
>(vAi
, vBi
, vEdge
);
1086 // broadcast respective edge results to all lanes
1087 double* pEdge
= (double*)&vEdge
;
1088 __m256d vEdgeFix16
[7];
1089 vEdgeFix16
[0] = _mm256_set1_pd(pEdge
[0]);
1090 vEdgeFix16
[1] = _mm256_set1_pd(pEdge
[1]);
1091 vEdgeFix16
[2] = _mm256_set1_pd(pEdge
[2]);
1093 OSALIGNSIMD(int32_t) aAi
[4], aBi
[4];
1094 _mm_store_si128((__m128i
*)aAi
, vAi
);
1095 _mm_store_si128((__m128i
*)aBi
, vBi
);
1096 EDGE rastEdges
[RT::NumEdgesT::value
];
1098 // Compute and store triangle edge data
1099 ComputeEdgeData(aAi
[0], aBi
[0], rastEdges
[0]);
1100 ComputeEdgeData(aAi
[1], aBi
[1], rastEdges
[1]);
1101 ComputeEdgeData(aAi
[2], aBi
[2], rastEdges
[2]);
1103 // Compute and store triangle edge data if scissor needs to rasterized
1104 ComputeScissorEdges
<typename
RT::RasterizeScissorEdgesT
, typename
RT::IsConservativeT
, RT
>
1105 (bbox
, scissorInFixedPoint
, x
, y
, rastEdges
, vEdgeFix16
);
1107 // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
1108 // used to for testing if entire raster tile is inside a triangle
1109 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1111 vEdgeFix16
[e
] = _mm256_add_pd(vEdgeFix16
[e
], rastEdges
[e
].vRasterTileOffsets
);
1114 // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
1115 // step sample positions to the raster tile bbox of multisample points
1116 // min(xSamples),min(ySamples) ------ max(xSamples),min(ySamples)
1119 // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples)
1120 __m256d vEdgeTileBbox
[3];
1121 if (NumCoverageSamplesT::value
> 1)
1123 const SWR_MULTISAMPLE_POS
&samplePos
= rastState
.samplePositions
;
1124 const __m128i vTileSampleBBoxXh
= samplePos
.TileSampleOffsetsX();
1125 const __m128i vTileSampleBBoxYh
= samplePos
.TileSampleOffsetsY();
1127 __m256d vTileSampleBBoxXFix8
= _mm256_cvtepi32_pd(vTileSampleBBoxXh
);
1128 __m256d vTileSampleBBoxYFix8
= _mm256_cvtepi32_pd(vTileSampleBBoxYh
);
1130 // step edge equation tests from Tile
1131 // used to for testing if entire raster tile is inside a triangle
1132 for (uint32_t e
= 0; e
< 3; ++e
)
1134 __m256d vResultAxFix16
= _mm256_mul_pd(_mm256_set1_pd(rastEdges
[e
].a
), vTileSampleBBoxXFix8
);
1135 __m256d vResultByFix16
= _mm256_mul_pd(_mm256_set1_pd(rastEdges
[e
].b
), vTileSampleBBoxYFix8
);
1136 vEdgeTileBbox
[e
] = _mm256_add_pd(vResultAxFix16
, vResultByFix16
);
1138 // adjust for msaa tile bbox edges outward for conservative rast, if enabled
1139 adjustEdgeConservative
<RT
, typename
RT::ConservativeEdgeOffsetT
>(vAi
, vBi
, vEdgeTileBbox
[e
]);
1143 AR_END(BEStepSetup
, 0);
1145 uint32_t tY
= minTileY
;
1146 uint32_t tX
= minTileX
;
1147 uint32_t maxY
= maxTileY
;
1148 uint32_t maxX
= maxTileX
;
1150 RenderOutputBuffers renderBuffers
, currentRenderBufferRow
;
1151 GetRenderHotTiles
<RT::MT::numSamples
>(pDC
, macroTile
, minTileX
, minTileY
, renderBuffers
, triDesc
.triFlags
.renderTargetArrayIndex
);
1152 currentRenderBufferRow
= renderBuffers
;
1154 // rasterize and generate coverage masks per sample
1155 for (uint32_t tileY
= tY
; tileY
<= maxY
; ++tileY
)
1157 __m256d vStartOfRowEdge
[RT::NumEdgesT::value
];
1158 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1160 vStartOfRowEdge
[e
] = vEdgeFix16
[e
];
1163 for (uint32_t tileX
= tX
; tileX
<= maxX
; ++tileX
)
1165 triDesc
.anyCoveredSamples
= 0;
1167 // is the corner of the edge outside of the raster tile? (vEdge < 0)
1168 int mask0
, mask1
, mask2
;
1169 UpdateEdgeMasks
<NumCoverageSamplesT
>(vEdgeTileBbox
, vEdgeFix16
, mask0
, mask1
, mask2
);
1171 for (uint32_t sampleNum
= 0; sampleNum
< NumCoverageSamplesT::value
; sampleNum
++)
1173 // trivial reject, at least one edge has all 4 corners of raster tile outside
1174 bool trivialReject
= TrivialRejectTest
<typename
RT::ValidEdgeMaskT
>(mask0
, mask1
, mask2
);
1178 // trivial accept mask
1179 triDesc
.coverageMask
[sampleNum
] = 0xffffffffffffffffULL
;
1181 // Update the raster tile edge masks based on inner conservative edge offsets, if enabled
1182 UpdateEdgeMasksInnerConservative
<RT
, typename
RT::ValidEdgeMaskT
, typename
RT::InputCoverageT
>
1183 (vEdgeTileBbox
, vEdgeFix16
, vAi
, vBi
, mask0
, mask1
, mask2
);
1185 // @todo Make this a bit smarter to allow use of trivial accept when:
1186 // 1) scissor/vp intersection rect is raster tile aligned
1187 // 2) raster tile is entirely within scissor/vp intersection rect
1188 if (TrivialAcceptTest
<typename
RT::RasterizeScissorEdgesT
>(mask0
, mask1
, mask2
))
1190 // trivial accept, all 4 corners of all 3 edges are negative
1191 // i.e. raster tile completely inside triangle
1192 triDesc
.anyCoveredSamples
= triDesc
.coverageMask
[sampleNum
];
1193 if(std::is_same
<typename
RT::InputCoverageT
, InnerConservativeCoverageT
>::value
)
1195 triDesc
.innerCoverageMask
= 0xffffffffffffffffULL
;
1197 RDTSC_EVENT(BETrivialAccept
, 1, 0);
1201 __m256d vEdgeAtSample
[RT::NumEdgesT::value
];
1202 if(std::is_same
<NumCoverageSamplesT
, SingleSampleT
>::value
)
1204 // should get optimized out for single sample case (global value numbering or copy propagation)
1205 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1207 vEdgeAtSample
[e
] = vEdgeFix16
[e
];
1212 const SWR_MULTISAMPLE_POS
&samplePos
= rastState
.samplePositions
;
1213 __m128i vSampleOffsetXh
= samplePos
.vXi(sampleNum
);
1214 __m128i vSampleOffsetYh
= samplePos
.vYi(sampleNum
);
1215 __m256d vSampleOffsetX
= _mm256_cvtepi32_pd(vSampleOffsetXh
);
1216 __m256d vSampleOffsetY
= _mm256_cvtepi32_pd(vSampleOffsetYh
);
1218 // step edge equation tests from UL tile corner to pixel sample position
1219 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1221 __m256d vResultAxFix16
= _mm256_mul_pd(_mm256_set1_pd(rastEdges
[e
].a
), vSampleOffsetX
);
1222 __m256d vResultByFix16
= _mm256_mul_pd(_mm256_set1_pd(rastEdges
[e
].b
), vSampleOffsetY
);
1223 vEdgeAtSample
[e
] = _mm256_add_pd(vResultAxFix16
, vResultByFix16
);
1224 vEdgeAtSample
[e
] = _mm256_add_pd(vEdgeFix16
[e
], vEdgeAtSample
[e
]);
1228 double startQuadEdges
[RT::NumEdgesT::value
];
1229 const __m256i vLane0Mask
= _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
1230 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1232 _mm256_maskstore_pd(&startQuadEdges
[e
], vLane0Mask
, vEdgeAtSample
[e
]);
1235 // not trivial accept or reject, must rasterize full tile
1236 AR_BEGIN(BERasterizePartial
, pDC
->drawId
);
1237 triDesc
.coverageMask
[sampleNum
] = rasterizePartialTile
<RT::NumEdgesT::value
, typename
RT::ValidEdgeMaskT
>(pDC
, startQuadEdges
, rastEdges
);
1238 AR_END(BERasterizePartial
, 0);
1240 triDesc
.anyCoveredSamples
|= triDesc
.coverageMask
[sampleNum
];
1242 // Output SV InnerCoverage, if needed
1243 GenerateSVInnerCoverage
<RT
, typename
RT::ValidEdgeMaskT
, typename
RT::InputCoverageT
>(pDC
, workerId
, rastEdges
, startQuadEdges
, triDesc
.innerCoverageMask
);
1248 // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything
1249 if(NumCoverageSamplesT::value
> 1)
1251 triDesc
.coverageMask
[sampleNum
] = 0;
1253 RDTSC_EVENT(BETrivialReject
, 1, 0);
1257 #if KNOB_ENABLE_TOSS_POINTS
1260 gToss
= triDesc
.coverageMask
[0];
1264 if(triDesc
.anyCoveredSamples
)
1266 // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
1267 // copy conservative coverage result to all samples
1268 if(RT::IsConservativeT::value
)
1270 auto copyCoverage
= [&](int sample
){triDesc
.coverageMask
[sample
] = triDesc
.coverageMask
[0]; };
1271 UnrollerL
<1, RT::MT::numSamples
, 1>::step(copyCoverage
);
1274 AR_BEGIN(BEPixelBackend
, pDC
->drawId
);
1275 backendFuncs
.pfnBackend(pDC
, workerId
, tileX
<< KNOB_TILE_X_DIM_SHIFT
, tileY
<< KNOB_TILE_Y_DIM_SHIFT
, triDesc
, renderBuffers
);
1276 AR_END(BEPixelBackend
, 0);
1279 // step to the next tile in X
1280 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1282 vEdgeFix16
[e
] = _mm256_add_pd(vEdgeFix16
[e
], _mm256_set1_pd(rastEdges
[e
].stepRasterTileX
));
1284 StepRasterTileX
<RT
>(state
.colorHottileEnable
, renderBuffers
);
1287 // step to the next tile in Y
1288 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1290 vEdgeFix16
[e
] = _mm256_add_pd(vStartOfRowEdge
[e
], _mm256_set1_pd(rastEdges
[e
].stepRasterTileY
));
1292 StepRasterTileY
<RT
>(state
.colorHottileEnable
, renderBuffers
, currentRenderBufferRow
);
1295 AR_END(BERasterizeTriangle
, 1);
1298 // Get pointers to hot tile memory for color RT, depth, stencil
1299 template <uint32_t numSamples
>
1300 void GetRenderHotTiles(DRAW_CONTEXT
*pDC
, uint32_t macroID
, uint32_t tileX
, uint32_t tileY
, RenderOutputBuffers
&renderBuffers
, uint32_t renderTargetArrayIndex
)
1302 const API_STATE
& state
= GetApiState(pDC
);
1303 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1306 MacroTileMgr::getTileIndices(macroID
, mx
, my
);
1307 tileX
-= KNOB_MACROTILE_X_DIM_IN_TILES
* mx
;
1308 tileY
-= KNOB_MACROTILE_Y_DIM_IN_TILES
* my
;
1310 // compute tile offset for active hottile buffers
1311 const uint32_t pitch
= KNOB_MACROTILE_X_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8;
1312 uint32_t offset
= ComputeTileOffset2D
<TilingTraits
<SWR_TILE_SWRZ
, FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
> >(pitch
, tileX
, tileY
);
1315 unsigned long rtSlot
= 0;
1316 uint32_t colorHottileEnableMask
= state
.colorHottileEnable
;
1317 while(_BitScanForward(&rtSlot
, colorHottileEnableMask
))
1319 HOTTILE
*pColor
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroID
, (SWR_RENDERTARGET_ATTACHMENT
)(SWR_ATTACHMENT_COLOR0
+ rtSlot
), true,
1320 numSamples
, renderTargetArrayIndex
);
1321 pColor
->state
= HOTTILE_DIRTY
;
1322 renderBuffers
.pColor
[rtSlot
] = pColor
->pBuffer
+ offset
;
1324 colorHottileEnableMask
&= ~(1 << rtSlot
);
1326 if(state
.depthHottileEnable
)
1328 const uint32_t pitch
= KNOB_MACROTILE_X_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8;
1329 uint32_t offset
= ComputeTileOffset2D
<TilingTraits
<SWR_TILE_SWRZ
, FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
> >(pitch
, tileX
, tileY
);
1331 HOTTILE
*pDepth
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroID
, SWR_ATTACHMENT_DEPTH
, true,
1332 numSamples
, renderTargetArrayIndex
);
1333 pDepth
->state
= HOTTILE_DIRTY
;
1334 SWR_ASSERT(pDepth
->pBuffer
!= nullptr);
1335 renderBuffers
.pDepth
= pDepth
->pBuffer
+ offset
;
1337 if(state
.stencilHottileEnable
)
1339 const uint32_t pitch
= KNOB_MACROTILE_X_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8;
1340 uint32_t offset
= ComputeTileOffset2D
<TilingTraits
<SWR_TILE_SWRZ
, FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
> >(pitch
, tileX
, tileY
);
1342 HOTTILE
* pStencil
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroID
, SWR_ATTACHMENT_STENCIL
, true,
1343 numSamples
, renderTargetArrayIndex
);
1344 pStencil
->state
= HOTTILE_DIRTY
;
1345 SWR_ASSERT(pStencil
->pBuffer
!= nullptr);
1346 renderBuffers
.pStencil
= pStencil
->pBuffer
+ offset
;
1350 template <typename RT
>
1351 INLINE
void StepRasterTileX(uint32_t colorHotTileMask
, RenderOutputBuffers
&buffers
)
1354 while (_BitScanForward(&rt
, colorHotTileMask
))
1356 colorHotTileMask
&= ~(1 << rt
);
1357 buffers
.pColor
[rt
] += RT::colorRasterTileStep
;
1360 buffers
.pDepth
+= RT::depthRasterTileStep
;
1361 buffers
.pStencil
+= RT::stencilRasterTileStep
;
1364 template <typename RT
>
1365 INLINE
void StepRasterTileY(uint32_t colorHotTileMask
, RenderOutputBuffers
&buffers
, RenderOutputBuffers
&startBufferRow
)
1368 while (_BitScanForward(&rt
, colorHotTileMask
))
1370 colorHotTileMask
&= ~(1 << rt
);
1371 startBufferRow
.pColor
[rt
] += RT::colorRasterTileRowStep
;
1372 buffers
.pColor
[rt
] = startBufferRow
.pColor
[rt
];
1374 startBufferRow
.pDepth
+= RT::depthRasterTileRowStep
;
1375 buffers
.pDepth
= startBufferRow
.pDepth
;
1377 startBufferRow
.pStencil
+= RT::stencilRasterTileRowStep
;
1378 buffers
.pStencil
= startBufferRow
.pStencil
;