1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * @file rasterizer.cpp
25 * @brief Implementation for the rasterizer.
27 ******************************************************************************/
#include <cstring>

#include "rasterizer.h"
#include "rdtsc_core.h"
#include "memory/tilingtraits.h"
40 template <uint32_t numSamples
= 1>
41 void GetRenderHotTiles(DRAW_CONTEXT
*pDC
, uint32_t macroID
, uint32_t x
, uint32_t y
, RenderOutputBuffers
&renderBuffers
, uint32_t renderTargetArrayIndex
);
42 template <typename RT
>
43 void StepRasterTileX(uint32_t MaxRT
, RenderOutputBuffers
&buffers
);
44 template <typename RT
>
45 void StepRasterTileY(uint32_t MaxRT
, RenderOutputBuffers
&buffers
, RenderOutputBuffers
&startBufferRow
);
47 #define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
48 const __m256d gMaskToVecpd
[] =
50 MASKTOVEC(0, 0, 0, 0),
51 MASKTOVEC(0, 0, 0, 1),
52 MASKTOVEC(0, 0, 1, 0),
53 MASKTOVEC(0, 0, 1, 1),
54 MASKTOVEC(0, 1, 0, 0),
55 MASKTOVEC(0, 1, 0, 1),
56 MASKTOVEC(0, 1, 1, 0),
57 MASKTOVEC(0, 1, 1, 1),
58 MASKTOVEC(1, 0, 0, 0),
59 MASKTOVEC(1, 0, 0, 1),
60 MASKTOVEC(1, 0, 1, 0),
61 MASKTOVEC(1, 0, 1, 1),
62 MASKTOVEC(1, 1, 0, 0),
63 MASKTOVEC(1, 1, 0, 1),
64 MASKTOVEC(1, 1, 1, 0),
65 MASKTOVEC(1, 1, 1, 1),
75 double a
, b
; // a, b edge coefficients in fix8
76 double stepQuadX
; // step to adjacent horizontal quad in fix16
77 double stepQuadY
; // step to adjacent vertical quad in fix16
78 double stepRasterTileX
; // step to adjacent horizontal raster tile in fix16
79 double stepRasterTileY
; // step to adjacent vertical raster tile in fix16
81 __m256d vQuadOffsets
; // offsets for 4 samples of a quad
82 __m256d vRasterTileOffsets
; // offsets for the 4 corners of a raster tile
85 //////////////////////////////////////////////////////////////////////////
86 /// @brief rasterize a raster tile partially covered by the triangle
87 /// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster tile
/// @param vA, vB - A & B coefs for each edge of the triangle (Ax + By + C)
89 /// @param vStepQuad0-2 - edge equations evaluated at the UL corners of the 2x2 pixel quad.
90 /// Used to step between quads when sweeping over the raster tile.
91 template<uint32_t NumEdges
, typename EdgeMaskT
>
92 INLINE
uint64_t rasterizePartialTile(DRAW_CONTEXT
*pDC
, double startEdges
[NumEdges
], EDGE
*pRastEdges
)
94 uint64_t coverageMask
= 0;
96 __m256d vEdges
[NumEdges
];
97 __m256d vStepX
[NumEdges
];
98 __m256d vStepY
[NumEdges
];
100 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
102 // Step to the pixel sample locations of the 1st quad
103 vEdges
[e
] = _mm256_add_pd(_mm256_set1_pd(startEdges
[e
]), pRastEdges
[e
].vQuadOffsets
);
105 // compute step to next quad (mul by 2 in x and y direction)
106 vStepX
[e
] = _mm256_set1_pd(pRastEdges
[e
].stepQuadX
);
107 vStepY
[e
] = _mm256_set1_pd(pRastEdges
[e
].stepQuadY
);
110 // fast unrolled version for 8x8 tile
111 #if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
112 int edgeMask
[NumEdges
];
115 auto eval_lambda
= [&](int e
){edgeMask
[e
] = _mm256_movemask_pd(vEdges
[e
]);};
116 auto update_lambda
= [&](int e
){mask
&= edgeMask
[e
];};
117 auto incx_lambda
= [&](int e
){vEdges
[e
] = _mm256_add_pd(vEdges
[e
], vStepX
[e
]);};
118 auto incy_lambda
= [&](int e
){vEdges
[e
] = _mm256_add_pd(vEdges
[e
], vStepY
[e
]);};
119 auto decx_lambda
= [&](int e
){vEdges
[e
] = _mm256_sub_pd(vEdges
[e
], vStepX
[e
]);};
121 // evaluate which pixels in the quad are covered
123 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
125 // update coverage mask
126 #define UPDATE_MASK(bit) \
127 mask = edgeMask[0]; \
128 UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
129 coverageMask |= (mask << bit);
131 // step in the +x direction to the next quad
133 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
135 // step in the +y direction to the next quad
137 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
139 // step in the -x direction to the next quad
141 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
143 // sweep 2x2 quad back and forth through the raster tile,
144 // computing coverage masks for the entire tile
149 // x x ------------------>
151 // <-----------------x x V
210 for (uint32_t y
= 0; y
< KNOB_TILE_Y_DIM
/2; ++y
)
212 __m256d vStartOfRowEdge
[NumEdges
];
213 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
215 vStartOfRowEdge
[e
] = vEdges
[e
];
218 for (uint32_t x
= 0; x
< KNOB_TILE_X_DIM
/2; ++x
)
220 int edgeMask
[NumEdges
];
221 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
223 edgeMask
[e
] = _mm256_movemask_pd(vEdges
[e
]);
226 uint64_t mask
= edgeMask
[0];
227 for (uint32_t e
= 1; e
< NumEdges
; ++e
)
231 coverageMask
|= (mask
<< bit
);
233 // step to the next pixel in the x
234 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
236 vEdges
[e
] = _mm256_add_pd(vEdges
[e
], vStepX
[e
]);
241 // step to the next row
242 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
244 vEdges
[e
] = _mm256_add_pd(vStartOfRowEdge
[e
], vStepY
[e
]);
252 // Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
253 // Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge
254 // Top left: a sample is in if it is a top or left edge.
255 // Out: !(horizontal && above) = !horizontal && below
256 // Out: !horizontal && left = !(!horizontal && left) = horizontal and right
257 INLINE
void adjustTopLeftRuleIntFix16(const __m128i vA
, const __m128i vB
, __m256d
&vEdge
)
260 // if vA == 0 && vB < 0, vC--
262 __m256d vEdgeOut
= vEdge
;
263 __m256d vEdgeAdjust
= _mm256_sub_pd(vEdge
, _mm256_set1_pd(1.0));
265 // if vA < 0 (line is not horizontal and below)
266 int msk
= _mm_movemask_ps(_mm_castsi128_ps(vA
));
268 // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
269 __m128i vCmp
= _mm_cmpeq_epi32(vA
, _mm_setzero_si128());
270 int msk2
= _mm_movemask_ps(_mm_castsi128_ps(vCmp
));
271 msk2
&= _mm_movemask_ps(_mm_castsi128_ps(vB
));
273 // if either of these are true and we're on the line (edge == 0), bump it outside the line
274 vEdge
= _mm256_blendv_pd(vEdgeOut
, vEdgeAdjust
, gMaskToVecpd
[msk
| msk2
]);
//////////////////////////////////////////////////////////////////////////
/// @brief Number of fractional bits by which the result of the manh
///        calculation exceeds the edge precision, computed from compile
///        time trait values.
/// @tparam RT: rasterizer traits; must provide PrecisionT, ConservativePrecisionT
///         and EdgePrecisionT, each exposing BitsT::value
/// @return (PrecisionT bits + ConservativePrecisionT bits) - EdgePrecisionT bits
template <typename RT>
constexpr int64_t ManhToEdgePrecisionAdjust()
{
    // the product of the two fixed point values must carry at least edge precision
    static_assert(
        RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
        "Inadequate precision of result of manh calculation ");

    return (RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value)
           - RT::EdgePrecisionT::BitsT::value;
}
288 //////////////////////////////////////////////////////////////////////////
289 /// @struct adjustEdgeConservative
290 /// @brief Primary template definition used for partially specializing
291 /// the adjustEdgeConservative function. This struct should never
293 /// @tparam RT: rasterizer traits
294 /// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
295 template <typename RT
, typename ConservativeEdgeOffsetT
>
296 struct adjustEdgeConservative
298 //////////////////////////////////////////////////////////////////////////
299 /// @brief Performs calculations to adjust each edge of a triangle away
300 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
303 /// Uncertainty regions arise from fixed point rounding, which
304 /// can snap a vertex +/- by min fixed point value.
305 /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
306 /// This allows the rasterizer to test for coverage only at the pixel center,
307 /// instead of having to test individual pixel corners for conservative coverage
308 INLINE
adjustEdgeConservative(const __m128i
&vAi
, const __m128i
&vBi
, __m256d
&vEdge
)
310 // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away
311 // from the pixel center (in the direction of the edge normal A/B)
313 // edge = Ax + Bx + C - (manh/e)
314 // manh = manhattan distance = abs(A) + abs(B)
315 // e = absolute rounding error from snapping from float to fixed point precision
317 // 'fixed point' multiply (in double to be avx1 friendly)
318 // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
319 __m256d vAai
= _mm256_cvtepi32_pd(_mm_abs_epi32(vAi
)), vBai
= _mm256_cvtepi32_pd(_mm_abs_epi32(vBi
));
320 __m256d manh
= _mm256_add_pd(_mm256_mul_pd(vAai
, _mm256_set1_pd(ConservativeEdgeOffsetT::value
)),
321 _mm256_mul_pd(vBai
, _mm256_set1_pd(ConservativeEdgeOffsetT::value
)));
323 static_assert(RT::PrecisionT::BitsT::value
+ RT::ConservativePrecisionT::BitsT::value
>= RT::EdgePrecisionT::BitsT::value
,
324 "Inadequate precision of result of manh calculation ");
326 // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision
327 // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right
328 manh
= _mm256_mul_pd(manh
, _mm256_set1_pd(ManhToEdgePrecisionAdjust
<RT
>() * 0.5));
330 // move the edge away from the pixel center by the required conservative precision + 1/2 pixel
331 // this allows the rasterizer to do a single conservative coverage test to see if the primitive
332 // intersects the pixel at all
333 vEdge
= _mm256_sub_pd(vEdge
, manh
);
337 //////////////////////////////////////////////////////////////////////////
338 /// @brief adjustEdgeConservative specialization where no edge offset is needed
339 template <typename RT
>
340 struct adjustEdgeConservative
<RT
, std::integral_constant
<int32_t, 0>>
342 INLINE
adjustEdgeConservative(const __m128i
&vAi
, const __m128i
&vBi
, __m256d
&vEdge
) {};
345 //////////////////////////////////////////////////////////////////////////
346 /// @brief calculates the distance a degenerate BBox needs to be adjusted
347 /// for conservative rast based on compile time trait values
348 template<typename RT
>
349 constexpr int64_t ConservativeScissorOffset()
351 static_assert(RT::ConservativePrecisionT::BitsT::value
- RT::PrecisionT::BitsT::value
>= 0, "Rasterizer precision > conservative precision");
352 // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges
353 typedef std::integral_constant
<int32_t, (RT::ValidEdgeMaskT::value
== ALL_EDGES_VALID
) ? 0 : 1> DegenerateEdgeOffsetT
;
354 // 1/2 pixel edge offset + conservative offset - degenerateTriangle
355 return RT::ConservativeEdgeOffsetT::value
- (DegenerateEdgeOffsetT::value
<< (RT::ConservativePrecisionT::BitsT::value
- RT::PrecisionT::BitsT::value
));
358 //////////////////////////////////////////////////////////////////////////
359 /// @brief Performs calculations to adjust each a vector of evaluated edges out
360 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
362 template <typename RT
>
363 INLINE
void adjustScissorEdge(const double a
, const double b
, __m256d
&vEdge
)
365 int64_t aabs
= std::abs(static_cast<int64_t>(a
)), babs
= std::abs(static_cast<int64_t>(b
));
366 int64_t manh
= ((aabs
* ConservativeScissorOffset
<RT
>()) + (babs
* ConservativeScissorOffset
<RT
>())) >> ManhToEdgePrecisionAdjust
<RT
>();
367 vEdge
= _mm256_sub_pd(vEdge
, _mm256_set1_pd(manh
));
370 //////////////////////////////////////////////////////////////////////////
371 /// @brief Performs calculations to adjust each a scalar evaluated edge out
372 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
374 template <typename RT
, typename OffsetT
>
375 INLINE
double adjustScalarEdge(const double a
, const double b
, const double Edge
)
377 int64_t aabs
= std::abs(static_cast<int64_t>(a
)), babs
= std::abs(static_cast<int64_t>(b
));
378 int64_t manh
= ((aabs
* OffsetT::value
) + (babs
* OffsetT::value
)) >> ManhToEdgePrecisionAdjust
<RT
>();
379 return (Edge
- manh
);
382 //////////////////////////////////////////////////////////////////////////
383 /// @brief Perform any needed adjustments to evaluated triangle edges
384 template <typename RT
, typename EdgeOffsetT
>
385 struct adjustEdgesFix16
387 INLINE
adjustEdgesFix16(const __m128i
&vAi
, const __m128i
&vBi
, __m256d
&vEdge
)
389 static_assert(std::is_same
<typename
RT::EdgePrecisionT
, FixedPointTraits
<Fixed_X_16
>>::value
,
390 "Edge equation expected to be in x.16 fixed point");
392 static_assert(RT::IsConservativeT::value
, "Edge offset assumes conservative rasterization is enabled");
394 // need to apply any edge offsets before applying the top-left rule
395 adjustEdgeConservative
<RT
, EdgeOffsetT
>(vAi
, vBi
, vEdge
);
397 adjustTopLeftRuleIntFix16(vAi
, vBi
, vEdge
);
401 //////////////////////////////////////////////////////////////////////////
402 /// @brief Perform top left adjustments to evaluated triangle edges
403 template <typename RT
>
404 struct adjustEdgesFix16
<RT
, std::integral_constant
<int32_t, 0>>
406 INLINE
adjustEdgesFix16(const __m128i
&vAi
, const __m128i
&vBi
, __m256d
&vEdge
)
408 adjustTopLeftRuleIntFix16(vAi
, vBi
, vEdge
);
412 // max(abs(dz/dx), abs(dz,dy)
413 INLINE
float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC
* pDesc
)
416 // evaluate i,j at (0,0)
417 float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
418 float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
420 // evaluate i,j at (1,0)
421 float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
422 float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
425 float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
426 float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
427 float dzdx = abs(d10 - d00);
429 // evaluate i,j at (0,1)
430 float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
431 float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
433 float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
434 float dzdy = abs(d01 - d00);
437 // optimized version of above
438 float dzdx
= fabsf(pDesc
->recipDet
* (pDesc
->Z
[0] * pDesc
->I
[0] + pDesc
->Z
[1] * pDesc
->J
[0]));
439 float dzdy
= fabsf(pDesc
->recipDet
* (pDesc
->Z
[0] * pDesc
->I
[1] + pDesc
->Z
[1] * pDesc
->J
[1]));
441 return std::max(dzdx
, dzdy
);
444 INLINE
float ComputeBiasFactor(const SWR_RASTSTATE
* pState
, const SWR_TRIANGLE_DESC
* pDesc
, const float* z
)
446 if (pState
->depthFormat
== R24_UNORM_X8_TYPELESS
)
448 return (1.0f
/ (1 << 24));
450 else if (pState
->depthFormat
== R16_UNORM
)
452 return (1.0f
/ (1 << 16));
456 SWR_ASSERT(pState
->depthFormat
== R32_FLOAT
);
458 // for f32 depth, factor = 2^(exponent(max(abs(z) - 23)
459 float zMax
= std::max(fabsf(z
[0]), std::max(fabsf(z
[1]), fabsf(z
[2])));
460 uint32_t zMaxInt
= *(uint32_t*)&zMax
;
461 zMaxInt
&= 0x7f800000;
462 zMax
= *(float*)&zMaxInt
;
464 return zMax
* (1.0f
/ (1 << 23));
468 INLINE
float ComputeDepthBias(const SWR_RASTSTATE
* pState
, const SWR_TRIANGLE_DESC
* pTri
, const float* z
)
470 if (pState
->depthBias
== 0 && pState
->slopeScaledDepthBias
== 0)
475 float scale
= pState
->slopeScaledDepthBias
;
478 scale
*= ComputeMaxDepthSlope(pTri
);
481 float bias
= pState
->depthBias
;
482 if (!pState
->depthBiasPreAdjusted
)
484 bias
*= ComputeBiasFactor(pState
, pTri
, z
);
488 if (pState
->depthBiasClamp
> 0.0f
)
490 bias
= std::min(bias
, pState
->depthBiasClamp
);
492 else if (pState
->depthBiasClamp
< 0.0f
)
494 bias
= std::max(bias
, pState
->depthBiasClamp
);
500 // Prevent DCE by writing coverage mask from rasterizer to volatile
501 #if KNOB_ENABLE_TOSS_POINTS
502 __declspec(thread
) volatile uint64_t gToss
;
505 static const uint32_t vertsPerTri
= 3, componentsPerAttrib
= 4;
506 // try to avoid _chkstk insertions; make this thread local
507 static THREAD
OSALIGNLINE(float) perspAttribsTLS
[vertsPerTri
* KNOB_NUM_ATTRIBUTES
* componentsPerAttrib
];
510 void ComputeEdgeData(int32_t a
, int32_t b
, EDGE
& edge
)
515 // compute constant steps to adjacent quads
516 edge
.stepQuadX
= (double)((int64_t)a
* (int64_t)(2 * FIXED_POINT_SCALE
));
517 edge
.stepQuadY
= (double)((int64_t)b
* (int64_t)(2 * FIXED_POINT_SCALE
));
519 // compute constant steps to adjacent raster tiles
520 edge
.stepRasterTileX
= (double)((int64_t)a
* (int64_t)(KNOB_TILE_X_DIM
* FIXED_POINT_SCALE
));
521 edge
.stepRasterTileY
= (double)((int64_t)b
* (int64_t)(KNOB_TILE_Y_DIM
* FIXED_POINT_SCALE
));
523 // compute quad offsets
524 const __m256d vQuadOffsetsXIntFix8
= _mm256_set_pd(FIXED_POINT_SCALE
, 0, FIXED_POINT_SCALE
, 0);
525 const __m256d vQuadOffsetsYIntFix8
= _mm256_set_pd(FIXED_POINT_SCALE
, FIXED_POINT_SCALE
, 0, 0);
527 __m256d vQuadStepXFix16
= _mm256_mul_pd(_mm256_set1_pd(edge
.a
), vQuadOffsetsXIntFix8
);
528 __m256d vQuadStepYFix16
= _mm256_mul_pd(_mm256_set1_pd(edge
.b
), vQuadOffsetsYIntFix8
);
529 edge
.vQuadOffsets
= _mm256_add_pd(vQuadStepXFix16
, vQuadStepYFix16
);
531 // compute raster tile offsets
532 const __m256d vTileOffsetsXIntFix8
= _mm256_set_pd((KNOB_TILE_X_DIM
- 1)*FIXED_POINT_SCALE
, 0, (KNOB_TILE_X_DIM
- 1)*FIXED_POINT_SCALE
, 0);
533 const __m256d vTileOffsetsYIntFix8
= _mm256_set_pd((KNOB_TILE_Y_DIM
- 1)*FIXED_POINT_SCALE
, (KNOB_TILE_Y_DIM
- 1)*FIXED_POINT_SCALE
, 0, 0);
535 __m256d vTileStepXFix16
= _mm256_mul_pd(_mm256_set1_pd(edge
.a
), vTileOffsetsXIntFix8
);
536 __m256d vTileStepYFix16
= _mm256_mul_pd(_mm256_set1_pd(edge
.b
), vTileOffsetsYIntFix8
);
537 edge
.vRasterTileOffsets
= _mm256_add_pd(vTileStepXFix16
, vTileStepYFix16
);
541 void ComputeEdgeData(const POS
& p0
, const POS
& p1
, EDGE
& edge
)
543 ComputeEdgeData(p0
.y
- p1
.y
, p1
.x
- p0
.x
, edge
);
546 //////////////////////////////////////////////////////////////////////////
547 /// @brief Primary template definition used for partially specializing
548 /// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel
549 /// corner to sample position, and test for coverage
550 /// @tparam sampleCount: multisample count
551 template <typename NumSamplesT
>
552 INLINE
void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox
)[3], const __m256d
* vEdgeFix16
,
553 int32_t &mask0
, int32_t &mask1
, int32_t &mask2
)
555 __m256d vSampleBboxTest0
, vSampleBboxTest1
, vSampleBboxTest2
;
556 // evaluate edge equations at the tile multisample bounding box
557 vSampleBboxTest0
= _mm256_add_pd(vEdgeTileBbox
[0], vEdgeFix16
[0]);
558 vSampleBboxTest1
= _mm256_add_pd(vEdgeTileBbox
[1], vEdgeFix16
[1]);
559 vSampleBboxTest2
= _mm256_add_pd(vEdgeTileBbox
[2], vEdgeFix16
[2]);
560 mask0
= _mm256_movemask_pd(vSampleBboxTest0
);
561 mask1
= _mm256_movemask_pd(vSampleBboxTest1
);
562 mask2
= _mm256_movemask_pd(vSampleBboxTest2
);
565 //////////////////////////////////////////////////////////////////////////
566 /// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
567 /// when only rasterizing a single coverage test point
569 INLINE
void UpdateEdgeMasks
<SingleSampleT
>(const __m256d(&)[3], const __m256d
* vEdgeFix16
,
570 int32_t &mask0
, int32_t &mask1
, int32_t &mask2
)
572 mask0
= _mm256_movemask_pd(vEdgeFix16
[0]);
573 mask1
= _mm256_movemask_pd(vEdgeFix16
[1]);
574 mask2
= _mm256_movemask_pd(vEdgeFix16
[2]);
577 //////////////////////////////////////////////////////////////////////////
578 /// @struct ComputeScissorEdges
579 /// @brief Primary template definition. Allows the function to be generically
580 /// called. When paired with below specializations, will result in an empty
581 /// inlined function if scissor is not enabled
582 /// @tparam RasterScissorEdgesT: is scissor enabled?
583 /// @tparam IsConservativeT: is conservative rast enabled?
584 /// @tparam RT: rasterizer traits
585 template <typename RasterScissorEdgesT
, typename IsConservativeT
, typename RT
>
586 struct ComputeScissorEdges
588 INLINE
ComputeScissorEdges(const BBOX
&triBBox
, const BBOX
&scissorBBox
, const int32_t x
, const int32_t y
,
589 EDGE (&rastEdges
)[RT::NumEdgesT::value
], __m256d (&vEdgeFix16
)[7]){};
592 //////////////////////////////////////////////////////////////////////////
593 /// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
594 /// specialization. Instantiated when conservative rast and scissor are enabled
595 template <typename RT
>
596 struct ComputeScissorEdges
<std::true_type
, std::true_type
, RT
>
598 //////////////////////////////////////////////////////////////////////////
599 /// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
600 /// evaluate edge equations and offset them away from pixel center.
601 INLINE
ComputeScissorEdges(const BBOX
&triBBox
, const BBOX
&scissorBBox
, const int32_t x
, const int32_t y
,
602 EDGE (&rastEdges
)[RT::NumEdgesT::value
], __m256d (&vEdgeFix16
)[7])
604 // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
606 scissor
.left
= std::max(triBBox
.left
, scissorBBox
.left
);
607 scissor
.right
= std::min(triBBox
.right
, scissorBBox
.right
);
608 scissor
.top
= std::max(triBBox
.top
, scissorBBox
.top
);
609 scissor
.bottom
= std::min(triBBox
.bottom
, scissorBBox
.bottom
);
611 POS topLeft
{scissor
.left
, scissor
.top
};
612 POS bottomLeft
{scissor
.left
, scissor
.bottom
};
613 POS topRight
{scissor
.right
, scissor
.top
};
614 POS bottomRight
{scissor
.right
, scissor
.bottom
};
616 // construct 4 scissor edges in ccw direction
617 ComputeEdgeData(topLeft
, bottomLeft
, rastEdges
[3]);
618 ComputeEdgeData(bottomLeft
, bottomRight
, rastEdges
[4]);
619 ComputeEdgeData(bottomRight
, topRight
, rastEdges
[5]);
620 ComputeEdgeData(topRight
, topLeft
, rastEdges
[6]);
622 vEdgeFix16
[3] = _mm256_set1_pd((rastEdges
[3].a
* (x
- scissor
.left
)) + (rastEdges
[3].b
* (y
- scissor
.top
)));
623 vEdgeFix16
[4] = _mm256_set1_pd((rastEdges
[4].a
* (x
- scissor
.left
)) + (rastEdges
[4].b
* (y
- scissor
.bottom
)));
624 vEdgeFix16
[5] = _mm256_set1_pd((rastEdges
[5].a
* (x
- scissor
.right
)) + (rastEdges
[5].b
* (y
- scissor
.bottom
)));
625 vEdgeFix16
[6] = _mm256_set1_pd((rastEdges
[6].a
* (x
- scissor
.right
)) + (rastEdges
[6].b
* (y
- scissor
.top
)));
627 // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing
628 adjustScissorEdge
<RT
>(rastEdges
[3].a
, rastEdges
[3].b
, vEdgeFix16
[3]);
629 adjustScissorEdge
<RT
>(rastEdges
[4].a
, rastEdges
[4].b
, vEdgeFix16
[4]);
630 adjustScissorEdge
<RT
>(rastEdges
[5].a
, rastEdges
[5].b
, vEdgeFix16
[5]);
631 adjustScissorEdge
<RT
>(rastEdges
[6].a
, rastEdges
[6].b
, vEdgeFix16
[6]);
635 //////////////////////////////////////////////////////////////////////////
636 /// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
637 /// specialization. Instantiated when scissor is enabled and conservative rast
639 template <typename RT
>
640 struct ComputeScissorEdges
<std::true_type
, std::false_type
, RT
>
642 //////////////////////////////////////////////////////////////////////////
643 /// @brief Compute scissor edge vectors and evaluate edge equations
644 INLINE
ComputeScissorEdges(const BBOX
&, const BBOX
&scissorBBox
, const int32_t x
, const int32_t y
,
645 EDGE (&rastEdges
)[RT::NumEdgesT::value
], __m256d (&vEdgeFix16
)[7])
647 const BBOX
&scissor
= scissorBBox
;
648 POS topLeft
{scissor
.left
, scissor
.top
};
649 POS bottomLeft
{scissor
.left
, scissor
.bottom
};
650 POS topRight
{scissor
.right
, scissor
.top
};
651 POS bottomRight
{scissor
.right
, scissor
.bottom
};
653 // construct 4 scissor edges in ccw direction
654 ComputeEdgeData(topLeft
, bottomLeft
, rastEdges
[3]);
655 ComputeEdgeData(bottomLeft
, bottomRight
, rastEdges
[4]);
656 ComputeEdgeData(bottomRight
, topRight
, rastEdges
[5]);
657 ComputeEdgeData(topRight
, topLeft
, rastEdges
[6]);
659 vEdgeFix16
[3] = _mm256_set1_pd((rastEdges
[3].a
* (x
- scissor
.left
)) + (rastEdges
[3].b
* (y
- scissor
.top
)));
660 vEdgeFix16
[4] = _mm256_set1_pd((rastEdges
[4].a
* (x
- scissor
.left
)) + (rastEdges
[4].b
* (y
- scissor
.bottom
)));
661 vEdgeFix16
[5] = _mm256_set1_pd((rastEdges
[5].a
* (x
- scissor
.right
)) + (rastEdges
[5].b
* (y
- scissor
.bottom
)));
662 vEdgeFix16
[6] = _mm256_set1_pd((rastEdges
[6].a
* (x
- scissor
.right
)) + (rastEdges
[6].b
* (y
- scissor
.top
)));
666 //////////////////////////////////////////////////////////////////////////
667 /// @brief Primary function template for TrivialRejectTest. Should
668 /// never be called, but TemplateUnroller instantiates a few unused values,
669 /// so it calls a runtime assert instead of a static_assert.
670 template <typename ValidEdgeMaskT
>
671 INLINE
bool TrivialRejectTest(const int, const int, const int)
673 SWR_ASSERT(0, "Primary templated function should never be called");
677 //////////////////////////////////////////////////////////////////////////
678 /// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
679 /// and edge 1 for trivial coverage reject
681 INLINE
bool TrivialRejectTest
<E0E1ValidT
>(const int mask0
, const int mask1
, const int)
683 return (!(mask0
&& mask1
)) ? true : false;
686 //////////////////////////////////////////////////////////////////////////
687 /// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
688 /// and edge 2 for trivial coverage reject
690 INLINE
bool TrivialRejectTest
<E0E2ValidT
>(const int mask0
, const int, const int mask2
)
692 return (!(mask0
&& mask2
)) ? true : false;
695 //////////////////////////////////////////////////////////////////////////
696 /// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
697 /// and edge 2 for trivial coverage reject
699 INLINE
bool TrivialRejectTest
<E1E2ValidT
>(const int, const int mask1
, const int mask2
)
701 return (!(mask1
&& mask2
)) ? true : false;
704 //////////////////////////////////////////////////////////////////////////
705 /// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
706 /// primitive edges for trivial coverage reject
708 INLINE
bool TrivialRejectTest
<AllEdgesValidT
>(const int mask0
, const int mask1
, const int mask2
)
710 return (!(mask0
&& mask1
&& mask2
)) ? true : false;;
713 //////////////////////////////////////////////////////////////////////////
714 /// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
715 /// point, so return false and rasterize against conservative BBox
717 INLINE
bool TrivialRejectTest
<NoEdgesValidT
>(const int, const int, const int)
722 //////////////////////////////////////////////////////////////////////////
723 /// @brief Primary function template for TrivialAcceptTest. Always returns
724 /// false, since it will only be called for degenerate tris, and as such
725 /// will never cover the entire raster tile
726 template <typename ValidEdgeMaskT
>
727 INLINE
bool TrivialAcceptTest(const int, const int, const int)
732 //////////////////////////////////////////////////////////////////////////
733 /// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all
734 /// edge masks for a fully covered raster tile
736 INLINE
bool TrivialAcceptTest
<AllEdgesValidT
>(const int mask0
, const int mask1
, const int mask2
)
738 return ((mask0
& mask1
& mask2
) == 0xf);
741 //////////////////////////////////////////////////////////////////////////
742 /// @brief Primary function template for GenerateSVInnerCoverage. Results
743 /// in an empty function call if SVInnerCoverage isn't requested
744 template <typename RT
, typename ValidEdgeMaskT
, typename InputCoverageT
>
745 struct GenerateSVInnerCoverage
747 INLINE
GenerateSVInnerCoverage(DRAW_CONTEXT
*, EDGE
*, double*, uint64_t &){};
750 //////////////////////////////////////////////////////////////////////////
751 /// @brief Specialization of GenerateSVInnerCoverage where all edges
752 /// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated
753 /// edge values from OuterConservative to InnerConservative and rasterizes.
754 template <typename RT
>
755 struct GenerateSVInnerCoverage
<RT
, AllEdgesValidT
, InnerConservativeCoverageT
>
757 INLINE
GenerateSVInnerCoverage(DRAW_CONTEXT
* pDC
, EDGE
* pRastEdges
, double* pStartQuadEdges
, uint64_t &innerCoverageMask
)
759 double startQuadEdgesAdj
[RT::NumEdgesT::value
];
760 for(uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
762 startQuadEdgesAdj
[e
] = adjustScalarEdge
<RT
, typename
RT::InnerConservativeEdgeOffsetT
>(pRastEdges
[e
].a
, pRastEdges
[e
].b
, pStartQuadEdges
[e
]);
765 // not trivial accept or reject, must rasterize full tile
766 RDTSC_START(BERasterizePartial
);
767 innerCoverageMask
= rasterizePartialTile
<RT::NumEdgesT::value
, typename
RT::ValidEdgeMaskT
>(pDC
, startQuadEdgesAdj
, pRastEdges
);
768 RDTSC_STOP(BERasterizePartial
, 0, 0);
772 //////////////////////////////////////////////////////////////////////////
773 /// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
774 /// in an empty function call if SVInnerCoverage isn't requested
775 template <typename RT
, typename ValidEdgeMaskT
, typename InputCoverageT
>
776 struct UpdateEdgeMasksInnerConservative
778 INLINE
UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox
)[3], const __m256d
*,
779 const __m128i
, const __m128i
, int32_t &, int32_t &, int32_t &){};
782 //////////////////////////////////////////////////////////////////////////
783 /// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
784 /// are non-degenerate and SVInnerCoverage is requested. Offsets the edges
785 /// evaluated at raster tile corners to inner conservative position and
786 /// updates edge masks
787 template <typename RT
>
788 struct UpdateEdgeMasksInnerConservative
<RT
, AllEdgesValidT
, InnerConservativeCoverageT
>
790 INLINE
UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox
)[3], const __m256d
* vEdgeFix16
,
791 const __m128i vAi
, const __m128i vBi
, int32_t &mask0
, int32_t &mask1
, int32_t &mask2
)
793 __m256d vTempEdge
[3]{vEdgeFix16
[0], vEdgeFix16
[1], vEdgeFix16
[2]};
795 // instead of keeping 2 copies of evaluated edges around, just compensate for the outer
796 // conservative evaluated edge when adjusting the edge in for inner conservative tests
797 adjustEdgeConservative
<RT
, typename
RT::InnerConservativeEdgeOffsetT
>(vAi
, vBi
, vTempEdge
[0]);
798 adjustEdgeConservative
<RT
, typename
RT::InnerConservativeEdgeOffsetT
>(vAi
, vBi
, vTempEdge
[1]);
799 adjustEdgeConservative
<RT
, typename
RT::InnerConservativeEdgeOffsetT
>(vAi
, vBi
, vTempEdge
[2]);
801 UpdateEdgeMasks
<typename
RT::NumRasterSamplesT
>(vEdgeTileBbox
, vTempEdge
, mask0
, mask1
, mask2
);
805 //////////////////////////////////////////////////////////////////////////
806 /// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage
807 /// is requested but at least one edge is degenerate. Since a degenerate triangle cannot
808 /// cover an entire raster tile, set mask0 to 0 to force it down the
809 /// rastierizePartialTile path
810 template <typename RT
, typename ValidEdgeMaskT
>
811 struct UpdateEdgeMasksInnerConservative
<RT
, ValidEdgeMaskT
, InnerConservativeCoverageT
>
813 INLINE
UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d
*,
814 const __m128i
, const __m128i
, int32_t &mask0
, int32_t &, int32_t &)
816 // set one mask to zero to force the triangle down the rastierizePartialTile path
//////////////////////////////////////////////////////////////////////////
/// @brief Rasterizes one binned triangle against a single macrotile.
///        Performs triangle setup (edge equation coefficients, barycentric
///        I/J coefficients, perspective-divided attributes, Z coefficients
///        plus depth bias), intersects the triangle bounding box with the
///        scissor rect and the macrotile, then walks every raster tile in
///        that intersection, building a coverage mask per sample and calling
///        the pixel backend for tiles that have any covered samples.
/// @param pDC - draw context
/// @param workerId - forwarded unchanged to the pixel backend
/// @param macroTile - macrotile ID; converted to x/y via MacroTileMgr::getTileIndices
/// @param pDesc - pointer to the TRIANGLE_WORK_DESC for this triangle
821 template <typename RT
>
822 void RasterizeTriangle(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t macroTile
, void* pDesc
)
824 const TRIANGLE_WORK_DESC
&workDesc
= *((TRIANGLE_WORK_DESC
*)pDesc
);
825 #if KNOB_ENABLE_TOSS_POINTS
826 if (KNOB_TOSS_BIN_TRIS
)
831 RDTSC_START(BERasterizeTriangle
);
833 RDTSC_START(BETriangleSetup
);
834 const API_STATE
&state
= GetApiState(pDC
);
835 const SWR_RASTSTATE
&rastState
= state
.rastState
;
836 const BACKEND_FUNCS
& backendFuncs
= pDC
->pState
->backendFuncs
;
838 OSALIGNSIMD(SWR_TRIANGLE_DESC
) triDesc
;
839 triDesc
.pUserClipBuffer
= workDesc
.pUserClipBuffer
;
841 __m128 vX
, vY
, vZ
, vRecipW
;
843 // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
844 // eg: vX = [x0 x1 x2 dc]
845 vX
= _mm_load_ps(workDesc
.pTriBuffer
);
846 vY
= _mm_load_ps(workDesc
.pTriBuffer
+ 4);
847 vZ
= _mm_load_ps(workDesc
.pTriBuffer
+ 8);
848 vRecipW
= _mm_load_ps(workDesc
.pTriBuffer
+ 12);
850 // convert to fixed point
851 static_assert(std::is_same
<typename
RT::PrecisionT
, FixedPointTraits
<Fixed_16_8
>>::value
, "Rasterizer expects 16.8 fixed point precision");
852 __m128i vXi
= fpToFixedPoint(vX
);
853 __m128i vYi
= fpToFixedPoint(vY
);
855 // quantize floating point position to fixed point precision
856 // to prevent attribute creep around the triangle vertices
857 vX
= _mm_mul_ps(_mm_cvtepi32_ps(vXi
), _mm_set1_ps(1.0f
/ FIXED_POINT_SCALE
));
858 vY
= _mm_mul_ps(_mm_cvtepi32_ps(vYi
), _mm_set1_ps(1.0f
/ FIXED_POINT_SCALE
));
860 // triangle setup - A and B edge equation coefs
862 triangleSetupAB(vX
, vY
, vA
, vB
);
865 triangleSetupABInt(vXi
, vYi
, vAi
, vBi
);
868 float det
= calcDeterminantInt(vAi
, vBi
);
870 // Verts in Pixel Coordinate Space at this point
871 // Det > 0 = CW winding order
872 // Convert CW triangles to CCW
// both the float and the integer A/B coefficients are negated so they stay in agreement
875 vA
= _mm_mul_ps(vA
, _mm_set1_ps(-1));
876 vB
= _mm_mul_ps(vB
, _mm_set1_ps(-1));
877 vAi
= _mm_mullo_epi32(vAi
, _mm_set1_epi32(-1));
878 vBi
= _mm_mullo_epi32(vBi
, _mm_set1_epi32(-1));
883 // Finish triangle setup - C edge coef
884 triangleSetupC(vX
, vY
, vA
, vB
, vC
);
886 if(RT::ValidEdgeMaskT::value
!= ALL_EDGES_VALID
)
888 // If we have degenerate edge(s) to rasterize, set I and J coefs
889 // to 0 for constant interpolation of attributes
897 // Degenerate triangles have no area
898 triDesc
.recipDet
= 0.0f
;
902 // only extract coefs for 2 of the barycentrics; the 3rd can be
903 // determined from the barycentric equation:
904 // i + j + k = 1 <=> k = 1 - j - i
905 _MM_EXTRACT_FLOAT(triDesc
.I
[0], vA
, 1);
906 _MM_EXTRACT_FLOAT(triDesc
.I
[1], vB
, 1);
907 _MM_EXTRACT_FLOAT(triDesc
.I
[2], vC
, 1);
908 _MM_EXTRACT_FLOAT(triDesc
.J
[0], vA
, 2);
909 _MM_EXTRACT_FLOAT(triDesc
.J
[1], vB
, 2);
910 _MM_EXTRACT_FLOAT(triDesc
.J
[2], vC
, 2);
912 // compute recipDet, used to calculate barycentric i and j in the backend
913 triDesc
.recipDet
= 1.0f
/det
;
// OneOverW[0] and OneOverW[1] are stored as differences from vertex 2; OneOverW[2] is vertex 2's value
916 OSALIGNSIMD(float) oneOverW
[4];
917 _mm_store_ps(oneOverW
, vRecipW
);
918 triDesc
.OneOverW
[0] = oneOverW
[0] - oneOverW
[2];
919 triDesc
.OneOverW
[1] = oneOverW
[1] - oneOverW
[2];
920 triDesc
.OneOverW
[2] = oneOverW
[2];
922 // calculate perspective correct coefs per vertex attrib
923 float* pPerspAttribs
= perspAttribsTLS
;
924 float* pAttribs
= workDesc
.pAttribs
;
925 triDesc
.pPerspAttribs
= pPerspAttribs
;
926 triDesc
.pAttribs
= pAttribs
;
927 float *pRecipW
= workDesc
.pTriBuffer
+ 12;
928 triDesc
.pRecipW
= pRecipW
;
// broadcast 1/w of vertex 0, 1 and 2 (consecutive floats starting at pTriBuffer + 12)
929 __m128 vOneOverWV0
= _mm_broadcast_ss(pRecipW
);
930 __m128 vOneOverWV1
= _mm_broadcast_ss(pRecipW
+=1);
931 __m128 vOneOverWV2
= _mm_broadcast_ss(pRecipW
+=1);
932 for(uint32_t i
= 0; i
< workDesc
.numAttribs
; i
++)
934 __m128 attribA
= _mm_load_ps(pAttribs
);
935 __m128 attribB
= _mm_load_ps(pAttribs
+=4);
936 __m128 attribC
= _mm_load_ps(pAttribs
+=4);
939 attribA
= _mm_mul_ps(attribA
, vOneOverWV0
);
940 attribB
= _mm_mul_ps(attribB
, vOneOverWV1
);
941 attribC
= _mm_mul_ps(attribC
, vOneOverWV2
);
943 _mm_store_ps(pPerspAttribs
, attribA
);
944 _mm_store_ps(pPerspAttribs
+=4, attribB
);
945 _mm_store_ps(pPerspAttribs
+=4, attribC
);
950 // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
951 OSALIGNSIMD(float) a
[4];
953 triDesc
.Z
[0] = a
[0] - a
[2];
954 triDesc
.Z
[1] = a
[1] - a
[2];
958 triDesc
.Z
[2] += ComputeDepthBias(&rastState
, &triDesc
, workDesc
.pTriBuffer
+ 8);
960 // Calc bounding box of triangle
961 OSALIGNSIMD(BBOX
) bbox
;
962 calcBoundingBoxInt(vXi
, vYi
, bbox
);
964 if(RT::ValidEdgeMaskT::value
!= ALL_EDGES_VALID
)
966 // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
967 bbox
.left
--; bbox
.right
++; bbox
.top
--; bbox
.bottom
++;
968 SWR_ASSERT(state
.scissorInFixedPoint
.left
>= 0 && state
.scissorInFixedPoint
.top
>= 0,
969 "Conservative rast degenerate handling requires a valid scissor rect");
972 // Intersect with scissor/viewport
// right and bottom use (bbox edge - 1) before being clamped to the scissor
973 OSALIGNSIMD(BBOX
) intersect
;
974 intersect
.left
= std::max(bbox
.left
, state
.scissorInFixedPoint
.left
);
975 intersect
.right
= std::min(bbox
.right
- 1, state
.scissorInFixedPoint
.right
);
976 intersect
.top
= std::max(bbox
.top
, state
.scissorInFixedPoint
.top
);
977 intersect
.bottom
= std::min(bbox
.bottom
- 1, state
.scissorInFixedPoint
.bottom
);
979 triDesc
.triFlags
= workDesc
.triFlags
;
981 // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
982 uint32_t macroX
, macroY
;
983 MacroTileMgr::getTileIndices(macroTile
, macroX
, macroY
);
984 int32_t macroBoxLeft
= macroX
* KNOB_MACROTILE_X_DIM_FIXED
;
985 int32_t macroBoxRight
= macroBoxLeft
+ KNOB_MACROTILE_X_DIM_FIXED
- 1;
986 int32_t macroBoxTop
= macroY
* KNOB_MACROTILE_Y_DIM_FIXED
;
987 int32_t macroBoxBottom
= macroBoxTop
+ KNOB_MACROTILE_Y_DIM_FIXED
- 1;
989 intersect
.left
= std::max(intersect
.left
, macroBoxLeft
);
990 intersect
.top
= std::max(intersect
.top
, macroBoxTop
);
991 intersect
.right
= std::min(intersect
.right
, macroBoxRight
);
992 intersect
.bottom
= std::min(intersect
.bottom
, macroBoxBottom
);
994 SWR_ASSERT(intersect
.left
<= intersect
.right
&& intersect
.top
<= intersect
.bottom
&& intersect
.left
>= 0 && intersect
.right
>= 0 && intersect
.top
>= 0 && intersect
.bottom
>= 0);
996 RDTSC_STOP(BETriangleSetup
, 0, pDC
->drawId
);
998 // update triangle desc
// convert the fixed point intersect box to inclusive raster tile indices
999 uint32_t minTileX
= intersect
.left
>> (KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1000 uint32_t minTileY
= intersect
.top
>> (KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1001 uint32_t maxTileX
= intersect
.right
>> (KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1002 uint32_t maxTileY
= intersect
.bottom
>> (KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1003 uint32_t numTilesX
= maxTileX
- minTileX
+ 1;
1004 uint32_t numTilesY
= maxTileY
- minTileY
+ 1;
1006 if (numTilesX
== 0 || numTilesY
== 0)
1008 RDTSC_EVENT(BEEmptyTriangle
, 1, 0);
1009 RDTSC_STOP(BERasterizeTriangle
, 1, 0);
1013 RDTSC_START(BEStepSetup
);
1015 // Step to pixel center of top-left pixel of the triangle bbox
1016 // Align intersect bbox (top/left) to raster tile's (top/left).
1017 int32_t x
= AlignDown(intersect
.left
, (FIXED_POINT_SCALE
* KNOB_TILE_X_DIM
));
1018 int32_t y
= AlignDown(intersect
.top
, (FIXED_POINT_SCALE
* KNOB_TILE_Y_DIM
));
1020 // convenience typedef
1021 typedef typename
RT::NumRasterSamplesT NumRasterSamplesT
;
1023 // single sample rasterization evaluates edges at pixel center,
1024 // multisample evaluates edges UL pixel corner and steps to each sample position
1025 if(std::is_same
<NumRasterSamplesT
, SingleSampleT
>::value
)
1027 // Add 0.5, in fixed point, to offset to pixel center
1028 x
+= (FIXED_POINT_SCALE
/ 2);
1029 y
+= (FIXED_POINT_SCALE
/ 2);
1032 __m128i vTopLeftX
= _mm_set1_epi32(x
);
1033 __m128i vTopLeftY
= _mm_set1_epi32(y
);
1035 // evaluate edge equations at top-left pixel using 64bit math
1037 // line = Ax + By + C
1040 // we know x0 and y0 are on the line; plug them in:
1042 // plug C back into line equation:
1043 // line = Ax - By - Ax0 - By0
1044 // line = A(x - x0) + B(y - y0)
1045 // dX = (x-x0), dY = (y-y0)
1046 // so all this simplifies to
1047 // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
1049 __m128i vDeltaX
= _mm_sub_epi32(vTopLeftX
, vXi
);
1050 __m128i vDeltaY
= _mm_sub_epi32(vTopLeftY
, vYi
);
1052 // evaluate A(dx) and B(dY) for all points
// operands are widened to double before the multiply
1053 __m256d vAipd
= _mm256_cvtepi32_pd(vAi
);
1054 __m256d vBipd
= _mm256_cvtepi32_pd(vBi
);
1055 __m256d vDeltaXpd
= _mm256_cvtepi32_pd(vDeltaX
);
1056 __m256d vDeltaYpd
= _mm256_cvtepi32_pd(vDeltaY
);
1058 __m256d vAiDeltaXFix16
= _mm256_mul_pd(vAipd
, vDeltaXpd
);
1059 __m256d vBiDeltaYFix16
= _mm256_mul_pd(vBipd
, vDeltaYpd
);
1060 __m256d vEdge
= _mm256_add_pd(vAiDeltaXFix16
, vBiDeltaYFix16
);
1062 // apply any edge adjustments(top-left, crast, etc)
1063 adjustEdgesFix16
<RT
, typename
RT::ConservativeEdgeOffsetT
>(vAi
, vBi
, vEdge
);
1065 // broadcast respective edge results to all lanes
// vEdgeFix16 has 7 slots; only [0..2] are filled here, the rest are handed to ComputeScissorEdges
1066 double* pEdge
= (double*)&vEdge
;
1067 __m256d vEdgeFix16
[7];
1068 vEdgeFix16
[0] = _mm256_set1_pd(pEdge
[0]);
1069 vEdgeFix16
[1] = _mm256_set1_pd(pEdge
[1]);
1070 vEdgeFix16
[2] = _mm256_set1_pd(pEdge
[2]);
1072 OSALIGNSIMD(int32_t) aAi
[4], aBi
[4];
1073 _mm_store_si128((__m128i
*)aAi
, vAi
);
1074 _mm_store_si128((__m128i
*)aBi
, vBi
);
1075 EDGE rastEdges
[RT::NumEdgesT::value
];
1077 // Compute and store triangle edge data
1078 ComputeEdgeData(aAi
[0], aBi
[0], rastEdges
[0]);
1079 ComputeEdgeData(aAi
[1], aBi
[1], rastEdges
[1]);
1080 ComputeEdgeData(aAi
[2], aBi
[2], rastEdges
[2]);
1082 // Compute and store triangle edge data if scissor needs to rasterized
1083 ComputeScissorEdges
<typename
RT::RasterizeScissorEdgesT
, typename
RT::IsConservativeT
, RT
>
1084 (bbox
, state
.scissorInFixedPoint
, x
, y
, rastEdges
, vEdgeFix16
);
1086 // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
1087 // used to for testing if entire raster tile is inside a triangle
1088 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1090 vEdgeFix16
[e
] = _mm256_add_pd(vEdgeFix16
[e
], rastEdges
[e
].vRasterTileOffsets
);
1093 // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
1094 // step sample positions to the raster tile bbox of multisample points
1095 // min(xSamples),min(ySamples) ------ max(xSamples),min(ySamples)
1098 // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples)
1099 __m256d vEdgeTileBbox
[3];
1100 if (NumRasterSamplesT::value
> 1)
1102 __m128i vTileSampleBBoxXh
= RT::MT::TileSampleOffsetsX();
1103 __m128i vTileSampleBBoxYh
= RT::MT::TileSampleOffsetsY();
1105 __m256d vTileSampleBBoxXFix8
= _mm256_cvtepi32_pd(vTileSampleBBoxXh
);
1106 __m256d vTileSampleBBoxYFix8
= _mm256_cvtepi32_pd(vTileSampleBBoxYh
);
1108 // step edge equation tests from Tile
1109 // used to for testing if entire raster tile is inside a triangle
1110 for (uint32_t e
= 0; e
< 3; ++e
)
1112 __m256d vResultAxFix16
= _mm256_mul_pd(_mm256_set1_pd(rastEdges
[e
].a
), vTileSampleBBoxXFix8
);
1113 __m256d vResultByFix16
= _mm256_mul_pd(_mm256_set1_pd(rastEdges
[e
].b
), vTileSampleBBoxYFix8
);
1114 vEdgeTileBbox
[e
] = _mm256_add_pd(vResultAxFix16
, vResultByFix16
);
1116 // adjust for msaa tile bbox edges outward for conservative rast, if enabled
1117 adjustEdgeConservative
<RT
, typename
RT::ConservativeEdgeOffsetT
>(vAi
, vBi
, vEdgeTileBbox
[e
]);
1121 RDTSC_STOP(BEStepSetup
, 0, pDC
->drawId
);
1123 uint32_t tY
= minTileY
;
1124 uint32_t tX
= minTileX
;
1125 uint32_t maxY
= maxTileY
;
1126 uint32_t maxX
= maxTileX
;
// renderBuffers tracks the current tile; currentRenderBufferRow is the row start passed to StepRasterTileY
1128 RenderOutputBuffers renderBuffers
, currentRenderBufferRow
;
1129 GetRenderHotTiles
<RT::MT::numSamples
>(pDC
, macroTile
, minTileX
, minTileY
, renderBuffers
, triDesc
.triFlags
.renderTargetArrayIndex
);
1130 currentRenderBufferRow
= renderBuffers
;
1132 // rasterize and generate coverage masks per sample
1133 for (uint32_t tileY
= tY
; tileY
<= maxY
; ++tileY
)
// remember the edge values at the start of this row so the Y step can restart from them
1135 __m256d vStartOfRowEdge
[RT::NumEdgesT::value
];
1136 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1138 vStartOfRowEdge
[e
] = vEdgeFix16
[e
];
1141 for (uint32_t tileX
= tX
; tileX
<= maxX
; ++tileX
)
1143 triDesc
.anyCoveredSamples
= 0;
1145 // is the corner of the edge outside of the raster tile? (vEdge < 0)
1146 int mask0
, mask1
, mask2
;
1147 UpdateEdgeMasks
<NumRasterSamplesT
>(vEdgeTileBbox
, vEdgeFix16
, mask0
, mask1
, mask2
);
1149 for (uint32_t sampleNum
= 0; sampleNum
< NumRasterSamplesT::value
; sampleNum
++)
1151 // trivial reject, at least one edge has all 4 corners of raster tile outside
1152 bool trivialReject
= TrivialRejectTest
<typename
RT::ValidEdgeMaskT
>(mask0
, mask1
, mask2
);
1156 // trivial accept mask
1157 triDesc
.coverageMask
[sampleNum
] = 0xffffffffffffffffULL
;
1159 // Update the raster tile edge masks based on inner conservative edge offsets, if enabled
1160 UpdateEdgeMasksInnerConservative
<RT
, typename
RT::ValidEdgeMaskT
, typename
RT::InputCoverageT
>
1161 (vEdgeTileBbox
, vEdgeFix16
, vAi
, vBi
, mask0
, mask1
, mask2
);
1163 if (TrivialAcceptTest
<typename
RT::ValidEdgeMaskT
>(mask0
, mask1
, mask2
))
1165 // trivial accept, all 4 corners of all 3 edges are negative
1166 // i.e. raster tile completely inside triangle
1167 triDesc
.anyCoveredSamples
= triDesc
.coverageMask
[sampleNum
];
1168 if(std::is_same
<typename
RT::InputCoverageT
, InnerConservativeCoverageT
>::value
)
1170 triDesc
.innerCoverageMask
= 0xffffffffffffffffULL
;
1172 RDTSC_EVENT(BETrivialAccept
, 1, 0);
1176 __m256d vEdgeAtSample
[RT::NumEdgesT::value
];
1177 if(std::is_same
<NumRasterSamplesT
, SingleSampleT
>::value
)
1179 // should get optimized out for single sample case (global value numbering or copy propagation)
1180 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1182 vEdgeAtSample
[e
] = vEdgeFix16
[e
];
1187 __m128i vSampleOffsetXh
= RT::MT::vXi(sampleNum
);
1188 __m128i vSampleOffsetYh
= RT::MT::vYi(sampleNum
);
1189 __m256d vSampleOffsetX
= _mm256_cvtepi32_pd(vSampleOffsetXh
);
1190 __m256d vSampleOffsetY
= _mm256_cvtepi32_pd(vSampleOffsetYh
);
1192 // step edge equation tests from UL tile corner to pixel sample position
1193 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1195 __m256d vResultAxFix16
= _mm256_mul_pd(_mm256_set1_pd(rastEdges
[e
].a
), vSampleOffsetX
);
1196 __m256d vResultByFix16
= _mm256_mul_pd(_mm256_set1_pd(rastEdges
[e
].b
), vSampleOffsetY
);
1197 vEdgeAtSample
[e
] = _mm256_add_pd(vResultAxFix16
, vResultByFix16
);
1198 vEdgeAtSample
[e
] = _mm256_add_pd(vEdgeFix16
[e
], vEdgeAtSample
[e
]);
// the store mask enables only the lowest 64-bit lane, so exactly one double is written per edge
1202 double startQuadEdges
[RT::NumEdgesT::value
];
1203 const __m256i vLane0Mask
= _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
1204 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1206 _mm256_maskstore_pd(&startQuadEdges
[e
], vLane0Mask
, vEdgeAtSample
[e
]);
1209 // not trivial accept or reject, must rasterize full tile
1210 RDTSC_START(BERasterizePartial
);
1211 triDesc
.coverageMask
[sampleNum
] = rasterizePartialTile
<RT::NumEdgesT::value
, typename
RT::ValidEdgeMaskT
>(pDC
, startQuadEdges
, rastEdges
);
1212 RDTSC_STOP(BERasterizePartial
, 0, 0);
1214 triDesc
.anyCoveredSamples
|= triDesc
.coverageMask
[sampleNum
];
1216 // Output SV InnerCoverage, if needed
1217 GenerateSVInnerCoverage
<RT
, typename
RT::ValidEdgeMaskT
, typename
RT::InputCoverageT
>(pDC
, rastEdges
, startQuadEdges
, triDesc
.innerCoverageMask
);
1222 // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything
1223 if(NumRasterSamplesT::value
> 1)
1225 triDesc
.coverageMask
[sampleNum
] = 0;
1227 RDTSC_EVENT(BETrivialReject
, 1, 0);
1231 #if KNOB_ENABLE_TOSS_POINTS
1234 gToss
= triDesc
.coverageMask
[0];
1238 if(triDesc
.anyCoveredSamples
)
1240 // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
1241 // copy conservative coverage result to all samples
1242 if(RT::IsConservativeT::value
)
1244 auto copyCoverage
= [&](int sample
){triDesc
.coverageMask
[sample
] = triDesc
.coverageMask
[0]; };
1245 UnrollerL
<1, RT::MT::numSamples
, 1>::step(copyCoverage
);
// backend receives the tile origin as tile index shifted by the tile dimension shift
1248 RDTSC_START(BEPixelBackend
);
1249 backendFuncs
.pfnBackend(pDC
, workerId
, tileX
<< KNOB_TILE_X_DIM_SHIFT
, tileY
<< KNOB_TILE_Y_DIM_SHIFT
, triDesc
, renderBuffers
);
1250 RDTSC_STOP(BEPixelBackend
, 0, 0);
1253 // step to the next tile in X
1254 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1256 vEdgeFix16
[e
] = _mm256_add_pd(vEdgeFix16
[e
], _mm256_set1_pd(rastEdges
[e
].stepRasterTileX
));
1258 StepRasterTileX
<RT
>(state
.psState
.numRenderTargets
, renderBuffers
);
1261 // step to the next tile in Y
1262 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1264 vEdgeFix16
[e
] = _mm256_add_pd(vStartOfRowEdge
[e
], _mm256_set1_pd(rastEdges
[e
].stepRasterTileY
));
1266 StepRasterTileY
<RT
>(state
.psState
.numRenderTargets
, renderBuffers
, currentRenderBufferRow
);
1269 RDTSC_STOP(BERasterizeTriangle
, 1, 0);
//////////////////////////////////////////////////////////////////////////
/// @brief Rasterizes a point by bloating it by the point size into two
///        triangles and calling the triangle rasterizer once per triangle.
///        Vertices are written to a local copy of the triangle buffer. When
///        point sprite texcoords are enabled, attributes are copied to a
///        local buffer and the masked texcoord attributes are overwritten
///        per triangle.
/// @param pDC - draw context
/// @param workerId - forwarded to the triangle rasterizer
/// @param macroTile - macrotile ID, forwarded to the triangle rasterizer
/// @param pData - pointer to the TRIANGLE_WORK_DESC describing the point
1272 void RasterizeTriPoint(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void* pData
)
1274 const TRIANGLE_WORK_DESC
& workDesc
= *(const TRIANGLE_WORK_DESC
*)pData
;
1275 const SWR_RASTSTATE
& rastState
= pDC
->pState
->state
.rastState
;
1276 const SWR_BACKEND_STATE
& backendState
= pDC
->pState
->state
.backendState
;
1278 bool isPointSpriteTexCoordEnabled
= backendState
.pointSpriteTexCoordMask
!= 0;
1280 // load point vertex
1281 float x
= *workDesc
.pTriBuffer
;
1282 float y
= *(workDesc
.pTriBuffer
+ 1);
1283 float z
= *(workDesc
.pTriBuffer
+ 2);
1285 // create a copy of the triangle buffer to write our adjusted vertices to
1286 OSALIGNSIMD(float) newTriBuffer
[4 * 4];
1287 TRIANGLE_WORK_DESC newWorkDesc
= workDesc
;
1288 newWorkDesc
.pTriBuffer
= &newTriBuffer
[0];
1290 // create a copy of the attrib buffer to write our adjusted attribs to
1291 OSALIGNSIMD(float) newAttribBuffer
[4 * 3 * KNOB_NUM_ATTRIBUTES
];
1292 newWorkDesc
.pAttribs
= &newAttribBuffer
[0];
1294 newWorkDesc
.pUserClipBuffer
= workDesc
.pUserClipBuffer
;
1295 newWorkDesc
.numAttribs
= workDesc
.numAttribs
;
1296 newWorkDesc
.triFlags
= workDesc
.triFlags
;
1298 // construct two tris by bloating point by point size
1299 float halfPointSize
= workDesc
.triFlags
.pointSize
* 0.5f
;
1300 float lowerX
= x
- halfPointSize
;
1301 float upperX
= x
+ halfPointSize
;
1302 float lowerY
= y
- halfPointSize
;
1303 float upperY
= y
+ halfPointSize
;
1306 float *pBuf
= &newTriBuffer
[0];
// z is replicated to every lane; the following 4 floats are set to 1.0f
1315 _mm_store_ps(pBuf
, _mm_set1_ps(z
));
1316 _mm_store_ps(pBuf
+=4, _mm_set1_ps(1.0f
));
1318 // setup triangle rasterizer function
1319 PFN_WORK_FUNC pfnTriRast
;
1320 // for center sample pattern, all samples are at pixel center; calculate coverage
1321 // once at center and broadcast the results in the backend
1322 uint32_t sampleCount
= (rastState
.samplePattern
== SWR_MSAA_STANDARD_PATTERN
) ? rastState
.sampleCount
: SWR_MULTISAMPLE_1X
;
1323 // conservative rast not supported for points/lines
1324 pfnTriRast
= GetRasterizerFunc(sampleCount
, false, SWR_INPUT_COVERAGE_NONE
, ALL_EDGES_VALID
, (rastState
.scissorEnable
> 0));
1326 // overwrite texcoords for point sprites
1327 if (isPointSpriteTexCoordEnabled
)
1329 // copy original attribs
1330 memcpy(&newAttribBuffer
[0], workDesc
.pAttribs
, 4 * 3 * workDesc
.numAttribs
* sizeof(float));
1331 newWorkDesc
.pAttribs
= &newAttribBuffer
[0];
1333 // overwrite texcoord for point sprites
// visit each set bit of the mask; every attribute occupies 3 consecutive __m128 (one per vertex)
1334 uint32_t texCoordMask
= backendState
.pointSpriteTexCoordMask
;
1335 DWORD texCoordAttrib
= 0;
1337 while (_BitScanForward(&texCoordAttrib
, texCoordMask
))
1339 texCoordMask
&= ~(1 << texCoordAttrib
);
1340 __m128
* pTexAttrib
= (__m128
*)&newAttribBuffer
[0] + 3 * texCoordAttrib
;
1341 if (rastState
.pointSpriteTopOrigin
)
1343 pTexAttrib
[0] = _mm_set_ps(1, 0, 0, 0);
1344 pTexAttrib
[1] = _mm_set_ps(1, 0, 1, 0);
1345 pTexAttrib
[2] = _mm_set_ps(1, 0, 1, 1);
1349 pTexAttrib
[0] = _mm_set_ps(1, 0, 1, 0);
1350 pTexAttrib
[1] = _mm_set_ps(1, 0, 0, 0);
1351 pTexAttrib
[2] = _mm_set_ps(1, 0, 0, 1);
1357 // no texcoord overwrite, can reuse the attrib buffer from frontend
1358 newWorkDesc
.pAttribs
= workDesc
.pAttribs
;
// rasterize the first triangle
1361 pfnTriRast(pDC
, workerId
, macroTile
, (void*)&newWorkDesc
);
1364 pBuf
= &newTriBuffer
[0];
// second triangle: texcoords are rewritten again with a different per-vertex assignment
1374 if (isPointSpriteTexCoordEnabled
)
1376 uint32_t texCoordMask
= backendState
.pointSpriteTexCoordMask
;
1377 DWORD texCoordAttrib
= 0;
1379 while (_BitScanForward(&texCoordAttrib
, texCoordMask
))
1381 texCoordMask
&= ~(1 << texCoordAttrib
);
1382 __m128
* pTexAttrib
= (__m128
*)&newAttribBuffer
[0] + 3 * texCoordAttrib
;
1383 if (rastState
.pointSpriteTopOrigin
)
1385 pTexAttrib
[0] = _mm_set_ps(1, 0, 0, 0);
1386 pTexAttrib
[1] = _mm_set_ps(1, 0, 1, 1);
1387 pTexAttrib
[2] = _mm_set_ps(1, 0, 0, 1);
1392 pTexAttrib
[0] = _mm_set_ps(1, 0, 1, 0);
1393 pTexAttrib
[1] = _mm_set_ps(1, 0, 0, 1);
1394 pTexAttrib
[2] = _mm_set_ps(1, 0, 1, 1);
// rasterize the second triangle
1399 pfnTriRast(pDC
, workerId
, macroTile
, (void*)&newWorkDesc
);
1402 void RasterizeSimplePoint(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void* pData
)
1404 #if KNOB_ENABLE_TOSS_POINTS
1405 if (KNOB_TOSS_BIN_TRIS
)
1411 const TRIANGLE_WORK_DESC
& workDesc
= *(const TRIANGLE_WORK_DESC
*)pData
;
1412 const BACKEND_FUNCS
& backendFuncs
= pDC
->pState
->backendFuncs
;
1414 // map x,y relative offsets from start of raster tile to bit position in
1415 // coverage mask for the point
1416 static const uint32_t coverageMap
[8][8] = {
1417 { 0, 1, 4, 5, 8, 9, 12, 13 },
1418 { 2, 3, 6, 7, 10, 11, 14, 15 },
1419 { 16, 17, 20, 21, 24, 25, 28, 29 },
1420 { 18, 19, 22, 23, 26, 27, 30, 31 },
1421 { 32, 33, 36, 37, 40, 41, 44, 45 },
1422 { 34, 35, 38, 39, 42, 43, 46, 47 },
1423 { 48, 49, 52, 53, 56, 57, 60, 61 },
1424 { 50, 51, 54, 55, 58, 59, 62, 63 }
1427 OSALIGNSIMD(SWR_TRIANGLE_DESC
) triDesc
;
1429 // pull point information from triangle buffer
1430 // @todo use structs for readability
1431 uint32_t tileAlignedX
= *(uint32_t*)workDesc
.pTriBuffer
;
1432 uint32_t tileAlignedY
= *(uint32_t*)(workDesc
.pTriBuffer
+ 1);
1433 float z
= *(workDesc
.pTriBuffer
+ 2);
1435 // construct triangle descriptor for point
1436 // no interpolation, set up i,j for constant interpolation of z and attribs
1437 // @todo implement an optimized backend that doesn't require triangle information
1439 // compute coverage mask from x,y packed into the coverageMask flag
1440 // mask indices by the maximum valid index for x/y of coveragemap.
1441 uint32_t tX
= workDesc
.triFlags
.coverageMask
& 0x7;
1442 uint32_t tY
= (workDesc
.triFlags
.coverageMask
>> 4) & 0x7;
1443 // todo: multisample points?
1444 triDesc
.coverageMask
[0] = 1ULL << coverageMap
[tY
][tX
];
1446 // no persp divide needed for points
1447 triDesc
.pAttribs
= triDesc
.pPerspAttribs
= workDesc
.pAttribs
;
1448 triDesc
.triFlags
= workDesc
.triFlags
;
1449 triDesc
.recipDet
= 1.0f
;
1450 triDesc
.OneOverW
[0] = triDesc
.OneOverW
[1] = triDesc
.OneOverW
[2] = 1.0f
;
1451 triDesc
.I
[0] = triDesc
.I
[1] = triDesc
.I
[2] = 0.0f
;
1452 triDesc
.J
[0] = triDesc
.J
[1] = triDesc
.J
[2] = 0.0f
;
1453 triDesc
.Z
[0] = triDesc
.Z
[1] = triDesc
.Z
[2] = z
;
1455 RenderOutputBuffers renderBuffers
;
1456 GetRenderHotTiles(pDC
, macroTile
, tileAlignedX
>> KNOB_TILE_X_DIM_SHIFT
, tileAlignedY
>> KNOB_TILE_Y_DIM_SHIFT
,
1457 renderBuffers
, triDesc
.triFlags
.renderTargetArrayIndex
);
1459 RDTSC_START(BEPixelBackend
);
1460 backendFuncs
.pfnBackend(pDC
, workerId
, tileAlignedX
, tileAlignedY
, triDesc
, renderBuffers
);
1461 RDTSC_STOP(BEPixelBackend
, 0, 0);
1464 // Get pointers to hot tile memory for color RT, depth, stencil
// Fills renderBuffers with pointers into the hot tiles of the given macrotile.
// tileX/tileY come in as raster tile indices and are first made relative to the
// macrotile. For every color attachment enabled in state.colorHottileEnable, and
// for depth/stencil when enabled, the hot tile is fetched via the hot tile
// manager, marked HOTTILE_DIRTY, and its buffer pointer plus the tile offset is
// stored in renderBuffers.
// numSamples             - template parameter, forwarded to GetHotTile
// pDC                    - draw context
// macroID                - macrotile ID
// renderBuffers          - receives pColor[], pDepth and pStencil
// renderTargetArrayIndex - forwarded to GetHotTile
1465 template <uint32_t numSamples
>
1466 void GetRenderHotTiles(DRAW_CONTEXT
*pDC
, uint32_t macroID
, uint32_t tileX
, uint32_t tileY
, RenderOutputBuffers
&renderBuffers
, uint32_t renderTargetArrayIndex
)
1468 const API_STATE
& state
= GetApiState(pDC
);
1469 SWR_CONTEXT
*pContext
= pDC
->pContext
;
// convert absolute tile indices to indices within this macrotile
1472 MacroTileMgr::getTileIndices(macroID
, mx
, my
);
1473 tileX
-= KNOB_MACROTILE_X_DIM_IN_TILES
* mx
;
1474 tileY
-= KNOB_MACROTILE_Y_DIM_IN_TILES
* my
;
1476 // compute tile offset for active hottile buffers
1477 const uint32_t pitch
= KNOB_MACROTILE_X_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8;
1478 uint32_t offset
= ComputeTileOffset2D
<TilingTraits
<SWR_TILE_SWRZ
, FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
> >(pitch
, tileX
, tileY
);
// iterate over the set bits of the color enable mask, clearing each bit once handled
1481 unsigned long rtSlot
= 0;
1482 uint32_t colorHottileEnableMask
= state
.colorHottileEnable
;
1483 while(_BitScanForward(&rtSlot
, colorHottileEnableMask
))
1485 HOTTILE
*pColor
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroID
, (SWR_RENDERTARGET_ATTACHMENT
)(SWR_ATTACHMENT_COLOR0
+ rtSlot
), true,
1486 numSamples
, renderTargetArrayIndex
);
1487 pColor
->state
= HOTTILE_DIRTY
;
1488 renderBuffers
.pColor
[rtSlot
] = pColor
->pBuffer
+ offset
;
1490 colorHottileEnableMask
&= ~(1 << rtSlot
);
1492 if(state
.depthHottileEnable
)
// pitch/offset are recomputed here with the depth hot tile format
1494 const uint32_t pitch
= KNOB_MACROTILE_X_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8;
1495 uint32_t offset
= ComputeTileOffset2D
<TilingTraits
<SWR_TILE_SWRZ
, FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
> >(pitch
, tileX
, tileY
);
1497 HOTTILE
*pDepth
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroID
, SWR_ATTACHMENT_DEPTH
, true,
1498 numSamples
, renderTargetArrayIndex
);
1499 pDepth
->state
= HOTTILE_DIRTY
;
1500 SWR_ASSERT(pDepth
->pBuffer
!= nullptr);
1501 renderBuffers
.pDepth
= pDepth
->pBuffer
+ offset
;
1503 if(state
.stencilHottileEnable
)
// pitch/offset are recomputed here with the stencil hot tile format
1505 const uint32_t pitch
= KNOB_MACROTILE_X_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8;
1506 uint32_t offset
= ComputeTileOffset2D
<TilingTraits
<SWR_TILE_SWRZ
, FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
> >(pitch
, tileX
, tileY
);
1508 HOTTILE
* pStencil
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroID
, SWR_ATTACHMENT_STENCIL
, true,
1509 numSamples
, renderTargetArrayIndex
);
1510 pStencil
->state
= HOTTILE_DIRTY
;
1511 SWR_ASSERT(pStencil
->pBuffer
!= nullptr);
1512 renderBuffers
.pStencil
= pStencil
->pBuffer
+ offset
;
1516 template <typename RT
>
1517 INLINE
void StepRasterTileX(uint32_t NumRT
, RenderOutputBuffers
&buffers
)
1519 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
1521 buffers
.pColor
[rt
] += RT::colorRasterTileStep
;
1524 buffers
.pDepth
+= RT::depthRasterTileStep
;
1525 buffers
.pStencil
+= RT::stencilRasterTileStep
;
1528 template <typename RT
>
1529 INLINE
void StepRasterTileY(uint32_t NumRT
, RenderOutputBuffers
&buffers
, RenderOutputBuffers
&startBufferRow
)
1531 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
1533 startBufferRow
.pColor
[rt
] += RT::colorRasterTileRowStep
;
1534 buffers
.pColor
[rt
] = startBufferRow
.pColor
[rt
];
1536 startBufferRow
.pDepth
+= RT::depthRasterTileRowStep
;
1537 buffers
.pDepth
= startBufferRow
.pDepth
;
1539 startBufferRow
.pStencil
+= RT::stencilRasterTileRowStep
;
1540 buffers
.pStencil
= startBufferRow
.pStencil
;
1543 void RasterizeLine(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
1545 const TRIANGLE_WORK_DESC
&workDesc
= *((TRIANGLE_WORK_DESC
*)pData
);
1546 #if KNOB_ENABLE_TOSS_POINTS
1547 if (KNOB_TOSS_BIN_TRIS
)
1553 // bloat line to two tris and call the triangle rasterizer twice
1554 RDTSC_START(BERasterizeLine
);
1556 const API_STATE
&state
= GetApiState(pDC
);
1557 const SWR_RASTSTATE
&rastState
= state
.rastState
;
1559 // macrotile dimensioning
1560 uint32_t macroX
, macroY
;
1561 MacroTileMgr::getTileIndices(macroTile
, macroX
, macroY
);
1562 int32_t macroBoxLeft
= macroX
* KNOB_MACROTILE_X_DIM_FIXED
;
1563 int32_t macroBoxRight
= macroBoxLeft
+ KNOB_MACROTILE_X_DIM_FIXED
- 1;
1564 int32_t macroBoxTop
= macroY
* KNOB_MACROTILE_Y_DIM_FIXED
;
1565 int32_t macroBoxBottom
= macroBoxTop
+ KNOB_MACROTILE_Y_DIM_FIXED
- 1;
1567 // create a copy of the triangle buffer to write our adjusted vertices to
1568 OSALIGNSIMD(float) newTriBuffer
[4 * 4];
1569 TRIANGLE_WORK_DESC newWorkDesc
= workDesc
;
1570 newWorkDesc
.pTriBuffer
= &newTriBuffer
[0];
1572 // create a copy of the attrib buffer to write our adjusted attribs to
1573 OSALIGNSIMD(float) newAttribBuffer
[4 * 3 * KNOB_NUM_ATTRIBUTES
];
1574 newWorkDesc
.pAttribs
= &newAttribBuffer
[0];
1576 const __m128 vBloat0
= _mm_set_ps(0.5f
, -0.5f
, -0.5f
, 0.5f
);
1577 const __m128 vBloat1
= _mm_set_ps(0.5f
, 0.5f
, 0.5f
, -0.5f
);
1579 __m128 vX
, vY
, vZ
, vRecipW
;
1581 vX
= _mm_load_ps(workDesc
.pTriBuffer
);
1582 vY
= _mm_load_ps(workDesc
.pTriBuffer
+ 4);
1583 vZ
= _mm_load_ps(workDesc
.pTriBuffer
+ 8);
1584 vRecipW
= _mm_load_ps(workDesc
.pTriBuffer
+ 12);
1587 // v0,v1 -> v0,v0,v1
1588 __m128 vXa
= _mm_shuffle_ps(vX
, vX
, _MM_SHUFFLE(1, 1, 0, 0));
1589 __m128 vYa
= _mm_shuffle_ps(vY
, vY
, _MM_SHUFFLE(1, 1, 0, 0));
1590 __m128 vZa
= _mm_shuffle_ps(vZ
, vZ
, _MM_SHUFFLE(1, 1, 0, 0));
1591 __m128 vRecipWa
= _mm_shuffle_ps(vRecipW
, vRecipW
, _MM_SHUFFLE(1, 1, 0, 0));
1593 __m128 vLineWidth
= _mm_set1_ps(pDC
->pState
->state
.rastState
.lineWidth
);
1594 __m128 vAdjust
= _mm_mul_ps(vLineWidth
, vBloat0
);
1595 if (workDesc
.triFlags
.yMajor
)
1597 vXa
= _mm_add_ps(vAdjust
, vXa
);
1601 vYa
= _mm_add_ps(vAdjust
, vYa
);
1604 // Store triangle description for rasterizer
1605 _mm_store_ps((float*)&newTriBuffer
[0], vXa
);
1606 _mm_store_ps((float*)&newTriBuffer
[4], vYa
);
1607 _mm_store_ps((float*)&newTriBuffer
[8], vZa
);
1608 _mm_store_ps((float*)&newTriBuffer
[12], vRecipWa
);
1610 // binner bins 3 edges for lines as v0, v1, v1
1611 // tri0 needs v0, v0, v1
1612 for (uint32_t a
= 0; a
< workDesc
.numAttribs
; ++a
)
1614 __m128 vAttrib0
= _mm_load_ps(&workDesc
.pAttribs
[a
*12 + 0]);
1615 __m128 vAttrib1
= _mm_load_ps(&workDesc
.pAttribs
[a
*12 + 4]);
1617 _mm_store_ps((float*)&newAttribBuffer
[a
*12 + 0], vAttrib0
);
1618 _mm_store_ps((float*)&newAttribBuffer
[a
*12 + 4], vAttrib0
);
1619 _mm_store_ps((float*)&newAttribBuffer
[a
*12 + 8], vAttrib1
);
1622 // Store user clip distances for triangle 0
1623 float newClipBuffer
[3 * 8];
1624 uint32_t numClipDist
= _mm_popcnt_u32(state
.rastState
.clipDistanceMask
);
1627 newWorkDesc
.pUserClipBuffer
= newClipBuffer
;
1629 float* pOldBuffer
= workDesc
.pUserClipBuffer
;
1630 float* pNewBuffer
= newClipBuffer
;
1631 for (uint32_t i
= 0; i
< numClipDist
; ++i
)
1633 // read barycentric coeffs from binner
1634 float a
= *(pOldBuffer
++);
1635 float b
= *(pOldBuffer
++);
1637 // reconstruct original clip distance at vertices
1641 // construct triangle barycentrics
1642 *(pNewBuffer
++) = c0
- c1
;
1643 *(pNewBuffer
++) = c0
- c1
;
1644 *(pNewBuffer
++) = c1
;
1648 // setup triangle rasterizer function
1649 PFN_WORK_FUNC pfnTriRast
;
1650 uint32_t sampleCount
= (rastState
.samplePattern
== SWR_MSAA_STANDARD_PATTERN
) ? rastState
.sampleCount
: SWR_MULTISAMPLE_1X
;
1651 // conservative rast not supported for points/lines
1652 pfnTriRast
= GetRasterizerFunc(sampleCount
, false, SWR_INPUT_COVERAGE_NONE
, ALL_EDGES_VALID
, (rastState
.scissorEnable
> 0));
1654 // make sure this macrotile intersects the triangle
1655 __m128i vXai
= fpToFixedPoint(vXa
);
1656 __m128i vYai
= fpToFixedPoint(vYa
);
1657 OSALIGNSIMD(BBOX
) bboxA
;
1658 calcBoundingBoxInt(vXai
, vYai
, bboxA
);
1660 if (!(bboxA
.left
> macroBoxRight
||
1661 bboxA
.left
> state
.scissorInFixedPoint
.right
||
1662 bboxA
.right
- 1 < macroBoxLeft
||
1663 bboxA
.right
- 1 < state
.scissorInFixedPoint
.left
||
1664 bboxA
.top
> macroBoxBottom
||
1665 bboxA
.top
> state
.scissorInFixedPoint
.bottom
||
1666 bboxA
.bottom
- 1 < macroBoxTop
||
1667 bboxA
.bottom
- 1 < state
.scissorInFixedPoint
.top
)) {
1668 // rasterize triangle
1669 pfnTriRast(pDC
, workerId
, macroTile
, (void*)&newWorkDesc
);
1673 // v0,v1 -> v1,v1,v0
1674 vXa
= _mm_shuffle_ps(vX
, vX
, _MM_SHUFFLE(1, 0, 1, 1));
1675 vYa
= _mm_shuffle_ps(vY
, vY
, _MM_SHUFFLE(1, 0, 1, 1));
1676 vZa
= _mm_shuffle_ps(vZ
, vZ
, _MM_SHUFFLE(1, 0, 1, 1));
1677 vRecipWa
= _mm_shuffle_ps(vRecipW
, vRecipW
, _MM_SHUFFLE(1, 0, 1, 1));
1679 vAdjust
= _mm_mul_ps(vLineWidth
, vBloat1
);
1680 if (workDesc
.triFlags
.yMajor
)
1682 vXa
= _mm_add_ps(vAdjust
, vXa
);
1686 vYa
= _mm_add_ps(vAdjust
, vYa
);
1689 // Store triangle description for rasterizer
1690 _mm_store_ps((float*)&newTriBuffer
[0], vXa
);
1691 _mm_store_ps((float*)&newTriBuffer
[4], vYa
);
1692 _mm_store_ps((float*)&newTriBuffer
[8], vZa
);
1693 _mm_store_ps((float*)&newTriBuffer
[12], vRecipWa
);
1695 // binner bins 3 edges for lines as v0, v1, v1
1696 // tri1 needs v1, v1, v0
1697 for (uint32_t a
= 0; a
< workDesc
.numAttribs
; ++a
)
1699 __m128 vAttrib0
= _mm_load_ps(&workDesc
.pAttribs
[a
* 12 + 0]);
1700 __m128 vAttrib1
= _mm_load_ps(&workDesc
.pAttribs
[a
* 12 + 4]);
1702 _mm_store_ps((float*)&newAttribBuffer
[a
* 12 + 0], vAttrib1
);
1703 _mm_store_ps((float*)&newAttribBuffer
[a
* 12 + 4], vAttrib1
);
1704 _mm_store_ps((float*)&newAttribBuffer
[a
* 12 + 8], vAttrib0
);
1707 // store user clip distance for triangle 1
1710 float* pOldBuffer
= workDesc
.pUserClipBuffer
;
1711 float* pNewBuffer
= newClipBuffer
;
1712 for (uint32_t i
= 0; i
< numClipDist
; ++i
)
1714 // read barycentric coeffs from binner
1715 float a
= *(pOldBuffer
++);
1716 float b
= *(pOldBuffer
++);
1718 // reconstruct original clip distance at vertices
1722 // construct triangle barycentrics
1723 *(pNewBuffer
++) = c1
- c0
;
1724 *(pNewBuffer
++) = c1
- c0
;
1725 *(pNewBuffer
++) = c0
;
1729 vXai
= fpToFixedPoint(vXa
);
1730 vYai
= fpToFixedPoint(vYa
);
1731 calcBoundingBoxInt(vXai
, vYai
, bboxA
);
1733 if (!(bboxA
.left
> macroBoxRight
||
1734 bboxA
.left
> state
.scissorInFixedPoint
.right
||
1735 bboxA
.right
- 1 < macroBoxLeft
||
1736 bboxA
.right
- 1 < state
.scissorInFixedPoint
.left
||
1737 bboxA
.top
> macroBoxBottom
||
1738 bboxA
.top
> state
.scissorInFixedPoint
.bottom
||
1739 bboxA
.bottom
- 1 < macroBoxTop
||
1740 bboxA
.bottom
- 1 < state
.scissorInFixedPoint
.top
)) {
1741 // rasterize triangle
1742 pfnTriRast(pDC
, workerId
, macroTile
, (void*)&newWorkDesc
);
1745 RDTSC_STOP(BERasterizeLine
, 1, 0);
1748 struct RasterizerChooser
1750 typedef PFN_WORK_FUNC FuncType
;
1752 template <typename
... ArgsB
>
1753 static FuncType
GetFunc()
1755 return RasterizeTriangle
<RasterizerTraits
<ArgsB
...>>;
1759 // Selector for correct templated RasterizeTriangle function
1760 PFN_WORK_FUNC
GetRasterizerFunc(
1761 uint32_t numSamples
,
1762 bool IsConservative
,
1763 uint32_t InputCoverage
,
1764 uint32_t EdgeEnable
,
1765 bool RasterizeScissorEdges
1768 return TemplateArgUnroller
<RasterizerChooser
>::GetFunc(
1769 IntArg
<SWR_MULTISAMPLE_1X
,SWR_MULTISAMPLE_TYPE_COUNT
-1>{numSamples
},
1771 IntArg
<SWR_INPUT_COVERAGE_NONE
, SWR_INPUT_COVERAGE_COUNT
-1>{InputCoverage
},
1772 IntArg
<0, VALID_TRI_EDGE_COUNT
-1>{EdgeEnable
},
1773 RasterizeScissorEdges
);