1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * @file rasterizer.cpp
25 * @brief Implementation for the rasterizer.
27 ******************************************************************************/
#include <cstring>

#include "rasterizer.h"
#include "rdtsc_core.h"
#include "memory/tilingtraits.h"
40 template <uint32_t numSamples
= 1>
41 void GetRenderHotTiles(DRAW_CONTEXT
*pDC
, uint32_t macroID
, uint32_t x
, uint32_t y
, RenderOutputBuffers
&renderBuffers
, uint32_t renderTargetArrayIndex
);
42 template <typename RT
>
43 void StepRasterTileX(uint32_t MaxRT
, RenderOutputBuffers
&buffers
);
44 template <typename RT
>
45 void StepRasterTileY(uint32_t MaxRT
, RenderOutputBuffers
&buffers
, RenderOutputBuffers
&startBufferRow
);
47 #define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
48 const __m256d gMaskToVecpd
[] =
50 MASKTOVEC(0, 0, 0, 0),
51 MASKTOVEC(0, 0, 0, 1),
52 MASKTOVEC(0, 0, 1, 0),
53 MASKTOVEC(0, 0, 1, 1),
54 MASKTOVEC(0, 1, 0, 0),
55 MASKTOVEC(0, 1, 0, 1),
56 MASKTOVEC(0, 1, 1, 0),
57 MASKTOVEC(0, 1, 1, 1),
58 MASKTOVEC(1, 0, 0, 0),
59 MASKTOVEC(1, 0, 0, 1),
60 MASKTOVEC(1, 0, 1, 0),
61 MASKTOVEC(1, 0, 1, 1),
62 MASKTOVEC(1, 1, 0, 0),
63 MASKTOVEC(1, 1, 0, 1),
64 MASKTOVEC(1, 1, 1, 0),
65 MASKTOVEC(1, 1, 1, 1),
75 double a
, b
; // a, b edge coefficients in fix8
76 double stepQuadX
; // step to adjacent horizontal quad in fix16
77 double stepQuadY
; // step to adjacent vertical quad in fix16
78 double stepRasterTileX
; // step to adjacent horizontal raster tile in fix16
79 double stepRasterTileY
; // step to adjacent vertical raster tile in fix16
81 __m256d vQuadOffsets
; // offsets for 4 samples of a quad
82 __m256d vRasterTileOffsets
; // offsets for the 4 corners of a raster tile
85 //////////////////////////////////////////////////////////////////////////
86 /// @brief rasterize a raster tile partially covered by the triangle
87 /// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster tile
88 /// @param vA, vB - A & B coefs for each edge of the triangle (Ax + Bx + C)
89 /// @param vStepQuad0-2 - edge equations evaluated at the UL corners of the 2x2 pixel quad.
90 /// Used to step between quads when sweeping over the raster tile.
91 template<uint32_t NumEdges
, typename EdgeMaskT
>
92 INLINE
uint64_t rasterizePartialTile(DRAW_CONTEXT
*pDC
, double startEdges
[NumEdges
], EDGE
*pRastEdges
)
94 uint64_t coverageMask
= 0;
96 __m256d vEdges
[NumEdges
];
97 __m256d vStepX
[NumEdges
];
98 __m256d vStepY
[NumEdges
];
100 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
102 // Step to the pixel sample locations of the 1st quad
103 vEdges
[e
] = _mm256_add_pd(_mm256_set1_pd(startEdges
[e
]), pRastEdges
[e
].vQuadOffsets
);
105 // compute step to next quad (mul by 2 in x and y direction)
106 vStepX
[e
] = _mm256_set1_pd(pRastEdges
[e
].stepQuadX
);
107 vStepY
[e
] = _mm256_set1_pd(pRastEdges
[e
].stepQuadY
);
110 // fast unrolled version for 8x8 tile
111 #if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
112 int edgeMask
[NumEdges
];
115 auto eval_lambda
= [&](int e
){edgeMask
[e
] = _mm256_movemask_pd(vEdges
[e
]);};
116 auto update_lambda
= [&](int e
){mask
&= edgeMask
[e
];};
117 auto incx_lambda
= [&](int e
){vEdges
[e
] = _mm256_add_pd(vEdges
[e
], vStepX
[e
]);};
118 auto incy_lambda
= [&](int e
){vEdges
[e
] = _mm256_add_pd(vEdges
[e
], vStepY
[e
]);};
119 auto decx_lambda
= [&](int e
){vEdges
[e
] = _mm256_sub_pd(vEdges
[e
], vStepX
[e
]);};
121 // evaluate which pixels in the quad are covered
123 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
125 // update coverage mask
126 // if edge 0 is degenerate and will be skipped; init the mask
127 #define UPDATE_MASK(bit) \
128 if(std::is_same<EdgeMaskT, E1E2ValidT>::value || std::is_same<EdgeMaskT, NoEdgesValidT>::value){\
132 mask = edgeMask[0]; \
134 UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
135 coverageMask |= (mask << bit);
137 // step in the +x direction to the next quad
139 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
141 // step in the +y direction to the next quad
143 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
145 // step in the -x direction to the next quad
147 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
149 // sweep 2x2 quad back and forth through the raster tile,
150 // computing coverage masks for the entire tile
155 // x x ------------------>
157 // <-----------------x x V
216 for (uint32_t y
= 0; y
< KNOB_TILE_Y_DIM
/2; ++y
)
218 __m256d vStartOfRowEdge
[NumEdges
];
219 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
221 vStartOfRowEdge
[e
] = vEdges
[e
];
224 for (uint32_t x
= 0; x
< KNOB_TILE_X_DIM
/2; ++x
)
226 int edgeMask
[NumEdges
];
227 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
229 edgeMask
[e
] = _mm256_movemask_pd(vEdges
[e
]);
232 uint64_t mask
= edgeMask
[0];
233 for (uint32_t e
= 1; e
< NumEdges
; ++e
)
237 coverageMask
|= (mask
<< bit
);
239 // step to the next pixel in the x
240 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
242 vEdges
[e
] = _mm256_add_pd(vEdges
[e
], vStepX
[e
]);
247 // step to the next row
248 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
250 vEdges
[e
] = _mm256_add_pd(vStartOfRowEdge
[e
], vStepY
[e
]);
258 // Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
259 // Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge
260 // Top left: a sample is in if it is a top or left edge.
261 // Out: !(horizontal && above) = !horizontal && below
262 // Out: !horizontal && left = !(!horizontal && left) = horizontal and right
263 INLINE
void adjustTopLeftRuleIntFix16(const __m128i vA
, const __m128i vB
, __m256d
&vEdge
)
266 // if vA == 0 && vB < 0, vC--
268 __m256d vEdgeOut
= vEdge
;
269 __m256d vEdgeAdjust
= _mm256_sub_pd(vEdge
, _mm256_set1_pd(1.0));
271 // if vA < 0 (line is not horizontal and below)
272 int msk
= _mm_movemask_ps(_mm_castsi128_ps(vA
));
274 // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
275 __m128i vCmp
= _mm_cmpeq_epi32(vA
, _mm_setzero_si128());
276 int msk2
= _mm_movemask_ps(_mm_castsi128_ps(vCmp
));
277 msk2
&= _mm_movemask_ps(_mm_castsi128_ps(vB
));
279 // if either of these are true and we're on the line (edge == 0), bump it outside the line
280 vEdge
= _mm256_blendv_pd(vEdgeOut
, vEdgeAdjust
, gMaskToVecpd
[msk
| msk2
]);
//////////////////////////////////////////////////////////////////////////
/// @brief calculates difference in precision between the result of manh
/// calculation and the edge precision, based on compile time trait values
/// @tparam RT - rasterizer traits; must expose PrecisionT, ConservativePrecisionT
///         and EdgePrecisionT, each with a nested BitsT::value
/// @return (PrecisionT bits + ConservativePrecisionT bits) - EdgePrecisionT bits;
///         a negative result is rejected at compile time
template<typename RT>
constexpr int64_t ManhToEdgePrecisionAdjust()
{
    static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
                  "Inadequate precision of result of manh calculation ");
    return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) - RT::EdgePrecisionT::BitsT::value);
}
294 //////////////////////////////////////////////////////////////////////////
295 /// @struct adjustEdgeConservative
296 /// @brief Primary template definition used for partially specializing
297 /// the adjustEdgeConservative function. This struct should never
299 /// @tparam RT: rasterizer traits
300 /// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
301 template <typename RT
, typename ConservativeEdgeOffsetT
>
302 struct adjustEdgeConservative
304 //////////////////////////////////////////////////////////////////////////
305 /// @brief Performs calculations to adjust each edge of a triangle away
306 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
309 /// Uncertainty regions arise from fixed point rounding, which
310 /// can snap a vertex +/- by min fixed point value.
311 /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
312 /// This allows the rasterizer to test for coverage only at the pixel center,
313 /// instead of having to test individual pixel corners for conservative coverage
314 INLINE
adjustEdgeConservative(const __m128i
&vAi
, const __m128i
&vBi
, __m256d
&vEdge
)
316 // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away
317 // from the pixel center (in the direction of the edge normal A/B)
319 // edge = Ax + Bx + C - (manh/e)
320 // manh = manhattan distance = abs(A) + abs(B)
321 // e = absolute rounding error from snapping from float to fixed point precision
323 // 'fixed point' multiply (in double to be avx1 friendly)
324 // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
325 __m256d vAai
= _mm256_cvtepi32_pd(_mm_abs_epi32(vAi
)), vBai
= _mm256_cvtepi32_pd(_mm_abs_epi32(vBi
));
326 __m256d manh
= _mm256_add_pd(_mm256_mul_pd(vAai
, _mm256_set1_pd(ConservativeEdgeOffsetT::value
)),
327 _mm256_mul_pd(vBai
, _mm256_set1_pd(ConservativeEdgeOffsetT::value
)));
329 static_assert(RT::PrecisionT::BitsT::value
+ RT::ConservativePrecisionT::BitsT::value
>= RT::EdgePrecisionT::BitsT::value
,
330 "Inadequate precision of result of manh calculation ");
332 // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision
333 // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right
334 manh
= _mm256_mul_pd(manh
, _mm256_set1_pd(ManhToEdgePrecisionAdjust
<RT
>() * 0.5));
336 // move the edge away from the pixel center by the required conservative precision + 1/2 pixel
337 // this allows the rasterizer to do a single conservative coverage test to see if the primitive
338 // intersects the pixel at all
339 vEdge
= _mm256_sub_pd(vEdge
, manh
);
343 //////////////////////////////////////////////////////////////////////////
344 /// @brief adjustEdgeConservative specialization where no edge offset is needed
345 template <typename RT
>
346 struct adjustEdgeConservative
<RT
, std::integral_constant
<int32_t, 0>>
348 INLINE
adjustEdgeConservative(const __m128i
&vAi
, const __m128i
&vBi
, __m256d
&vEdge
) {};
351 //////////////////////////////////////////////////////////////////////////
352 /// @brief calculates the distance a degenerate BBox needs to be adjusted
353 /// for conservative rast based on compile time trait values
354 template<typename RT
>
355 constexpr int64_t ConservativeScissorOffset()
357 static_assert(RT::ConservativePrecisionT::BitsT::value
- RT::PrecisionT::BitsT::value
>= 0, "Rasterizer precision > conservative precision");
358 // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges
359 typedef std::integral_constant
<int32_t, (RT::ValidEdgeMaskT::value
== ALL_EDGES_VALID
) ? 0 : 1> DegenerateEdgeOffsetT
;
360 // 1/2 pixel edge offset + conservative offset - degenerateTriangle
361 return RT::ConservativeEdgeOffsetT::value
- (DegenerateEdgeOffsetT::value
<< (RT::ConservativePrecisionT::BitsT::value
- RT::PrecisionT::BitsT::value
));
364 //////////////////////////////////////////////////////////////////////////
365 /// @brief Performs calculations to adjust each a vector of evaluated edges out
366 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
368 template <typename RT
>
369 INLINE
void adjustScissorEdge(const double a
, const double b
, __m256d
&vEdge
)
371 int64_t aabs
= std::abs(static_cast<int64_t>(a
)), babs
= std::abs(static_cast<int64_t>(b
));
372 int64_t manh
= ((aabs
* ConservativeScissorOffset
<RT
>()) + (babs
* ConservativeScissorOffset
<RT
>())) >> ManhToEdgePrecisionAdjust
<RT
>();
373 vEdge
= _mm256_sub_pd(vEdge
, _mm256_set1_pd(manh
));
376 //////////////////////////////////////////////////////////////////////////
377 /// @brief Performs calculations to adjust each a scalar evaluated edge out
378 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
380 template <typename RT
, typename OffsetT
>
381 INLINE
double adjustScalarEdge(const double a
, const double b
, const double Edge
)
383 int64_t aabs
= std::abs(static_cast<int64_t>(a
)), babs
= std::abs(static_cast<int64_t>(b
));
384 int64_t manh
= ((aabs
* OffsetT::value
) + (babs
* OffsetT::value
)) >> ManhToEdgePrecisionAdjust
<RT
>();
385 return (Edge
- manh
);
388 //////////////////////////////////////////////////////////////////////////
389 /// @brief Perform any needed adjustments to evaluated triangle edges
390 template <typename RT
, typename EdgeOffsetT
>
391 struct adjustEdgesFix16
393 INLINE
adjustEdgesFix16(const __m128i
&vAi
, const __m128i
&vBi
, __m256d
&vEdge
)
395 static_assert(std::is_same
<typename
RT::EdgePrecisionT
, FixedPointTraits
<Fixed_X_16
>>::value
,
396 "Edge equation expected to be in x.16 fixed point");
398 static_assert(RT::IsConservativeT::value
, "Edge offset assumes conservative rasterization is enabled");
400 // need to apply any edge offsets before applying the top-left rule
401 adjustEdgeConservative
<RT
, EdgeOffsetT
>(vAi
, vBi
, vEdge
);
403 adjustTopLeftRuleIntFix16(vAi
, vBi
, vEdge
);
407 //////////////////////////////////////////////////////////////////////////
408 /// @brief Perform top left adjustments to evaluated triangle edges
409 template <typename RT
>
410 struct adjustEdgesFix16
<RT
, std::integral_constant
<int32_t, 0>>
412 INLINE
adjustEdgesFix16(const __m128i
&vAi
, const __m128i
&vBi
, __m256d
&vEdge
)
414 adjustTopLeftRuleIntFix16(vAi
, vBi
, vEdge
);
418 // max(abs(dz/dx), abs(dz,dy)
419 INLINE
float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC
* pDesc
)
422 // evaluate i,j at (0,0)
423 float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
424 float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
426 // evaluate i,j at (1,0)
427 float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
428 float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
431 float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
432 float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
433 float dzdx = abs(d10 - d00);
435 // evaluate i,j at (0,1)
436 float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
437 float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
439 float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
440 float dzdy = abs(d01 - d00);
443 // optimized version of above
444 float dzdx
= fabsf(pDesc
->recipDet
* (pDesc
->Z
[0] * pDesc
->I
[0] + pDesc
->Z
[1] * pDesc
->J
[0]));
445 float dzdy
= fabsf(pDesc
->recipDet
* (pDesc
->Z
[0] * pDesc
->I
[1] + pDesc
->Z
[1] * pDesc
->J
[1]));
447 return std::max(dzdx
, dzdy
);
450 INLINE
float ComputeBiasFactor(const SWR_RASTSTATE
* pState
, const SWR_TRIANGLE_DESC
* pDesc
, const float* z
)
452 if (pState
->depthFormat
== R24_UNORM_X8_TYPELESS
)
454 return (1.0f
/ (1 << 24));
456 else if (pState
->depthFormat
== R16_UNORM
)
458 return (1.0f
/ (1 << 16));
462 SWR_ASSERT(pState
->depthFormat
== R32_FLOAT
);
464 // for f32 depth, factor = 2^(exponent(max(abs(z) - 23)
465 float zMax
= std::max(fabsf(z
[0]), std::max(fabsf(z
[1]), fabsf(z
[2])));
466 uint32_t zMaxInt
= *(uint32_t*)&zMax
;
467 zMaxInt
&= 0x7f800000;
468 zMax
= *(float*)&zMaxInt
;
470 return zMax
* (1.0f
/ (1 << 23));
474 INLINE
float ComputeDepthBias(const SWR_RASTSTATE
* pState
, const SWR_TRIANGLE_DESC
* pTri
, const float* z
)
476 if (pState
->depthBias
== 0 && pState
->slopeScaledDepthBias
== 0)
481 float scale
= pState
->slopeScaledDepthBias
;
484 scale
*= ComputeMaxDepthSlope(pTri
);
487 float bias
= pState
->depthBias
;
488 if (!pState
->depthBiasPreAdjusted
)
490 bias
*= ComputeBiasFactor(pState
, pTri
, z
);
494 if (pState
->depthBiasClamp
> 0.0f
)
496 bias
= std::min(bias
, pState
->depthBiasClamp
);
498 else if (pState
->depthBiasClamp
< 0.0f
)
500 bias
= std::max(bias
, pState
->depthBiasClamp
);
506 // Prevent DCE by writing coverage mask from rasterizer to volatile
507 #if KNOB_ENABLE_TOSS_POINTS
508 __declspec(thread
) volatile uint64_t gToss
;
511 static const uint32_t vertsPerTri
= 3, componentsPerAttrib
= 4;
512 // try to avoid _chkstk insertions; make this thread local
513 static THREAD
OSALIGNLINE(float) perspAttribsTLS
[vertsPerTri
* KNOB_NUM_ATTRIBUTES
* componentsPerAttrib
];
516 void ComputeEdgeData(int32_t a
, int32_t b
, EDGE
& edge
)
521 // compute constant steps to adjacent quads
522 edge
.stepQuadX
= (double)((int64_t)a
* (int64_t)(2 * FIXED_POINT_SCALE
));
523 edge
.stepQuadY
= (double)((int64_t)b
* (int64_t)(2 * FIXED_POINT_SCALE
));
525 // compute constant steps to adjacent raster tiles
526 edge
.stepRasterTileX
= (double)((int64_t)a
* (int64_t)(KNOB_TILE_X_DIM
* FIXED_POINT_SCALE
));
527 edge
.stepRasterTileY
= (double)((int64_t)b
* (int64_t)(KNOB_TILE_Y_DIM
* FIXED_POINT_SCALE
));
529 // compute quad offsets
530 const __m256d vQuadOffsetsXIntFix8
= _mm256_set_pd(FIXED_POINT_SCALE
, 0, FIXED_POINT_SCALE
, 0);
531 const __m256d vQuadOffsetsYIntFix8
= _mm256_set_pd(FIXED_POINT_SCALE
, FIXED_POINT_SCALE
, 0, 0);
533 __m256d vQuadStepXFix16
= _mm256_mul_pd(_mm256_set1_pd(edge
.a
), vQuadOffsetsXIntFix8
);
534 __m256d vQuadStepYFix16
= _mm256_mul_pd(_mm256_set1_pd(edge
.b
), vQuadOffsetsYIntFix8
);
535 edge
.vQuadOffsets
= _mm256_add_pd(vQuadStepXFix16
, vQuadStepYFix16
);
537 // compute raster tile offsets
538 const __m256d vTileOffsetsXIntFix8
= _mm256_set_pd((KNOB_TILE_X_DIM
- 1)*FIXED_POINT_SCALE
, 0, (KNOB_TILE_X_DIM
- 1)*FIXED_POINT_SCALE
, 0);
539 const __m256d vTileOffsetsYIntFix8
= _mm256_set_pd((KNOB_TILE_Y_DIM
- 1)*FIXED_POINT_SCALE
, (KNOB_TILE_Y_DIM
- 1)*FIXED_POINT_SCALE
, 0, 0);
541 __m256d vTileStepXFix16
= _mm256_mul_pd(_mm256_set1_pd(edge
.a
), vTileOffsetsXIntFix8
);
542 __m256d vTileStepYFix16
= _mm256_mul_pd(_mm256_set1_pd(edge
.b
), vTileOffsetsYIntFix8
);
543 edge
.vRasterTileOffsets
= _mm256_add_pd(vTileStepXFix16
, vTileStepYFix16
);
547 void ComputeEdgeData(const POS
& p0
, const POS
& p1
, EDGE
& edge
)
549 ComputeEdgeData(p0
.y
- p1
.y
, p1
.x
- p0
.x
, edge
);
552 //////////////////////////////////////////////////////////////////////////
553 /// @brief Primary template definition used for partially specializing
554 /// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel
555 /// corner to sample position, and test for coverage
556 /// @tparam sampleCount: multisample count
557 template <typename NumSamplesT
>
558 INLINE
void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox
)[3], const __m256d
* vEdgeFix16
,
559 int32_t &mask0
, int32_t &mask1
, int32_t &mask2
)
561 __m256d vSampleBboxTest0
, vSampleBboxTest1
, vSampleBboxTest2
;
562 // evaluate edge equations at the tile multisample bounding box
563 vSampleBboxTest0
= _mm256_add_pd(vEdgeTileBbox
[0], vEdgeFix16
[0]);
564 vSampleBboxTest1
= _mm256_add_pd(vEdgeTileBbox
[1], vEdgeFix16
[1]);
565 vSampleBboxTest2
= _mm256_add_pd(vEdgeTileBbox
[2], vEdgeFix16
[2]);
566 mask0
= _mm256_movemask_pd(vSampleBboxTest0
);
567 mask1
= _mm256_movemask_pd(vSampleBboxTest1
);
568 mask2
= _mm256_movemask_pd(vSampleBboxTest2
);
571 //////////////////////////////////////////////////////////////////////////
572 /// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
573 /// when only rasterizing a single coverage test point
575 INLINE
void UpdateEdgeMasks
<SingleSampleT
>(const __m256d(&)[3], const __m256d
* vEdgeFix16
,
576 int32_t &mask0
, int32_t &mask1
, int32_t &mask2
)
578 mask0
= _mm256_movemask_pd(vEdgeFix16
[0]);
579 mask1
= _mm256_movemask_pd(vEdgeFix16
[1]);
580 mask2
= _mm256_movemask_pd(vEdgeFix16
[2]);
583 //////////////////////////////////////////////////////////////////////////
584 /// @struct ComputeScissorEdges
585 /// @brief Primary template definition. Allows the function to be generically
586 /// called. When paired with below specializations, will result in an empty
587 /// inlined function if scissor is not enabled
588 /// @tparam RasterScissorEdgesT: is scissor enabled?
589 /// @tparam IsConservativeT: is conservative rast enabled?
590 /// @tparam RT: rasterizer traits
591 template <typename RasterScissorEdgesT
, typename IsConservativeT
, typename RT
>
592 struct ComputeScissorEdges
594 INLINE
ComputeScissorEdges(const SWR_RECT
&triBBox
, const SWR_RECT
&scissorBBox
, const int32_t x
, const int32_t y
,
595 EDGE (&rastEdges
)[RT::NumEdgesT::value
], __m256d (&vEdgeFix16
)[7]){};
598 //////////////////////////////////////////////////////////////////////////
599 /// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
600 /// specialization. Instantiated when conservative rast and scissor are enabled
601 template <typename RT
>
602 struct ComputeScissorEdges
<std::true_type
, std::true_type
, RT
>
604 //////////////////////////////////////////////////////////////////////////
605 /// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
606 /// evaluate edge equations and offset them away from pixel center.
607 INLINE
ComputeScissorEdges(const SWR_RECT
&triBBox
, const SWR_RECT
&scissorBBox
, const int32_t x
, const int32_t y
,
608 EDGE (&rastEdges
)[RT::NumEdgesT::value
], __m256d (&vEdgeFix16
)[7])
610 // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
612 scissor
.xmin
= std::max(triBBox
.xmin
, scissorBBox
.xmin
);
613 scissor
.xmax
= std::min(triBBox
.xmax
, scissorBBox
.xmax
);
614 scissor
.ymin
= std::max(triBBox
.ymin
, scissorBBox
.ymin
);
615 scissor
.ymax
= std::min(triBBox
.ymax
, scissorBBox
.ymax
);
617 POS topLeft
{scissor
.xmin
, scissor
.ymin
};
618 POS bottomLeft
{scissor
.xmin
, scissor
.ymax
};
619 POS topRight
{scissor
.xmax
, scissor
.ymin
};
620 POS bottomRight
{scissor
.xmax
, scissor
.ymax
};
622 // construct 4 scissor edges in ccw direction
623 ComputeEdgeData(topLeft
, bottomLeft
, rastEdges
[3]);
624 ComputeEdgeData(bottomLeft
, bottomRight
, rastEdges
[4]);
625 ComputeEdgeData(bottomRight
, topRight
, rastEdges
[5]);
626 ComputeEdgeData(topRight
, topLeft
, rastEdges
[6]);
628 vEdgeFix16
[3] = _mm256_set1_pd((rastEdges
[3].a
* (x
- scissor
.xmin
)) + (rastEdges
[3].b
* (y
- scissor
.ymin
)));
629 vEdgeFix16
[4] = _mm256_set1_pd((rastEdges
[4].a
* (x
- scissor
.xmin
)) + (rastEdges
[4].b
* (y
- scissor
.ymax
)));
630 vEdgeFix16
[5] = _mm256_set1_pd((rastEdges
[5].a
* (x
- scissor
.xmax
)) + (rastEdges
[5].b
* (y
- scissor
.ymax
)));
631 vEdgeFix16
[6] = _mm256_set1_pd((rastEdges
[6].a
* (x
- scissor
.xmax
)) + (rastEdges
[6].b
* (y
- scissor
.ymin
)));
633 // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing
634 adjustScissorEdge
<RT
>(rastEdges
[3].a
, rastEdges
[3].b
, vEdgeFix16
[3]);
635 adjustScissorEdge
<RT
>(rastEdges
[4].a
, rastEdges
[4].b
, vEdgeFix16
[4]);
636 adjustScissorEdge
<RT
>(rastEdges
[5].a
, rastEdges
[5].b
, vEdgeFix16
[5]);
637 adjustScissorEdge
<RT
>(rastEdges
[6].a
, rastEdges
[6].b
, vEdgeFix16
[6]);
639 // Upper left rule for scissor
640 vEdgeFix16
[3] = _mm256_sub_pd(vEdgeFix16
[3], _mm256_set1_pd(1.0));
641 vEdgeFix16
[6] = _mm256_sub_pd(vEdgeFix16
[6], _mm256_set1_pd(1.0));
645 //////////////////////////////////////////////////////////////////////////
646 /// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
647 /// specialization. Instantiated when scissor is enabled and conservative rast
649 template <typename RT
>
650 struct ComputeScissorEdges
<std::true_type
, std::false_type
, RT
>
652 //////////////////////////////////////////////////////////////////////////
653 /// @brief Compute scissor edge vectors and evaluate edge equations
654 INLINE
ComputeScissorEdges(const SWR_RECT
&, const SWR_RECT
&scissorBBox
, const int32_t x
, const int32_t y
,
655 EDGE (&rastEdges
)[RT::NumEdgesT::value
], __m256d (&vEdgeFix16
)[7])
657 const SWR_RECT
&scissor
= scissorBBox
;
658 POS topLeft
{scissor
.xmin
, scissor
.ymin
};
659 POS bottomLeft
{scissor
.xmin
, scissor
.ymax
};
660 POS topRight
{scissor
.xmax
, scissor
.ymin
};
661 POS bottomRight
{scissor
.xmax
, scissor
.ymax
};
663 // construct 4 scissor edges in ccw direction
664 ComputeEdgeData(topLeft
, bottomLeft
, rastEdges
[3]);
665 ComputeEdgeData(bottomLeft
, bottomRight
, rastEdges
[4]);
666 ComputeEdgeData(bottomRight
, topRight
, rastEdges
[5]);
667 ComputeEdgeData(topRight
, topLeft
, rastEdges
[6]);
669 vEdgeFix16
[3] = _mm256_set1_pd((rastEdges
[3].a
* (x
- scissor
.xmin
)) + (rastEdges
[3].b
* (y
- scissor
.ymin
)));
670 vEdgeFix16
[4] = _mm256_set1_pd((rastEdges
[4].a
* (x
- scissor
.xmin
)) + (rastEdges
[4].b
* (y
- scissor
.ymax
)));
671 vEdgeFix16
[5] = _mm256_set1_pd((rastEdges
[5].a
* (x
- scissor
.xmax
)) + (rastEdges
[5].b
* (y
- scissor
.ymax
)));
672 vEdgeFix16
[6] = _mm256_set1_pd((rastEdges
[6].a
* (x
- scissor
.xmax
)) + (rastEdges
[6].b
* (y
- scissor
.ymin
)));
674 // Upper left rule for scissor
675 vEdgeFix16
[3] = _mm256_sub_pd(vEdgeFix16
[3], _mm256_set1_pd(1.0));
676 vEdgeFix16
[6] = _mm256_sub_pd(vEdgeFix16
[6], _mm256_set1_pd(1.0));
680 //////////////////////////////////////////////////////////////////////////
681 /// @brief Primary function template for TrivialRejectTest. Should
682 /// never be called, but TemplateUnroller instantiates a few unused values,
683 /// so it calls a runtime assert instead of a static_assert.
684 template <typename ValidEdgeMaskT
>
685 INLINE
bool TrivialRejectTest(const int, const int, const int)
687 SWR_INVALID("Primary templated function should never be called");
691 //////////////////////////////////////////////////////////////////////////
692 /// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
693 /// and edge 1 for trivial coverage reject
695 INLINE
bool TrivialRejectTest
<E0E1ValidT
>(const int mask0
, const int mask1
, const int)
697 return (!(mask0
&& mask1
)) ? true : false;
700 //////////////////////////////////////////////////////////////////////////
701 /// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
702 /// and edge 2 for trivial coverage reject
704 INLINE
bool TrivialRejectTest
<E0E2ValidT
>(const int mask0
, const int, const int mask2
)
706 return (!(mask0
&& mask2
)) ? true : false;
709 //////////////////////////////////////////////////////////////////////////
710 /// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
711 /// and edge 2 for trivial coverage reject
713 INLINE
bool TrivialRejectTest
<E1E2ValidT
>(const int, const int mask1
, const int mask2
)
715 return (!(mask1
&& mask2
)) ? true : false;
718 //////////////////////////////////////////////////////////////////////////
719 /// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
720 /// primitive edges for trivial coverage reject
722 INLINE
bool TrivialRejectTest
<AllEdgesValidT
>(const int mask0
, const int mask1
, const int mask2
)
724 return (!(mask0
&& mask1
&& mask2
)) ? true : false;;
727 //////////////////////////////////////////////////////////////////////////
728 /// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
729 /// point, so return false and rasterize against conservative BBox
731 INLINE
bool TrivialRejectTest
<NoEdgesValidT
>(const int, const int, const int)
736 //////////////////////////////////////////////////////////////////////////
737 /// @brief Primary function template for TrivialAcceptTest. Always returns
738 /// false, since it will only be called for degenerate tris, and as such
739 /// will never cover the entire raster tile
740 template <typename ScissorEnableT
>
741 INLINE
bool TrivialAcceptTest(const int, const int, const int)
746 //////////////////////////////////////////////////////////////////////////
747 /// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all
748 /// edge masks for a fully covered raster tile
750 INLINE
bool TrivialAcceptTest
<std::false_type
>(const int mask0
, const int mask1
, const int mask2
)
752 return ((mask0
& mask1
& mask2
) == 0xf);
755 //////////////////////////////////////////////////////////////////////////
756 /// @brief Primary function template for GenerateSVInnerCoverage. Results
757 /// in an empty function call if SVInnerCoverage isn't requested
758 template <typename RT
, typename ValidEdgeMaskT
, typename InputCoverageT
>
759 struct GenerateSVInnerCoverage
761 INLINE
GenerateSVInnerCoverage(DRAW_CONTEXT
*, uint32_t, EDGE
*, double*, uint64_t &){};
764 //////////////////////////////////////////////////////////////////////////
765 /// @brief Specialization of GenerateSVInnerCoverage where all edges
766 /// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated
767 /// edge values from OuterConservative to InnerConservative and rasterizes.
768 template <typename RT
>
769 struct GenerateSVInnerCoverage
<RT
, AllEdgesValidT
, InnerConservativeCoverageT
>
771 INLINE
GenerateSVInnerCoverage(DRAW_CONTEXT
* pDC
, uint32_t workerId
, EDGE
* pRastEdges
, double* pStartQuadEdges
, uint64_t &innerCoverageMask
)
773 SWR_CONTEXT
*pContext
= pDC
->pContext
;
775 double startQuadEdgesAdj
[RT::NumEdgesT::value
];
776 for(uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
778 startQuadEdgesAdj
[e
] = adjustScalarEdge
<RT
, typename
RT::InnerConservativeEdgeOffsetT
>(pRastEdges
[e
].a
, pRastEdges
[e
].b
, pStartQuadEdges
[e
]);
781 // not trivial accept or reject, must rasterize full tile
782 AR_BEGIN(BERasterizePartial
, pDC
->drawId
);
783 innerCoverageMask
= rasterizePartialTile
<RT::NumEdgesT::value
, typename
RT::ValidEdgeMaskT
>(pDC
, startQuadEdgesAdj
, pRastEdges
);
784 AR_END(BERasterizePartial
, 0);
788 //////////////////////////////////////////////////////////////////////////
789 /// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
790 /// in an empty function call if SVInnerCoverage isn't requested
791 template <typename RT
, typename ValidEdgeMaskT
, typename InputCoverageT
>
792 struct UpdateEdgeMasksInnerConservative
794 INLINE
UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox
)[3], const __m256d
*,
795 const __m128i
, const __m128i
, int32_t &, int32_t &, int32_t &){};
798 //////////////////////////////////////////////////////////////////////////
799 /// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
800 /// are non-degenerate and SVInnerCoverage is requested. Offsets the edges
801 /// evaluated at raster tile corners to inner conservative position and
802 /// updates edge masks
803 template <typename RT
>
804 struct UpdateEdgeMasksInnerConservative
<RT
, AllEdgesValidT
, InnerConservativeCoverageT
>
806 INLINE
UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox
)[3], const __m256d
* vEdgeFix16
,
807 const __m128i vAi
, const __m128i vBi
, int32_t &mask0
, int32_t &mask1
, int32_t &mask2
)
809 __m256d vTempEdge
[3]{vEdgeFix16
[0], vEdgeFix16
[1], vEdgeFix16
[2]};
811 // instead of keeping 2 copies of evaluated edges around, just compensate for the outer
812 // conservative evaluated edge when adjusting the edge in for inner conservative tests
813 adjustEdgeConservative
<RT
, typename
RT::InnerConservativeEdgeOffsetT
>(vAi
, vBi
, vTempEdge
[0]);
814 adjustEdgeConservative
<RT
, typename
RT::InnerConservativeEdgeOffsetT
>(vAi
, vBi
, vTempEdge
[1]);
815 adjustEdgeConservative
<RT
, typename
RT::InnerConservativeEdgeOffsetT
>(vAi
, vBi
, vTempEdge
[2]);
817 UpdateEdgeMasks
<typename
RT::NumCoverageSamplesT
>(vEdgeTileBbox
, vTempEdge
, mask0
, mask1
, mask2
);
821 //////////////////////////////////////////////////////////////////////////
822 /// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage
823 /// is requested but at least one edge is degenerate. Since a degenerate triangle cannot
824 /// cover an entire raster tile, set mask0 to 0 to force it down the
825 /// rastierizePartialTile path
826 template <typename RT
, typename ValidEdgeMaskT
>
827 struct UpdateEdgeMasksInnerConservative
<RT
, ValidEdgeMaskT
, InnerConservativeCoverageT
>
829 INLINE
UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d
*,
830 const __m128i
, const __m128i
, int32_t &mask0
, int32_t &, int32_t &)
832 // set one mask to zero to force the triangle down the rastierizePartialTile path
//////////////////////////////////////////////////////////////////////////
/// @brief Rasterizes one binned triangle against a single macrotile:
/// performs triangle setup, clamps the triangle bounding box to the
/// scissor rect and the macrotile, then walks the covered raster tiles,
/// building per-sample coverage masks and calling the pixel backend for
/// every tile that has covered samples.
/// @param pDC - draw context (API state, raster state, backend funcs)
/// @param workerId - worker thread id, forwarded to the backend
/// @param macroTile - macrotile id, decoded into macrotile x/y indices
/// @param pDesc - opaque pointer, reinterpreted as a TRIANGLE_WORK_DESC
/// NOTE(review): a number of short source lines (braces, else, #endif and
/// a few declarations such as those of vA/vB/vC/vAi/vBi) are not visible
/// in this copy of the function - check the pristine file before relying
/// on the control flow as shown here.
837 template <typename RT
>
838 void RasterizeTriangle(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t macroTile
, void* pDesc
)
840 SWR_CONTEXT
*pContext
= pDC
->pContext
;
841 const TRIANGLE_WORK_DESC
&workDesc
= *((TRIANGLE_WORK_DESC
*)pDesc
);
842 #if KNOB_ENABLE_TOSS_POINTS
843 if (KNOB_TOSS_BIN_TRIS
)
848 AR_BEGIN(BERasterizeTriangle
, pDC
->drawId
);
849 AR_BEGIN(BETriangleSetup
, pDC
->drawId
);
851 const API_STATE
&state
= GetApiState(pDC
);
852 const SWR_RASTSTATE
&rastState
= state
.rastState
;
853 const BACKEND_FUNCS
& backendFuncs
= pDC
->pState
->backendFuncs
;
// triangle descriptor that is filled in below and handed to the backend
855 OSALIGNSIMD(SWR_TRIANGLE_DESC
) triDesc
;
856 triDesc
.pUserClipBuffer
= workDesc
.pUserClipBuffer
;
858 __m128 vX
, vY
, vZ
, vRecipW
;
860 // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
861 // eg: vX = [x0 x1 x2 dc]
862 vX
= _mm_load_ps(workDesc
.pTriBuffer
);
863 vY
= _mm_load_ps(workDesc
.pTriBuffer
+ 4);
864 vZ
= _mm_load_ps(workDesc
.pTriBuffer
+ 8);
865 vRecipW
= _mm_load_ps(workDesc
.pTriBuffer
+ 12);
867 // convert to fixed point
868 static_assert(std::is_same
<typename
RT::PrecisionT
, FixedPointTraits
<Fixed_16_8
>>::value
, "Rasterizer expects 16.8 fixed point precision");
869 __m128i vXi
= fpToFixedPoint(vX
);
870 __m128i vYi
= fpToFixedPoint(vY
);
872 // quantize floating point position to fixed point precision
873 // to prevent attribute creep around the triangle vertices
874 vX
= _mm_mul_ps(_mm_cvtepi32_ps(vXi
), _mm_set1_ps(1.0f
/ FIXED_POINT_SCALE
));
875 vY
= _mm_mul_ps(_mm_cvtepi32_ps(vYi
), _mm_set1_ps(1.0f
/ FIXED_POINT_SCALE
));
877 // triangle setup - A and B edge equation coefs
879 triangleSetupAB(vX
, vY
, vA
, vB
);
882 triangleSetupABInt(vXi
, vYi
, vAi
, vBi
);
885 float det
= calcDeterminantInt(vAi
, vBi
);
887 // Verts in Pixel Coordinate Space at this point
888 // Det > 0 = CW winding order
889 // Convert CW triangles to CCW
// negate both the float and the integer A/B coefficients
// NOTE(review): the condition guarding this negation is not visible in this copy
892 vA
= _mm_mul_ps(vA
, _mm_set1_ps(-1));
893 vB
= _mm_mul_ps(vB
, _mm_set1_ps(-1));
894 vAi
= _mm_mullo_epi32(vAi
, _mm_set1_epi32(-1));
895 vBi
= _mm_mullo_epi32(vBi
, _mm_set1_epi32(-1));
900 // Finish triangle setup - C edge coef
901 triangleSetupC(vX
, vY
, vA
, vB
, vC
);
903 if(RT::ValidEdgeMaskT::value
!= ALL_EDGES_VALID
)
905 // If we have degenerate edge(s) to rasterize, set I and J coefs
906 // to 0 for constant interpolation of attributes
914 // Degenerate triangles have no area
915 triDesc
.recipDet
= 0.0f
;
919 // only extract coefs for 2 of the barycentrics; the 3rd can be
920 // determined from the barycentric equation:
921 // i + j + k = 1 <=> k = 1 - j - i
922 _MM_EXTRACT_FLOAT(triDesc
.I
[0], vA
, 1);
923 _MM_EXTRACT_FLOAT(triDesc
.I
[1], vB
, 1);
924 _MM_EXTRACT_FLOAT(triDesc
.I
[2], vC
, 1);
925 _MM_EXTRACT_FLOAT(triDesc
.J
[0], vA
, 2);
926 _MM_EXTRACT_FLOAT(triDesc
.J
[1], vB
, 2);
927 _MM_EXTRACT_FLOAT(triDesc
.J
[2], vC
, 2);
929 // compute recipDet, used to calculate barycentric i and j in the backend
930 triDesc
.recipDet
= 1.0f
/det
;
// OneOverW[0] and OneOverW[1] are stored as differences from vertex 2;
// OneOverW[2] holds vertex 2's value itself
933 OSALIGNSIMD(float) oneOverW
[4];
934 _mm_store_ps(oneOverW
, vRecipW
);
935 triDesc
.OneOverW
[0] = oneOverW
[0] - oneOverW
[2];
936 triDesc
.OneOverW
[1] = oneOverW
[1] - oneOverW
[2];
937 triDesc
.OneOverW
[2] = oneOverW
[2];
939 // calculate perspective correct coefs per vertex attrib
940 float* pPerspAttribs
= perspAttribsTLS
;
941 float* pAttribs
= workDesc
.pAttribs
;
942 triDesc
.pPerspAttribs
= pPerspAttribs
;
943 triDesc
.pAttribs
= pAttribs
;
944 float *pRecipW
= workDesc
.pTriBuffer
+ 12;
945 triDesc
.pRecipW
= pRecipW
;
// broadcast each vertex's 1/w (pRecipW is advanced by one float per vertex)
946 __m128 vOneOverWV0
= _mm_broadcast_ss(pRecipW
);
947 __m128 vOneOverWV1
= _mm_broadcast_ss(pRecipW
+=1);
948 __m128 vOneOverWV2
= _mm_broadcast_ss(pRecipW
+=1);
949 for(uint32_t i
= 0; i
< workDesc
.numAttribs
; i
++)
// load the three per-vertex 4-float values of this attribute, scale each by
// its vertex's 1/w and store the results to the perspective attribute buffer
951 __m128 attribA
= _mm_load_ps(pAttribs
);
952 __m128 attribB
= _mm_load_ps(pAttribs
+=4);
953 __m128 attribC
= _mm_load_ps(pAttribs
+=4);
956 attribA
= _mm_mul_ps(attribA
, vOneOverWV0
);
957 attribB
= _mm_mul_ps(attribB
, vOneOverWV1
);
958 attribC
= _mm_mul_ps(attribC
, vOneOverWV2
);
960 _mm_store_ps(pPerspAttribs
, attribA
);
961 _mm_store_ps(pPerspAttribs
+=4, attribB
);
962 _mm_store_ps(pPerspAttribs
+=4, attribC
);
967 // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
// NOTE(review): the statement that fills a[] and the assignment of Z[2] are
// not visible in this copy
968 OSALIGNSIMD(float) a
[4];
970 triDesc
.Z
[0] = a
[0] - a
[2];
971 triDesc
.Z
[1] = a
[1] - a
[2];
975 triDesc
.Z
[2] += ComputeDepthBias(&rastState
, &triDesc
, workDesc
.pTriBuffer
+ 8);
977 // Calc bounding box of triangle
978 OSALIGNSIMD(SWR_RECT
) bbox
;
979 calcBoundingBoxInt(vXi
, vYi
, bbox
);
981 const SWR_RECT
&scissorInFixedPoint
= state
.scissorsInFixedPoint
[workDesc
.triFlags
.viewportIndex
];
983 if(RT::ValidEdgeMaskT::value
!= ALL_EDGES_VALID
)
985 // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
986 bbox
.xmin
--; bbox
.xmax
++; bbox
.ymin
--; bbox
.ymax
++;
987 SWR_ASSERT(scissorInFixedPoint
.xmin
>= 0 && scissorInFixedPoint
.ymin
>= 0,
988 "Conservative rast degenerate handling requires a valid scissor rect");
991 // Intersect with scissor/viewport
992 OSALIGNSIMD(SWR_RECT
) intersect
;
993 intersect
.xmin
= std::max(bbox
.xmin
, scissorInFixedPoint
.xmin
);
994 intersect
.xmax
= std::min(bbox
.xmax
- 1, scissorInFixedPoint
.xmax
);
995 intersect
.ymin
= std::max(bbox
.ymin
, scissorInFixedPoint
.ymin
);
996 intersect
.ymax
= std::min(bbox
.ymax
- 1, scissorInFixedPoint
.ymax
);
998 triDesc
.triFlags
= workDesc
.triFlags
;
1000 // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
1001 uint32_t macroX
, macroY
;
1002 MacroTileMgr::getTileIndices(macroTile
, macroX
, macroY
);
1003 int32_t macroBoxLeft
= macroX
* KNOB_MACROTILE_X_DIM_FIXED
;
1004 int32_t macroBoxRight
= macroBoxLeft
+ KNOB_MACROTILE_X_DIM_FIXED
- 1;
1005 int32_t macroBoxTop
= macroY
* KNOB_MACROTILE_Y_DIM_FIXED
;
1006 int32_t macroBoxBottom
= macroBoxTop
+ KNOB_MACROTILE_Y_DIM_FIXED
- 1;
1008 intersect
.xmin
= std::max(intersect
.xmin
, macroBoxLeft
);
1009 intersect
.ymin
= std::max(intersect
.ymin
, macroBoxTop
);
1010 intersect
.xmax
= std::min(intersect
.xmax
, macroBoxRight
);
1011 intersect
.ymax
= std::min(intersect
.ymax
, macroBoxBottom
);
1013 SWR_ASSERT(intersect
.xmin
<= intersect
.xmax
&& intersect
.ymin
<= intersect
.ymax
&& intersect
.xmin
>= 0 && intersect
.xmax
>= 0 && intersect
.ymin
>= 0 && intersect
.ymax
>= 0);
1015 AR_END(BETriangleSetup
, 0);
1017 // update triangle desc
// convert the fixed point intersect rect into an inclusive range of raster tile indices
1018 uint32_t minTileX
= intersect
.xmin
>> (KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1019 uint32_t minTileY
= intersect
.ymin
>> (KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1020 uint32_t maxTileX
= intersect
.xmax
>> (KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1021 uint32_t maxTileY
= intersect
.ymax
>> (KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1022 uint32_t numTilesX
= maxTileX
- minTileX
+ 1;
1023 uint32_t numTilesY
= maxTileY
- minTileY
+ 1;
1025 if (numTilesX
== 0 || numTilesY
== 0)
1027 RDTSC_EVENT(BEEmptyTriangle
, 1, 0);
1028 AR_END(BERasterizeTriangle
, 1);
1032 AR_BEGIN(BEStepSetup
, pDC
->drawId
);
1034 // Step to pixel center of top-left pixel of the triangle bbox
1035 // Align intersect bbox (top/left) to raster tile's (top/left).
1036 int32_t x
= AlignDown(intersect
.xmin
, (FIXED_POINT_SCALE
* KNOB_TILE_X_DIM
));
1037 int32_t y
= AlignDown(intersect
.ymin
, (FIXED_POINT_SCALE
* KNOB_TILE_Y_DIM
));
1039 // convenience typedef
1040 typedef typename
RT::NumCoverageSamplesT NumCoverageSamplesT
;
1042 // single sample rasterization evaluates edges at pixel center,
1043 // multisample evaluates edges UL pixel corner and steps to each sample position
1044 if(std::is_same
<NumCoverageSamplesT
, SingleSampleT
>::value
)
1046 // Add 0.5, in fixed point, to offset to pixel center
1047 x
+= (FIXED_POINT_SCALE
/ 2);
1048 y
+= (FIXED_POINT_SCALE
/ 2);
1051 __m128i vTopLeftX
= _mm_set1_epi32(x
);
1052 __m128i vTopLeftY
= _mm_set1_epi32(y
);
1054 // evaluate edge equations at top-left pixel using 64bit math
1056 // line = Ax + By + C
1059 // we know x0 and y0 are on the line; plug them in:
1061 // plug C back into line equation:
1062 // line = Ax + By - Ax0 - By0
1063 // line = A(x - x0) + B(y - y0)
1064 // dX = (x-x0), dY = (y-y0)
1065 // so all this simplifies to
1066 // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
1068 __m128i vDeltaX
= _mm_sub_epi32(vTopLeftX
, vXi
);
1069 __m128i vDeltaY
= _mm_sub_epi32(vTopLeftY
, vYi
);
1071 // evaluate A(dx) and B(dY) for all points
1072 __m256d vAipd
= _mm256_cvtepi32_pd(vAi
);
1073 __m256d vBipd
= _mm256_cvtepi32_pd(vBi
);
1074 __m256d vDeltaXpd
= _mm256_cvtepi32_pd(vDeltaX
);
1075 __m256d vDeltaYpd
= _mm256_cvtepi32_pd(vDeltaY
);
1077 __m256d vAiDeltaXFix16
= _mm256_mul_pd(vAipd
, vDeltaXpd
);
1078 __m256d vBiDeltaYFix16
= _mm256_mul_pd(vBipd
, vDeltaYpd
);
1079 __m256d vEdge
= _mm256_add_pd(vAiDeltaXFix16
, vBiDeltaYFix16
);
1081 // apply any edge adjustments(top-left, crast, etc)
1082 adjustEdgesFix16
<RT
, typename
RT::ConservativeEdgeOffsetT
>(vAi
, vBi
, vEdge
);
1084 // broadcast respective edge results to all lanes
// NOTE(review): the array has 7 slots but only [0..2] are written here; the
// remaining slots are presumably filled by ComputeScissorEdges - TODO confirm
1085 double* pEdge
= (double*)&vEdge
;
1086 __m256d vEdgeFix16
[7];
1087 vEdgeFix16
[0] = _mm256_set1_pd(pEdge
[0]);
1088 vEdgeFix16
[1] = _mm256_set1_pd(pEdge
[1]);
1089 vEdgeFix16
[2] = _mm256_set1_pd(pEdge
[2]);
1091 OSALIGNSIMD(int32_t) aAi
[4], aBi
[4];
1092 _mm_store_si128((__m128i
*)aAi
, vAi
);
1093 _mm_store_si128((__m128i
*)aBi
, vBi
);
1094 EDGE rastEdges
[RT::NumEdgesT::value
];
1096 // Compute and store triangle edge data
1097 ComputeEdgeData(aAi
[0], aBi
[0], rastEdges
[0]);
1098 ComputeEdgeData(aAi
[1], aBi
[1], rastEdges
[1]);
1099 ComputeEdgeData(aAi
[2], aBi
[2], rastEdges
[2]);
1101 // Compute and store triangle edge data if scissor needs to be rasterized
1102 ComputeScissorEdges
<typename
RT::RasterizeScissorEdgesT
, typename
RT::IsConservativeT
, RT
>
1103 (bbox
, scissorInFixedPoint
, x
, y
, rastEdges
, vEdgeFix16
);
1105 // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
1106 // used for testing if entire raster tile is inside a triangle
1107 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1109 vEdgeFix16
[e
] = _mm256_add_pd(vEdgeFix16
[e
], rastEdges
[e
].vRasterTileOffsets
);
1112 // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
1113 // step sample positions to the raster tile bbox of multisample points
1114 // min(xSamples),min(ySamples) ------ max(xSamples),min(ySamples)
1117 // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples)
1118 __m256d vEdgeTileBbox
[3];
1119 if (NumCoverageSamplesT::value
> 1)
1121 const SWR_MULTISAMPLE_POS
&samplePos
= rastState
.samplePositions
;
1122 const __m128i vTileSampleBBoxXh
= samplePos
.TileSampleOffsetsX();
1123 const __m128i vTileSampleBBoxYh
= samplePos
.TileSampleOffsetsY();
1125 __m256d vTileSampleBBoxXFix8
= _mm256_cvtepi32_pd(vTileSampleBBoxXh
);
1126 __m256d vTileSampleBBoxYFix8
= _mm256_cvtepi32_pd(vTileSampleBBoxYh
);
1128 // step edge equation tests from Tile
1129 // used for testing if entire raster tile is inside a triangle
1130 for (uint32_t e
= 0; e
< 3; ++e
)
1132 __m256d vResultAxFix16
= _mm256_mul_pd(_mm256_set1_pd(rastEdges
[e
].a
), vTileSampleBBoxXFix8
);
1133 __m256d vResultByFix16
= _mm256_mul_pd(_mm256_set1_pd(rastEdges
[e
].b
), vTileSampleBBoxYFix8
);
1134 vEdgeTileBbox
[e
] = _mm256_add_pd(vResultAxFix16
, vResultByFix16
);
1136 // adjust for msaa tile bbox edges outward for conservative rast, if enabled
1137 adjustEdgeConservative
<RT
, typename
RT::ConservativeEdgeOffsetT
>(vAi
, vBi
, vEdgeTileBbox
[e
]);
1141 AR_END(BEStepSetup
, 0);
1143 uint32_t tY
= minTileY
;
1144 uint32_t tX
= minTileX
;
1145 uint32_t maxY
= maxTileY
;
1146 uint32_t maxX
= maxTileX
;
// renderBuffers tracks the current raster tile; currentRenderBufferRow is the
// copy handed to StepRasterTileY at the end of each tile row
1148 RenderOutputBuffers renderBuffers
, currentRenderBufferRow
;
1149 GetRenderHotTiles
<RT::MT::numSamples
>(pDC
, macroTile
, minTileX
, minTileY
, renderBuffers
, triDesc
.triFlags
.renderTargetArrayIndex
);
1150 currentRenderBufferRow
= renderBuffers
;
1152 // rasterize and generate coverage masks per sample
1153 for (uint32_t tileY
= tY
; tileY
<= maxY
; ++tileY
)
// remember the edge values at the start of this tile row for the Y step below
1155 __m256d vStartOfRowEdge
[RT::NumEdgesT::value
];
1156 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1158 vStartOfRowEdge
[e
] = vEdgeFix16
[e
];
1161 for (uint32_t tileX
= tX
; tileX
<= maxX
; ++tileX
)
1163 triDesc
.anyCoveredSamples
= 0;
1165 // is the corner of the edge outside of the raster tile? (vEdge < 0)
1166 int mask0
, mask1
, mask2
;
1167 UpdateEdgeMasks
<NumCoverageSamplesT
>(vEdgeTileBbox
, vEdgeFix16
, mask0
, mask1
, mask2
);
1169 for (uint32_t sampleNum
= 0; sampleNum
< NumCoverageSamplesT::value
; sampleNum
++)
1171 // trivial reject, at least one edge has all 4 corners of raster tile outside
1172 bool trivialReject
= TrivialRejectTest
<typename
RT::ValidEdgeMaskT
>(mask0
, mask1
, mask2
);
1176 // trivial accept mask
1177 triDesc
.coverageMask
[sampleNum
] = 0xffffffffffffffffULL
;
1179 // Update the raster tile edge masks based on inner conservative edge offsets, if enabled
1180 UpdateEdgeMasksInnerConservative
<RT
, typename
RT::ValidEdgeMaskT
, typename
RT::InputCoverageT
>
1181 (vEdgeTileBbox
, vEdgeFix16
, vAi
, vBi
, mask0
, mask1
, mask2
);
1183 // @todo Make this a bit smarter to allow use of trivial accept when:
1184 // 1) scissor/vp intersection rect is raster tile aligned
1185 // 2) raster tile is entirely within scissor/vp intersection rect
1186 if (TrivialAcceptTest
<typename
RT::RasterizeScissorEdgesT
>(mask0
, mask1
, mask2
))
1188 // trivial accept, all 4 corners of all 3 edges are negative
1189 // i.e. raster tile completely inside triangle
1190 triDesc
.anyCoveredSamples
= triDesc
.coverageMask
[sampleNum
];
1191 if(std::is_same
<typename
RT::InputCoverageT
, InnerConservativeCoverageT
>::value
)
1193 triDesc
.innerCoverageMask
= 0xffffffffffffffffULL
;
1195 RDTSC_EVENT(BETrivialAccept
, 1, 0);
1199 __m256d vEdgeAtSample
[RT::NumEdgesT::value
];
1200 if(std::is_same
<NumCoverageSamplesT
, SingleSampleT
>::value
)
1202 // should get optimized out for single sample case (global value numbering or copy propagation)
1203 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1205 vEdgeAtSample
[e
] = vEdgeFix16
[e
];
1210 const SWR_MULTISAMPLE_POS
&samplePos
= rastState
.samplePositions
;
1211 __m128i vSampleOffsetXh
= samplePos
.vXi(sampleNum
);
1212 __m128i vSampleOffsetYh
= samplePos
.vYi(sampleNum
);
1213 __m256d vSampleOffsetX
= _mm256_cvtepi32_pd(vSampleOffsetXh
);
1214 __m256d vSampleOffsetY
= _mm256_cvtepi32_pd(vSampleOffsetYh
);
1216 // step edge equation tests from UL tile corner to pixel sample position
1217 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1219 __m256d vResultAxFix16
= _mm256_mul_pd(_mm256_set1_pd(rastEdges
[e
].a
), vSampleOffsetX
);
1220 __m256d vResultByFix16
= _mm256_mul_pd(_mm256_set1_pd(rastEdges
[e
].b
), vSampleOffsetY
);
1221 vEdgeAtSample
[e
] = _mm256_add_pd(vResultAxFix16
, vResultByFix16
);
1222 vEdgeAtSample
[e
] = _mm256_add_pd(vEdgeFix16
[e
], vEdgeAtSample
[e
]);
// the store mask enables only the lowest 64-bit lane, so exactly one double
// per edge is written into startQuadEdges
1226 double startQuadEdges
[RT::NumEdgesT::value
];
1227 const __m256i vLane0Mask
= _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
1228 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1230 _mm256_maskstore_pd(&startQuadEdges
[e
], vLane0Mask
, vEdgeAtSample
[e
]);
1233 // not trivial accept or reject, must rasterize full tile
1234 AR_BEGIN(BERasterizePartial
, pDC
->drawId
);
1235 triDesc
.coverageMask
[sampleNum
] = rasterizePartialTile
<RT::NumEdgesT::value
, typename
RT::ValidEdgeMaskT
>(pDC
, startQuadEdges
, rastEdges
);
1236 AR_END(BERasterizePartial
, 0);
1238 triDesc
.anyCoveredSamples
|= triDesc
.coverageMask
[sampleNum
];
1240 // Output SV InnerCoverage, if needed
1241 GenerateSVInnerCoverage
<RT
, typename
RT::ValidEdgeMaskT
, typename
RT::InputCoverageT
>(pDC
, workerId
, rastEdges
, startQuadEdges
, triDesc
.innerCoverageMask
);
1246 // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything
1247 if(NumCoverageSamplesT::value
> 1)
1249 triDesc
.coverageMask
[sampleNum
] = 0;
1251 RDTSC_EVENT(BETrivialReject
, 1, 0);
1255 #if KNOB_ENABLE_TOSS_POINTS
1258 gToss
= triDesc
.coverageMask
[0];
1262 if(triDesc
.anyCoveredSamples
)
1264 // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
1265 // copy conservative coverage result to all samples
1266 if(RT::IsConservativeT::value
)
1268 auto copyCoverage
= [&](int sample
){triDesc
.coverageMask
[sample
] = triDesc
.coverageMask
[0]; };
1269 UnrollerL
<1, RT::MT::numSamples
, 1>::step(copyCoverage
);
1272 AR_BEGIN(BEPixelBackend
, pDC
->drawId
);
1273 backendFuncs
.pfnBackend(pDC
, workerId
, tileX
<< KNOB_TILE_X_DIM_SHIFT
, tileY
<< KNOB_TILE_Y_DIM_SHIFT
, triDesc
, renderBuffers
);
1274 AR_END(BEPixelBackend
, 0);
1277 // step to the next tile in X
1278 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1280 vEdgeFix16
[e
] = _mm256_add_pd(vEdgeFix16
[e
], _mm256_set1_pd(rastEdges
[e
].stepRasterTileX
));
1282 StepRasterTileX
<RT
>(state
.psState
.numRenderTargets
, renderBuffers
);
1285 // step to the next tile in Y
1286 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1288 vEdgeFix16
[e
] = _mm256_add_pd(vStartOfRowEdge
[e
], _mm256_set1_pd(rastEdges
[e
].stepRasterTileY
));
1290 StepRasterTileY
<RT
>(state
.psState
.numRenderTargets
, renderBuffers
, currentRenderBufferRow
);
1293 AR_END(BERasterizeTriangle
, 1);
//////////////////////////////////////////////////////////////////////////
/// @brief Rasterizes a point by bloating it by the point size into two
/// triangles and calling the triangle rasterizer once per triangle.
/// Optionally overwrites point sprite texcoord attributes per triangle.
/// @param pDC - draw context (raster state, backend state)
/// @param workerId - worker thread id, forwarded to the triangle rasterizer
/// @param macroTile - macrotile id, forwarded to the triangle rasterizer
/// @param pData - opaque pointer, reinterpreted as a TRIANGLE_WORK_DESC
/// NOTE(review): the statements that write the bloated x/y vertex positions
/// into newTriBuffer are not visible in this copy of the function (only the
/// z and 1.0f stores are) - check the pristine file.
1296 void RasterizeTriPoint(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void* pData
)
1298 const TRIANGLE_WORK_DESC
& workDesc
= *(const TRIANGLE_WORK_DESC
*)pData
;
1299 const SWR_RASTSTATE
& rastState
= pDC
->pState
->state
.rastState
;
1300 const SWR_BACKEND_STATE
& backendState
= pDC
->pState
->state
.backendState
;
// any bit set in the mask enables the texcoord overwrite below
1302 bool isPointSpriteTexCoordEnabled
= backendState
.pointSpriteTexCoordMask
!= 0;
1304 // load point vertex
1305 float x
= *workDesc
.pTriBuffer
;
1306 float y
= *(workDesc
.pTriBuffer
+ 1);
1307 float z
= *(workDesc
.pTriBuffer
+ 2);
1309 // create a copy of the triangle buffer to write our adjusted vertices to
1310 OSALIGNSIMD(float) newTriBuffer
[4 * 4];
1311 TRIANGLE_WORK_DESC newWorkDesc
= workDesc
;
1312 newWorkDesc
.pTriBuffer
= &newTriBuffer
[0];
1314 // create a copy of the attrib buffer to write our adjusted attribs to
1315 OSALIGNSIMD(float) newAttribBuffer
[4 * 3 * KNOB_NUM_ATTRIBUTES
];
1316 newWorkDesc
.pAttribs
= &newAttribBuffer
[0];
1318 newWorkDesc
.pUserClipBuffer
= workDesc
.pUserClipBuffer
;
1319 newWorkDesc
.numAttribs
= workDesc
.numAttribs
;
1320 newWorkDesc
.triFlags
= workDesc
.triFlags
;
1322 // construct two tris by bloating point by point size
1323 float halfPointSize
= workDesc
.triFlags
.pointSize
* 0.5f
;
1324 float lowerX
= x
- halfPointSize
;
1325 float upperX
= x
+ halfPointSize
;
1326 float lowerY
= y
- halfPointSize
;
1327 float upperY
= y
+ halfPointSize
;
1330 float *pBuf
= &newTriBuffer
[0];
// every vertex gets the point's z and a 1/w of 1.0
1339 _mm_store_ps(pBuf
, _mm_set1_ps(z
));
1340 _mm_store_ps(pBuf
+=4, _mm_set1_ps(1.0f
));
1342 // setup triangle rasterizer function
1343 PFN_WORK_FUNC pfnTriRast
;
1344 // conservative rast not supported for points/lines
1345 pfnTriRast
= GetRasterizerFunc(rastState
.sampleCount
, rastState
.bIsCenterPattern
, false,
1346 SWR_INPUT_COVERAGE_NONE
, EdgeValToEdgeState(ALL_EDGES_VALID
), (pDC
->pState
->state
.scissorsTileAligned
== false));
1348 // overwrite texcoords for point sprites
1349 if (isPointSpriteTexCoordEnabled
)
1351 // copy original attribs
1352 memcpy(&newAttribBuffer
[0], workDesc
.pAttribs
, 4 * 3 * workDesc
.numAttribs
* sizeof(float));
1353 newWorkDesc
.pAttribs
= &newAttribBuffer
[0];
1355 // overwrite texcoord for point sprites
1356 uint32_t texCoordMask
= backendState
.pointSpriteTexCoordMask
;
1357 DWORD texCoordAttrib
= 0;
// visit each set bit of the mask; the bit index selects the attribute to overwrite
1359 while (_BitScanForward(&texCoordAttrib
, texCoordMask
))
1361 texCoordMask
&= ~(1 << texCoordAttrib
);
// each attribute occupies three __m128 slots, one per triangle vertex
1362 __m128
* pTexAttrib
= (__m128
*)&newAttribBuffer
[0] + 3 * texCoordAttrib
;
1363 if (rastState
.pointSpriteTopOrigin
)
1365 pTexAttrib
[0] = _mm_set_ps(1, 0, 0, 0);
1366 pTexAttrib
[1] = _mm_set_ps(1, 0, 1, 0);
1367 pTexAttrib
[2] = _mm_set_ps(1, 0, 1, 1);
1371 pTexAttrib
[0] = _mm_set_ps(1, 0, 1, 0);
1372 pTexAttrib
[1] = _mm_set_ps(1, 0, 0, 0);
1373 pTexAttrib
[2] = _mm_set_ps(1, 0, 0, 1);
1379 // no texcoord overwrite, can reuse the attrib buffer from frontend
1380 newWorkDesc
.pAttribs
= workDesc
.pAttribs
;
// rasterize the first triangle
1383 pfnTriRast(pDC
, workerId
, macroTile
, (void*)&newWorkDesc
);
// rewind to the start of the vertex buffer before setting up the second triangle
1386 pBuf
= &newTriBuffer
[0];
1396 if (isPointSpriteTexCoordEnabled
)
1398 uint32_t texCoordMask
= backendState
.pointSpriteTexCoordMask
;
1399 DWORD texCoordAttrib
= 0;
1401 while (_BitScanForward(&texCoordAttrib
, texCoordMask
))
1403 texCoordMask
&= ~(1 << texCoordAttrib
);
1404 __m128
* pTexAttrib
= (__m128
*)&newAttribBuffer
[0] + 3 * texCoordAttrib
;
1405 if (rastState
.pointSpriteTopOrigin
)
1407 pTexAttrib
[0] = _mm_set_ps(1, 0, 0, 0);
1408 pTexAttrib
[1] = _mm_set_ps(1, 0, 1, 1);
1409 pTexAttrib
[2] = _mm_set_ps(1, 0, 0, 1);
1414 pTexAttrib
[0] = _mm_set_ps(1, 0, 1, 0);
1415 pTexAttrib
[1] = _mm_set_ps(1, 0, 0, 1);
1416 pTexAttrib
[2] = _mm_set_ps(1, 0, 1, 1);
// rasterize the second triangle
1421 pfnTriRast(pDC
, workerId
, macroTile
, (void*)&newWorkDesc
);
1424 void RasterizeSimplePoint(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void* pData
)
1426 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1428 #if KNOB_ENABLE_TOSS_POINTS
1429 if (KNOB_TOSS_BIN_TRIS
)
1435 const TRIANGLE_WORK_DESC
& workDesc
= *(const TRIANGLE_WORK_DESC
*)pData
;
1436 const BACKEND_FUNCS
& backendFuncs
= pDC
->pState
->backendFuncs
;
1438 // map x,y relative offsets from start of raster tile to bit position in
1439 // coverage mask for the point
1440 static const uint32_t coverageMap
[8][8] = {
1441 { 0, 1, 4, 5, 8, 9, 12, 13 },
1442 { 2, 3, 6, 7, 10, 11, 14, 15 },
1443 { 16, 17, 20, 21, 24, 25, 28, 29 },
1444 { 18, 19, 22, 23, 26, 27, 30, 31 },
1445 { 32, 33, 36, 37, 40, 41, 44, 45 },
1446 { 34, 35, 38, 39, 42, 43, 46, 47 },
1447 { 48, 49, 52, 53, 56, 57, 60, 61 },
1448 { 50, 51, 54, 55, 58, 59, 62, 63 }
1451 OSALIGNSIMD(SWR_TRIANGLE_DESC
) triDesc
;
1453 // pull point information from triangle buffer
1454 // @todo use structs for readability
1455 uint32_t tileAlignedX
= *(uint32_t*)workDesc
.pTriBuffer
;
1456 uint32_t tileAlignedY
= *(uint32_t*)(workDesc
.pTriBuffer
+ 1);
1457 float z
= *(workDesc
.pTriBuffer
+ 2);
1459 // construct triangle descriptor for point
1460 // no interpolation, set up i,j for constant interpolation of z and attribs
1461 // @todo implement an optimized backend that doesn't require triangle information
1463 // compute coverage mask from x,y packed into the coverageMask flag
1464 // mask indices by the maximum valid index for x/y of coveragemap.
1465 uint32_t tX
= workDesc
.triFlags
.coverageMask
& 0x7;
1466 uint32_t tY
= (workDesc
.triFlags
.coverageMask
>> 4) & 0x7;
1467 // todo: multisample points?
1468 triDesc
.coverageMask
[0] = 1ULL << coverageMap
[tY
][tX
];
1470 // no persp divide needed for points
1471 triDesc
.pAttribs
= triDesc
.pPerspAttribs
= workDesc
.pAttribs
;
1472 triDesc
.triFlags
= workDesc
.triFlags
;
1473 triDesc
.recipDet
= 1.0f
;
1474 triDesc
.OneOverW
[0] = triDesc
.OneOverW
[1] = triDesc
.OneOverW
[2] = 1.0f
;
1475 triDesc
.I
[0] = triDesc
.I
[1] = triDesc
.I
[2] = 0.0f
;
1476 triDesc
.J
[0] = triDesc
.J
[1] = triDesc
.J
[2] = 0.0f
;
1477 triDesc
.Z
[0] = triDesc
.Z
[1] = triDesc
.Z
[2] = z
;
1479 RenderOutputBuffers renderBuffers
;
1480 GetRenderHotTiles(pDC
, macroTile
, tileAlignedX
>> KNOB_TILE_X_DIM_SHIFT
, tileAlignedY
>> KNOB_TILE_Y_DIM_SHIFT
,
1481 renderBuffers
, triDesc
.triFlags
.renderTargetArrayIndex
);
1483 AR_BEGIN(BEPixelBackend
, pDC
->drawId
);
1484 backendFuncs
.pfnBackend(pDC
, workerId
, tileAlignedX
, tileAlignedY
, triDesc
, renderBuffers
);
1485 AR_END(BEPixelBackend
, 0);
1488 // Get pointers to hot tile memory for color RT, depth, stencil
// For every enabled color render target, and for depth and stencil when
// enabled, fetches the hot tile for the macrotile, marks it dirty and stores
// a pointer to the requested raster tile in renderBuffers.
// @param pDC - draw context
// @param macroID - macrotile id
// @param tileX, tileY - raster tile indices; made macrotile-relative below
// @param renderBuffers - [out] receives the color/depth/stencil pointers
// @param renderTargetArrayIndex - forwarded to GetHotTile
// NOTE(review): the declaration of mx/my is not visible in this copy, and in
// the visible code numSamples is only used as a GetHotTile argument - confirm
// against the pristine file whether the tile offsets also account for it.
1489 template <uint32_t numSamples
>
1490 void GetRenderHotTiles(DRAW_CONTEXT
*pDC
, uint32_t macroID
, uint32_t tileX
, uint32_t tileY
, RenderOutputBuffers
&renderBuffers
, uint32_t renderTargetArrayIndex
)
1492 const API_STATE
& state
= GetApiState(pDC
);
1493 SWR_CONTEXT
*pContext
= pDC
->pContext
;
// convert the incoming tile indices to indices relative to this macrotile
1496 MacroTileMgr::getTileIndices(macroID
, mx
, my
);
1497 tileX
-= KNOB_MACROTILE_X_DIM_IN_TILES
* mx
;
1498 tileY
-= KNOB_MACROTILE_Y_DIM_IN_TILES
* my
;
1500 // compute tile offset for active hottile buffers
1501 const uint32_t pitch
= KNOB_MACROTILE_X_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8;
1502 uint32_t offset
= ComputeTileOffset2D
<TilingTraits
<SWR_TILE_SWRZ
, FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
> >(pitch
, tileX
, tileY
);
// iterate over the set bits of the color hot tile enable mask
1505 unsigned long rtSlot
= 0;
1506 uint32_t colorHottileEnableMask
= state
.colorHottileEnable
;
1507 while(_BitScanForward(&rtSlot
, colorHottileEnableMask
))
1509 HOTTILE
*pColor
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroID
, (SWR_RENDERTARGET_ATTACHMENT
)(SWR_ATTACHMENT_COLOR0
+ rtSlot
), true,
1510 numSamples
, renderTargetArrayIndex
);
1511 pColor
->state
= HOTTILE_DIRTY
;
1512 renderBuffers
.pColor
[rtSlot
] = pColor
->pBuffer
+ offset
;
// clear the bit just handled so the scan advances to the next enabled slot
1514 colorHottileEnableMask
&= ~(1 << rtSlot
);
1516 if(state
.depthHottileEnable
)
// depth uses its own pitch/offset, computed from the depth hot tile format
1518 const uint32_t pitch
= KNOB_MACROTILE_X_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8;
1519 uint32_t offset
= ComputeTileOffset2D
<TilingTraits
<SWR_TILE_SWRZ
, FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
> >(pitch
, tileX
, tileY
);
1521 HOTTILE
*pDepth
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroID
, SWR_ATTACHMENT_DEPTH
, true,
1522 numSamples
, renderTargetArrayIndex
);
1523 pDepth
->state
= HOTTILE_DIRTY
;
1524 SWR_ASSERT(pDepth
->pBuffer
!= nullptr);
1525 renderBuffers
.pDepth
= pDepth
->pBuffer
+ offset
;
1527 if(state
.stencilHottileEnable
)
// stencil uses its own pitch/offset, computed from the stencil hot tile format
1529 const uint32_t pitch
= KNOB_MACROTILE_X_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8;
1530 uint32_t offset
= ComputeTileOffset2D
<TilingTraits
<SWR_TILE_SWRZ
, FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
> >(pitch
, tileX
, tileY
);
1532 HOTTILE
* pStencil
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroID
, SWR_ATTACHMENT_STENCIL
, true,
1533 numSamples
, renderTargetArrayIndex
);
1534 pStencil
->state
= HOTTILE_DIRTY
;
1535 SWR_ASSERT(pStencil
->pBuffer
!= nullptr);
1536 renderBuffers
.pStencil
= pStencil
->pBuffer
+ offset
;
1540 template <typename RT
>
1541 INLINE
void StepRasterTileX(uint32_t NumRT
, RenderOutputBuffers
&buffers
)
1543 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
1545 buffers
.pColor
[rt
] += RT::colorRasterTileStep
;
1548 buffers
.pDepth
+= RT::depthRasterTileStep
;
1549 buffers
.pStencil
+= RT::stencilRasterTileStep
;
1552 template <typename RT
>
1553 INLINE
void StepRasterTileY(uint32_t NumRT
, RenderOutputBuffers
&buffers
, RenderOutputBuffers
&startBufferRow
)
1555 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
1557 startBufferRow
.pColor
[rt
] += RT::colorRasterTileRowStep
;
1558 buffers
.pColor
[rt
] = startBufferRow
.pColor
[rt
];
1560 startBufferRow
.pDepth
+= RT::depthRasterTileRowStep
;
1561 buffers
.pDepth
= startBufferRow
.pDepth
;
1563 startBufferRow
.pStencil
+= RT::stencilRasterTileRowStep
;
1564 buffers
.pStencil
= startBufferRow
.pStencil
;
1567 void RasterizeLine(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
1569 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1570 const TRIANGLE_WORK_DESC
&workDesc
= *((TRIANGLE_WORK_DESC
*)pData
);
1571 #if KNOB_ENABLE_TOSS_POINTS
1572 if (KNOB_TOSS_BIN_TRIS
)
1578 // bloat line to two tris and call the triangle rasterizer twice
1579 AR_BEGIN(BERasterizeLine
, pDC
->drawId
);
1581 const API_STATE
&state
= GetApiState(pDC
);
1582 const SWR_RASTSTATE
&rastState
= state
.rastState
;
1584 // macrotile dimensioning
1585 uint32_t macroX
, macroY
;
1586 MacroTileMgr::getTileIndices(macroTile
, macroX
, macroY
);
1587 int32_t macroBoxLeft
= macroX
* KNOB_MACROTILE_X_DIM_FIXED
;
1588 int32_t macroBoxRight
= macroBoxLeft
+ KNOB_MACROTILE_X_DIM_FIXED
- 1;
1589 int32_t macroBoxTop
= macroY
* KNOB_MACROTILE_Y_DIM_FIXED
;
1590 int32_t macroBoxBottom
= macroBoxTop
+ KNOB_MACROTILE_Y_DIM_FIXED
- 1;
1592 const SWR_RECT
&scissorInFixedPoint
= state
.scissorsInFixedPoint
[workDesc
.triFlags
.viewportIndex
];
1594 // create a copy of the triangle buffer to write our adjusted vertices to
1595 OSALIGNSIMD(float) newTriBuffer
[4 * 4];
1596 TRIANGLE_WORK_DESC newWorkDesc
= workDesc
;
1597 newWorkDesc
.pTriBuffer
= &newTriBuffer
[0];
1599 // create a copy of the attrib buffer to write our adjusted attribs to
1600 OSALIGNSIMD(float) newAttribBuffer
[4 * 3 * KNOB_NUM_ATTRIBUTES
];
1601 newWorkDesc
.pAttribs
= &newAttribBuffer
[0];
1603 const __m128 vBloat0
= _mm_set_ps(0.5f
, -0.5f
, -0.5f
, 0.5f
);
1604 const __m128 vBloat1
= _mm_set_ps(0.5f
, 0.5f
, 0.5f
, -0.5f
);
1606 __m128 vX
, vY
, vZ
, vRecipW
;
1608 vX
= _mm_load_ps(workDesc
.pTriBuffer
);
1609 vY
= _mm_load_ps(workDesc
.pTriBuffer
+ 4);
1610 vZ
= _mm_load_ps(workDesc
.pTriBuffer
+ 8);
1611 vRecipW
= _mm_load_ps(workDesc
.pTriBuffer
+ 12);
1614 // v0,v1 -> v0,v0,v1
1615 __m128 vXa
= _mm_shuffle_ps(vX
, vX
, _MM_SHUFFLE(1, 1, 0, 0));
1616 __m128 vYa
= _mm_shuffle_ps(vY
, vY
, _MM_SHUFFLE(1, 1, 0, 0));
1617 __m128 vZa
= _mm_shuffle_ps(vZ
, vZ
, _MM_SHUFFLE(1, 1, 0, 0));
1618 __m128 vRecipWa
= _mm_shuffle_ps(vRecipW
, vRecipW
, _MM_SHUFFLE(1, 1, 0, 0));
1620 __m128 vLineWidth
= _mm_set1_ps(pDC
->pState
->state
.rastState
.lineWidth
);
1621 __m128 vAdjust
= _mm_mul_ps(vLineWidth
, vBloat0
);
1622 if (workDesc
.triFlags
.yMajor
)
1624 vXa
= _mm_add_ps(vAdjust
, vXa
);
1628 vYa
= _mm_add_ps(vAdjust
, vYa
);
1631 // Store triangle description for rasterizer
1632 _mm_store_ps((float*)&newTriBuffer
[0], vXa
);
1633 _mm_store_ps((float*)&newTriBuffer
[4], vYa
);
1634 _mm_store_ps((float*)&newTriBuffer
[8], vZa
);
1635 _mm_store_ps((float*)&newTriBuffer
[12], vRecipWa
);
1637 // binner bins 3 edges for lines as v0, v1, v1
1638 // tri0 needs v0, v0, v1
1639 for (uint32_t a
= 0; a
< workDesc
.numAttribs
; ++a
)
1641 __m128 vAttrib0
= _mm_load_ps(&workDesc
.pAttribs
[a
*12 + 0]);
1642 __m128 vAttrib1
= _mm_load_ps(&workDesc
.pAttribs
[a
*12 + 4]);
1644 _mm_store_ps((float*)&newAttribBuffer
[a
*12 + 0], vAttrib0
);
1645 _mm_store_ps((float*)&newAttribBuffer
[a
*12 + 4], vAttrib0
);
1646 _mm_store_ps((float*)&newAttribBuffer
[a
*12 + 8], vAttrib1
);
1649 // Store user clip distances for triangle 0
1650 float newClipBuffer
[3 * 8];
1651 uint32_t numClipDist
= _mm_popcnt_u32(state
.rastState
.clipDistanceMask
);
1654 newWorkDesc
.pUserClipBuffer
= newClipBuffer
;
1656 float* pOldBuffer
= workDesc
.pUserClipBuffer
;
1657 float* pNewBuffer
= newClipBuffer
;
1658 for (uint32_t i
= 0; i
< numClipDist
; ++i
)
1660 // read barycentric coeffs from binner
1661 float a
= *(pOldBuffer
++);
1662 float b
= *(pOldBuffer
++);
1664 // reconstruct original clip distance at vertices
1668 // construct triangle barycentrics
1669 *(pNewBuffer
++) = c0
- c1
;
1670 *(pNewBuffer
++) = c0
- c1
;
1671 *(pNewBuffer
++) = c1
;
1675 // setup triangle rasterizer function
1676 PFN_WORK_FUNC pfnTriRast
;
1677 // conservative rast not supported for points/lines
1678 pfnTriRast
= GetRasterizerFunc(rastState
.sampleCount
, rastState
.bIsCenterPattern
, false,
1679 SWR_INPUT_COVERAGE_NONE
, EdgeValToEdgeState(ALL_EDGES_VALID
), (pDC
->pState
->state
.scissorsTileAligned
== false));
1681 // make sure this macrotile intersects the triangle
1682 __m128i vXai
= fpToFixedPoint(vXa
);
1683 __m128i vYai
= fpToFixedPoint(vYa
);
1684 OSALIGNSIMD(SWR_RECT
) bboxA
;
1685 calcBoundingBoxInt(vXai
, vYai
, bboxA
);
1687 if (!(bboxA
.xmin
> macroBoxRight
||
1688 bboxA
.xmin
> scissorInFixedPoint
.xmax
||
1689 bboxA
.xmax
- 1 < macroBoxLeft
||
1690 bboxA
.xmax
- 1 < scissorInFixedPoint
.xmin
||
1691 bboxA
.ymin
> macroBoxBottom
||
1692 bboxA
.ymin
> scissorInFixedPoint
.ymax
||
1693 bboxA
.ymax
- 1 < macroBoxTop
||
1694 bboxA
.ymax
- 1 < scissorInFixedPoint
.ymin
)) {
1695 // rasterize triangle
1696 pfnTriRast(pDC
, workerId
, macroTile
, (void*)&newWorkDesc
);
1700 // v0,v1 -> v1,v1,v0
1701 vXa
= _mm_shuffle_ps(vX
, vX
, _MM_SHUFFLE(1, 0, 1, 1));
1702 vYa
= _mm_shuffle_ps(vY
, vY
, _MM_SHUFFLE(1, 0, 1, 1));
1703 vZa
= _mm_shuffle_ps(vZ
, vZ
, _MM_SHUFFLE(1, 0, 1, 1));
1704 vRecipWa
= _mm_shuffle_ps(vRecipW
, vRecipW
, _MM_SHUFFLE(1, 0, 1, 1));
1706 vAdjust
= _mm_mul_ps(vLineWidth
, vBloat1
);
1707 if (workDesc
.triFlags
.yMajor
)
1709 vXa
= _mm_add_ps(vAdjust
, vXa
);
1713 vYa
= _mm_add_ps(vAdjust
, vYa
);
1716 // Store triangle description for rasterizer
1717 _mm_store_ps((float*)&newTriBuffer
[0], vXa
);
1718 _mm_store_ps((float*)&newTriBuffer
[4], vYa
);
1719 _mm_store_ps((float*)&newTriBuffer
[8], vZa
);
1720 _mm_store_ps((float*)&newTriBuffer
[12], vRecipWa
);
1722 // binner bins 3 edges for lines as v0, v1, v1
1723 // tri1 needs v1, v1, v0
1724 for (uint32_t a
= 0; a
< workDesc
.numAttribs
; ++a
)
1726 __m128 vAttrib0
= _mm_load_ps(&workDesc
.pAttribs
[a
* 12 + 0]);
1727 __m128 vAttrib1
= _mm_load_ps(&workDesc
.pAttribs
[a
* 12 + 4]);
1729 _mm_store_ps((float*)&newAttribBuffer
[a
* 12 + 0], vAttrib1
);
1730 _mm_store_ps((float*)&newAttribBuffer
[a
* 12 + 4], vAttrib1
);
1731 _mm_store_ps((float*)&newAttribBuffer
[a
* 12 + 8], vAttrib0
);
1734 // store user clip distance for triangle 1
1737 float* pOldBuffer
= workDesc
.pUserClipBuffer
;
1738 float* pNewBuffer
= newClipBuffer
;
1739 for (uint32_t i
= 0; i
< numClipDist
; ++i
)
1741 // read barycentric coeffs from binner
1742 float a
= *(pOldBuffer
++);
1743 float b
= *(pOldBuffer
++);
1745 // reconstruct original clip distance at vertices
1749 // construct triangle barycentrics
1750 *(pNewBuffer
++) = c1
- c0
;
1751 *(pNewBuffer
++) = c1
- c0
;
1752 *(pNewBuffer
++) = c0
;
1756 vXai
= fpToFixedPoint(vXa
);
1757 vYai
= fpToFixedPoint(vYa
);
1758 calcBoundingBoxInt(vXai
, vYai
, bboxA
);
1760 if (!(bboxA
.xmin
> macroBoxRight
||
1761 bboxA
.xmin
> scissorInFixedPoint
.xmax
||
1762 bboxA
.xmax
- 1 < macroBoxLeft
||
1763 bboxA
.xmax
- 1 < scissorInFixedPoint
.xmin
||
1764 bboxA
.ymin
> macroBoxBottom
||
1765 bboxA
.ymin
> scissorInFixedPoint
.ymax
||
1766 bboxA
.ymax
- 1 < macroBoxTop
||
1767 bboxA
.ymax
- 1 < scissorInFixedPoint
.ymin
)) {
1768 // rasterize triangle
1769 pfnTriRast(pDC
, workerId
, macroTile
, (void*)&newWorkDesc
);
1772 AR_END(BERasterizeLine
, 1);
1775 struct RasterizerChooser
1777 typedef PFN_WORK_FUNC FuncType
;
1779 template <typename
... ArgsB
>
1780 static FuncType
GetFunc()
1782 return RasterizeTriangle
<RasterizerTraits
<ArgsB
...>>;
1786 // Selector for correct templated RasterizeTriangle function
1787 PFN_WORK_FUNC
GetRasterizerFunc(
1788 uint32_t numSamples
,
1790 bool IsConservative
,
1791 uint32_t InputCoverage
,
1792 uint32_t EdgeEnable
,
1793 bool RasterizeScissorEdges
1796 return TemplateArgUnroller
<RasterizerChooser
>::GetFunc(
1797 IntArg
<SWR_MULTISAMPLE_1X
,SWR_MULTISAMPLE_TYPE_COUNT
-1>{numSamples
},
1800 IntArg
<SWR_INPUT_COVERAGE_NONE
, SWR_INPUT_COVERAGE_COUNT
-1>{InputCoverage
},
1801 IntArg
<0, STATE_VALID_TRI_EDGE_COUNT
-1>{EdgeEnable
},
1802 RasterizeScissorEdges
);