1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * @file rasterizer.cpp
25 * @brief Implementation for the rasterizer.
27 ******************************************************************************/
#include <cstring>

#include "rasterizer.h"
#include "rdtsc_core.h"
#include "memory/tilingtraits.h"
40 template <uint32_t numSamples
= 1>
41 void GetRenderHotTiles(DRAW_CONTEXT
*pDC
, uint32_t macroID
, uint32_t x
, uint32_t y
, RenderOutputBuffers
&renderBuffers
, uint32_t renderTargetArrayIndex
);
42 template <typename RT
>
43 void StepRasterTileX(uint32_t MaxRT
, RenderOutputBuffers
&buffers
);
44 template <typename RT
>
45 void StepRasterTileY(uint32_t MaxRT
, RenderOutputBuffers
&buffers
, RenderOutputBuffers
&startBufferRow
);
47 #define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
48 const __m256d gMaskToVecpd
[] =
50 MASKTOVEC(0, 0, 0, 0),
51 MASKTOVEC(0, 0, 0, 1),
52 MASKTOVEC(0, 0, 1, 0),
53 MASKTOVEC(0, 0, 1, 1),
54 MASKTOVEC(0, 1, 0, 0),
55 MASKTOVEC(0, 1, 0, 1),
56 MASKTOVEC(0, 1, 1, 0),
57 MASKTOVEC(0, 1, 1, 1),
58 MASKTOVEC(1, 0, 0, 0),
59 MASKTOVEC(1, 0, 0, 1),
60 MASKTOVEC(1, 0, 1, 0),
61 MASKTOVEC(1, 0, 1, 1),
62 MASKTOVEC(1, 1, 0, 0),
63 MASKTOVEC(1, 1, 0, 1),
64 MASKTOVEC(1, 1, 1, 0),
65 MASKTOVEC(1, 1, 1, 1),
75 double a
, b
; // a, b edge coefficients in fix8
76 double stepQuadX
; // step to adjacent horizontal quad in fix16
77 double stepQuadY
; // step to adjacent vertical quad in fix16
78 double stepRasterTileX
; // step to adjacent horizontal raster tile in fix16
79 double stepRasterTileY
; // step to adjacent vertical raster tile in fix16
81 __m256d vQuadOffsets
; // offsets for 4 samples of a quad
82 __m256d vRasterTileOffsets
; // offsets for the 4 corners of a raster tile
85 //////////////////////////////////////////////////////////////////////////
86 /// @brief rasterize a raster tile partially covered by the triangle
87 /// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster tile
/// @param vA, vB - A & B coefs for each edge of the triangle (Ax + By + C)
89 /// @param vStepQuad0-2 - edge equations evaluated at the UL corners of the 2x2 pixel quad.
90 /// Used to step between quads when sweeping over the raster tile.
91 template<uint32_t NumEdges
, typename EdgeMaskT
>
92 INLINE
uint64_t rasterizePartialTile(DRAW_CONTEXT
*pDC
, double startEdges
[NumEdges
], EDGE
*pRastEdges
)
94 uint64_t coverageMask
= 0;
96 __m256d vEdges
[NumEdges
];
97 __m256d vStepX
[NumEdges
];
98 __m256d vStepY
[NumEdges
];
100 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
102 // Step to the pixel sample locations of the 1st quad
103 vEdges
[e
] = _mm256_add_pd(_mm256_set1_pd(startEdges
[e
]), pRastEdges
[e
].vQuadOffsets
);
105 // compute step to next quad (mul by 2 in x and y direction)
106 vStepX
[e
] = _mm256_set1_pd(pRastEdges
[e
].stepQuadX
);
107 vStepY
[e
] = _mm256_set1_pd(pRastEdges
[e
].stepQuadY
);
110 // fast unrolled version for 8x8 tile
111 #if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
112 int edgeMask
[NumEdges
];
115 auto eval_lambda
= [&](int e
){edgeMask
[e
] = _mm256_movemask_pd(vEdges
[e
]);};
116 auto update_lambda
= [&](int e
){mask
&= edgeMask
[e
];};
117 auto incx_lambda
= [&](int e
){vEdges
[e
] = _mm256_add_pd(vEdges
[e
], vStepX
[e
]);};
118 auto incy_lambda
= [&](int e
){vEdges
[e
] = _mm256_add_pd(vEdges
[e
], vStepY
[e
]);};
119 auto decx_lambda
= [&](int e
){vEdges
[e
] = _mm256_sub_pd(vEdges
[e
], vStepX
[e
]);};
121 // evaluate which pixels in the quad are covered
123 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
125 // update coverage mask
126 #define UPDATE_MASK(bit) \
127 mask = edgeMask[0]; \
128 UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
129 coverageMask |= (mask << bit);
131 // step in the +x direction to the next quad
133 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
135 // step in the +y direction to the next quad
137 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
139 // step in the -x direction to the next quad
141 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
143 // sweep 2x2 quad back and forth through the raster tile,
144 // computing coverage masks for the entire tile
149 // x x ------------------>
151 // <-----------------x x V
210 for (uint32_t y
= 0; y
< KNOB_TILE_Y_DIM
/2; ++y
)
212 __m256d vStartOfRowEdge
[NumEdges
];
213 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
215 vStartOfRowEdge
[e
] = vEdges
[e
];
218 for (uint32_t x
= 0; x
< KNOB_TILE_X_DIM
/2; ++x
)
220 int edgeMask
[NumEdges
];
221 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
223 edgeMask
[e
] = _mm256_movemask_pd(vEdges
[e
]);
226 uint64_t mask
= edgeMask
[0];
227 for (uint32_t e
= 1; e
< NumEdges
; ++e
)
231 coverageMask
|= (mask
<< bit
);
233 // step to the next pixel in the x
234 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
236 vEdges
[e
] = _mm256_add_pd(vEdges
[e
], vStepX
[e
]);
241 // step to the next row
242 for (uint32_t e
= 0; e
< NumEdges
; ++e
)
244 vEdges
[e
] = _mm256_add_pd(vStartOfRowEdge
[e
], vStepY
[e
]);
252 // Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
253 // Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge
254 // Top left: a sample is in if it is a top or left edge.
255 // Out: !(horizontal && above) = !horizontal && below
256 // Out: !horizontal && left = !(!horizontal && left) = horizontal and right
257 INLINE
void adjustTopLeftRuleIntFix16(const __m128i vA
, const __m128i vB
, __m256d
&vEdge
)
260 // if vA == 0 && vB < 0, vC--
262 __m256d vEdgeOut
= vEdge
;
263 __m256d vEdgeAdjust
= _mm256_sub_pd(vEdge
, _mm256_set1_pd(1.0));
265 // if vA < 0 (line is not horizontal and below)
266 int msk
= _mm_movemask_ps(_mm_castsi128_ps(vA
));
268 // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
269 __m128i vCmp
= _mm_cmpeq_epi32(vA
, _mm_setzero_si128());
270 int msk2
= _mm_movemask_ps(_mm_castsi128_ps(vCmp
));
271 msk2
&= _mm_movemask_ps(_mm_castsi128_ps(vB
));
273 // if either of these are true and we're on the line (edge == 0), bump it outside the line
274 vEdge
= _mm256_blendv_pd(vEdgeOut
, vEdgeAdjust
, gMaskToVecpd
[msk
| msk2
]);
//////////////////////////////////////////////////////////////////////////
/// @brief Number of fractional bits by which the result of the manh
/// calculation (PrecisionT bits + ConservativePrecisionT bits) exceeds the
/// edge precision (EdgePrecisionT bits). Evaluated from compile time traits.
/// @tparam RT: rasterizer traits
/// @return bit difference; guaranteed non-negative by the static_assert
template<typename RT>
constexpr int64_t ManhToEdgePrecisionAdjust()
{
    static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
                  "Inadequate precision of result of manh calculation ");
    return RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value
           - RT::EdgePrecisionT::BitsT::value;
}
288 //////////////////////////////////////////////////////////////////////////
289 /// @struct adjustEdgeConservative
290 /// @brief Primary template definition used for partially specializing
291 /// the adjustEdgeConservative function. This struct should never
293 /// @tparam RT: rasterizer traits
294 /// @tparam IsConservativeT: is conservative rast enabled?
295 template <typename RT
, typename IsConservativeT
>
296 struct adjustEdgeConservative
298 INLINE
adjustEdgeConservative(const __m128i
&vAi
, const __m128i
&vBi
, __m256d
&vEdge
) = delete;
301 //////////////////////////////////////////////////////////////////////////
302 /// @brief adjustEdgeConservative<RT, std::true_type> specialization
303 /// of adjustEdgeConservative. Used for conservative rasterization specific
305 template <typename RT
>
306 struct adjustEdgeConservative
<RT
, std::true_type
>
308 //////////////////////////////////////////////////////////////////////////
309 /// @brief Performs calculations to adjust each edge of a triangle away
310 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
313 /// Uncertainty regions arise from fixed point rounding, which
314 /// can snap a vertex +/- by min fixed point value.
315 /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
316 /// This allows the rasterizer to test for coverage only at the pixel center,
317 /// instead of having to test individual pixel corners for conservative coverage
318 INLINE
adjustEdgeConservative(const __m128i
&vAi
, const __m128i
&vBi
, __m256d
&vEdge
)
320 // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away
321 // from the pixel center (in the direction of the edge normal A/B)
323 // edge = Ax + Bx + C - (manh/e)
324 // manh = manhattan distance = abs(A) + abs(B)
325 // e = absolute rounding error from snapping from float to fixed point precision
327 // 'fixed point' multiply (in double to be avx1 friendly)
328 // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
329 __m256d vAai
= _mm256_cvtepi32_pd(_mm_abs_epi32(vAi
)), vBai
= _mm256_cvtepi32_pd(_mm_abs_epi32(vBi
));
330 __m256d manh
= _mm256_add_pd(_mm256_mul_pd(vAai
, _mm256_set1_pd(RT::ConservativeEdgeOffsetT::value
)),
331 _mm256_mul_pd(vBai
, _mm256_set1_pd(RT::ConservativeEdgeOffsetT::value
)));
333 static_assert(RT::PrecisionT::BitsT::value
+ RT::ConservativePrecisionT::BitsT::value
>= RT::EdgePrecisionT::BitsT::value
,
334 "Inadequate precision of result of manh calculation ");
336 // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision
337 // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right
338 manh
= _mm256_mul_pd(manh
, _mm256_set1_pd(ManhToEdgePrecisionAdjust
<RT
>() * 0.5));
340 // move the edge away from the pixel center by the required conservative precision + 1/2 pixel
341 // this allows the rasterizer to do a single conservative coverage test to see if the primitive
342 // intersects the pixel at all
343 vEdge
= _mm256_sub_pd(vEdge
, manh
);
347 //////////////////////////////////////////////////////////////////////////
348 /// @brief adjustEdgeConservative<RT, std::false_type> specialization
349 /// of adjustEdgeConservative. Allows code to be generically called; when
350 /// IsConservativeT trait is disabled this inlines an empty function, which
351 /// should get optimized out.
352 template <typename RT
>
353 struct adjustEdgeConservative
<RT
, std::false_type
>
355 INLINE
adjustEdgeConservative(const __m128i
&vAi
, const __m128i
&vBi
, __m256d
&vEdge
){};
358 //////////////////////////////////////////////////////////////////////////
359 /// @brief calculates the distance a degenerate BBox needs to be adjusted
360 /// for conservative rast based on compile time trait values
361 template<typename RT
>
362 constexpr int64_t ConservativeScissorOffset()
364 static_assert(RT::ConservativePrecisionT::BitsT::value
- RT::PrecisionT::BitsT::value
>= 0, "Rasterizer precision > conservative precision");
365 // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges
366 typedef std::integral_constant
<int32_t, (RT::ValidEdgeMaskT::value
== ALL_EDGES_VALID
) ? 0 : 1> DegenerateEdgeOffsetT
;
367 // 1/2 pixel edge offset + conservative offset - degenerateTriangle
368 return RT::ConservativeEdgeOffsetT::value
- (DegenerateEdgeOffsetT::value
<< (RT::ConservativePrecisionT::BitsT::value
- RT::PrecisionT::BitsT::value
));
371 //////////////////////////////////////////////////////////////////////////
372 /// @brief Performs calculations to adjust each a scalar edge out
373 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
375 template <typename RT
>
376 INLINE
void adjustScissorEdge(const double a
, const double b
, __m256d
&vEdge
)
378 int64_t aabs
= std::abs(static_cast<int64_t>(a
)), babs
= std::abs(static_cast<int64_t>(b
));
379 int64_t manh
= ((aabs
* ConservativeScissorOffset
<RT
>()) + (babs
* ConservativeScissorOffset
<RT
>())) >> ManhToEdgePrecisionAdjust
<RT
>();
380 vEdge
= _mm256_sub_pd(vEdge
, _mm256_set1_pd(manh
));
383 //////////////////////////////////////////////////////////////////////////
384 /// @brief Perform any needed adjustments to evaluated triangle edges
385 template <typename RT
>
386 INLINE
void adjustEdgesFix16(const __m128i
&vAi
, const __m128i
&vBi
, __m256d
&vEdge
)
388 static_assert(std::is_same
<typename
RT::EdgePrecisionT
, FixedPointTraits
<Fixed_X_16
>>::value
,
389 "Edge equation expected to be in x.16 fixed point");
390 // need to offset the edge before applying the top-left rule
391 adjustEdgeConservative
<RT
, typename
RT::IsConservativeT
>(vAi
, vBi
, vEdge
);
393 adjustTopLeftRuleIntFix16(vAi
, vBi
, vEdge
);
396 // max(abs(dz/dx), abs(dz,dy)
397 INLINE
float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC
* pDesc
)
400 // evaluate i,j at (0,0)
401 float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
402 float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
404 // evaluate i,j at (1,0)
405 float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
406 float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
409 float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
410 float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
411 float dzdx = abs(d10 - d00);
413 // evaluate i,j at (0,1)
414 float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
415 float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
417 float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
418 float dzdy = abs(d01 - d00);
421 // optimized version of above
422 float dzdx
= fabsf(pDesc
->recipDet
* (pDesc
->Z
[0] * pDesc
->I
[0] + pDesc
->Z
[1] * pDesc
->J
[0]));
423 float dzdy
= fabsf(pDesc
->recipDet
* (pDesc
->Z
[0] * pDesc
->I
[1] + pDesc
->Z
[1] * pDesc
->J
[1]));
425 return std::max(dzdx
, dzdy
);
428 INLINE
float ComputeBiasFactor(const SWR_RASTSTATE
* pState
, const SWR_TRIANGLE_DESC
* pDesc
, const float* z
)
430 if (pState
->depthFormat
== R24_UNORM_X8_TYPELESS
)
432 return (1.0f
/ (1 << 24));
434 else if (pState
->depthFormat
== R16_UNORM
)
436 return (1.0f
/ (1 << 16));
440 SWR_ASSERT(pState
->depthFormat
== R32_FLOAT
);
442 // for f32 depth, factor = 2^(exponent(max(abs(z) - 23)
443 float zMax
= std::max(fabsf(z
[0]), std::max(fabsf(z
[1]), fabsf(z
[2])));
444 uint32_t zMaxInt
= *(uint32_t*)&zMax
;
445 zMaxInt
&= 0x7f800000;
446 zMax
= *(float*)&zMaxInt
;
448 return zMax
* (1.0f
/ (1 << 23));
452 INLINE
float ComputeDepthBias(const SWR_RASTSTATE
* pState
, const SWR_TRIANGLE_DESC
* pTri
, const float* z
)
454 if (pState
->depthBias
== 0 && pState
->slopeScaledDepthBias
== 0)
459 float scale
= pState
->slopeScaledDepthBias
;
462 scale
*= ComputeMaxDepthSlope(pTri
);
465 float bias
= pState
->depthBias
;
466 if (!pState
->depthBiasPreAdjusted
)
468 bias
*= ComputeBiasFactor(pState
, pTri
, z
);
472 if (pState
->depthBiasClamp
> 0.0f
)
474 bias
= std::min(bias
, pState
->depthBiasClamp
);
476 else if (pState
->depthBiasClamp
< 0.0f
)
478 bias
= std::max(bias
, pState
->depthBiasClamp
);
484 // Prevent DCE by writing coverage mask from rasterizer to volatile
485 #if KNOB_ENABLE_TOSS_POINTS
486 __declspec(thread
) volatile uint64_t gToss
;
489 static const uint32_t vertsPerTri
= 3, componentsPerAttrib
= 4;
490 // try to avoid _chkstk insertions; make this thread local
491 static THREAD
OSALIGNLINE(float) perspAttribsTLS
[vertsPerTri
* KNOB_NUM_ATTRIBUTES
* componentsPerAttrib
];
494 void ComputeEdgeData(int32_t a
, int32_t b
, EDGE
& edge
)
499 // compute constant steps to adjacent quads
500 edge
.stepQuadX
= (double)((int64_t)a
* (int64_t)(2 * FIXED_POINT_SCALE
));
501 edge
.stepQuadY
= (double)((int64_t)b
* (int64_t)(2 * FIXED_POINT_SCALE
));
503 // compute constant steps to adjacent raster tiles
504 edge
.stepRasterTileX
= (double)((int64_t)a
* (int64_t)(KNOB_TILE_X_DIM
* FIXED_POINT_SCALE
));
505 edge
.stepRasterTileY
= (double)((int64_t)b
* (int64_t)(KNOB_TILE_Y_DIM
* FIXED_POINT_SCALE
));
507 // compute quad offsets
508 const __m256d vQuadOffsetsXIntFix8
= _mm256_set_pd(FIXED_POINT_SCALE
, 0, FIXED_POINT_SCALE
, 0);
509 const __m256d vQuadOffsetsYIntFix8
= _mm256_set_pd(FIXED_POINT_SCALE
, FIXED_POINT_SCALE
, 0, 0);
511 __m256d vQuadStepXFix16
= _mm256_mul_pd(_mm256_set1_pd(edge
.a
), vQuadOffsetsXIntFix8
);
512 __m256d vQuadStepYFix16
= _mm256_mul_pd(_mm256_set1_pd(edge
.b
), vQuadOffsetsYIntFix8
);
513 edge
.vQuadOffsets
= _mm256_add_pd(vQuadStepXFix16
, vQuadStepYFix16
);
515 // compute raster tile offsets
516 const __m256d vTileOffsetsXIntFix8
= _mm256_set_pd((KNOB_TILE_X_DIM
- 1)*FIXED_POINT_SCALE
, 0, (KNOB_TILE_X_DIM
- 1)*FIXED_POINT_SCALE
, 0);
517 const __m256d vTileOffsetsYIntFix8
= _mm256_set_pd((KNOB_TILE_Y_DIM
- 1)*FIXED_POINT_SCALE
, (KNOB_TILE_Y_DIM
- 1)*FIXED_POINT_SCALE
, 0, 0);
519 __m256d vTileStepXFix16
= _mm256_mul_pd(_mm256_set1_pd(edge
.a
), vTileOffsetsXIntFix8
);
520 __m256d vTileStepYFix16
= _mm256_mul_pd(_mm256_set1_pd(edge
.b
), vTileOffsetsYIntFix8
);
521 edge
.vRasterTileOffsets
= _mm256_add_pd(vTileStepXFix16
, vTileStepYFix16
);
525 void ComputeEdgeData(const POS
& p0
, const POS
& p1
, EDGE
& edge
)
527 ComputeEdgeData(p0
.y
- p1
.y
, p1
.x
- p0
.x
, edge
);
530 //////////////////////////////////////////////////////////////////////////
531 /// @brief Primary template definition used for partially specializing
532 /// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel
533 /// corner to sample position, and test for coverage
534 /// @tparam sampleCount: multisample count
535 template <typename NumSamplesT
>
536 INLINE
void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox
)[3], const __m256d (&vEdgeFix16
)[7],
537 int32_t &mask0
, int32_t &mask1
, int32_t &mask2
)
539 __m256d vSampleBboxTest0
, vSampleBboxTest1
, vSampleBboxTest2
;
540 // evaluate edge equations at the tile multisample bounding box
541 vSampleBboxTest0
= _mm256_add_pd(vEdgeTileBbox
[0], vEdgeFix16
[0]);
542 vSampleBboxTest1
= _mm256_add_pd(vEdgeTileBbox
[1], vEdgeFix16
[1]);
543 vSampleBboxTest2
= _mm256_add_pd(vEdgeTileBbox
[2], vEdgeFix16
[2]);
544 mask0
= _mm256_movemask_pd(vSampleBboxTest0
);
545 mask1
= _mm256_movemask_pd(vSampleBboxTest1
);
546 mask2
= _mm256_movemask_pd(vSampleBboxTest2
);
549 //////////////////////////////////////////////////////////////////////////
550 /// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
551 /// when only rasterizing a single coverage test point
553 INLINE
void UpdateEdgeMasks
<SingleSampleT
>(const __m256d(&)[3], const __m256d (&vEdgeFix16
)[7],
554 int32_t &mask0
, int32_t &mask1
, int32_t &mask2
)
556 mask0
= _mm256_movemask_pd(vEdgeFix16
[0]);
557 mask1
= _mm256_movemask_pd(vEdgeFix16
[1]);
558 mask2
= _mm256_movemask_pd(vEdgeFix16
[2]);
561 //////////////////////////////////////////////////////////////////////////
562 /// @struct ComputeScissorEdges
563 /// @brief Primary template definition. Allows the function to be generically
564 /// called. When paired with below specializations, will result in an empty
565 /// inlined function if scissor is not enabled
566 /// @tparam RasterScissorEdgesT: is scissor enabled?
567 /// @tparam IsConservativeT: is conservative rast enabled?
568 /// @tparam RT: rasterizer traits
569 template <typename RasterScissorEdgesT
, typename IsConservativeT
, typename RT
>
570 struct ComputeScissorEdges
572 INLINE
ComputeScissorEdges(const BBOX
&triBBox
, const BBOX
&scissorBBox
, const int32_t x
, const int32_t y
,
573 EDGE (&rastEdges
)[RT::NumEdgesT::value
], __m256d (&vEdgeFix16
)[7]){};
576 //////////////////////////////////////////////////////////////////////////
577 /// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
578 /// specialization. Instantiated when conservative rast and scissor are enabled
579 template <typename RT
>
580 struct ComputeScissorEdges
<std::true_type
, std::true_type
, RT
>
582 //////////////////////////////////////////////////////////////////////////
583 /// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
584 /// evaluate edge equations and offset them away from pixel center.
585 INLINE
ComputeScissorEdges(const BBOX
&triBBox
, const BBOX
&scissorBBox
, const int32_t x
, const int32_t y
,
586 EDGE (&rastEdges
)[RT::NumEdgesT::value
], __m256d (&vEdgeFix16
)[7])
588 // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
590 scissor
.left
= std::max(triBBox
.left
, scissorBBox
.left
);
591 scissor
.right
= std::min(triBBox
.right
, scissorBBox
.right
);
592 scissor
.top
= std::max(triBBox
.top
, scissorBBox
.top
);
593 scissor
.bottom
= std::min(triBBox
.bottom
, scissorBBox
.bottom
);
595 POS topLeft
{scissor
.left
, scissor
.top
};
596 POS bottomLeft
{scissor
.left
, scissor
.bottom
};
597 POS topRight
{scissor
.right
, scissor
.top
};
598 POS bottomRight
{scissor
.right
, scissor
.bottom
};
600 // construct 4 scissor edges in ccw direction
601 ComputeEdgeData(topLeft
, bottomLeft
, rastEdges
[3]);
602 ComputeEdgeData(bottomLeft
, bottomRight
, rastEdges
[4]);
603 ComputeEdgeData(bottomRight
, topRight
, rastEdges
[5]);
604 ComputeEdgeData(topRight
, topLeft
, rastEdges
[6]);
606 vEdgeFix16
[3] = _mm256_set1_pd((rastEdges
[3].a
* (x
- scissor
.left
)) + (rastEdges
[3].b
* (y
- scissor
.top
)));
607 vEdgeFix16
[4] = _mm256_set1_pd((rastEdges
[4].a
* (x
- scissor
.left
)) + (rastEdges
[4].b
* (y
- scissor
.bottom
)));
608 vEdgeFix16
[5] = _mm256_set1_pd((rastEdges
[5].a
* (x
- scissor
.right
)) + (rastEdges
[5].b
* (y
- scissor
.bottom
)));
609 vEdgeFix16
[6] = _mm256_set1_pd((rastEdges
[6].a
* (x
- scissor
.right
)) + (rastEdges
[6].b
* (y
- scissor
.top
)));
611 // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing
612 adjustScissorEdge
<RT
>(rastEdges
[3].a
, rastEdges
[3].b
, vEdgeFix16
[3]);
613 adjustScissorEdge
<RT
>(rastEdges
[4].a
, rastEdges
[4].b
, vEdgeFix16
[4]);
614 adjustScissorEdge
<RT
>(rastEdges
[5].a
, rastEdges
[5].b
, vEdgeFix16
[5]);
615 adjustScissorEdge
<RT
>(rastEdges
[6].a
, rastEdges
[6].b
, vEdgeFix16
[6]);
619 //////////////////////////////////////////////////////////////////////////
620 /// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
621 /// specialization. Instantiated when scissor is enabled and conservative rast
623 template <typename RT
>
624 struct ComputeScissorEdges
<std::true_type
, std::false_type
, RT
>
626 //////////////////////////////////////////////////////////////////////////
627 /// @brief Compute scissor edge vectors and evaluate edge equations
628 INLINE
ComputeScissorEdges(const BBOX
&, const BBOX
&scissorBBox
, const int32_t x
, const int32_t y
,
629 EDGE (&rastEdges
)[RT::NumEdgesT::value
], __m256d (&vEdgeFix16
)[7])
631 const BBOX
&scissor
= scissorBBox
;
632 POS topLeft
{scissor
.left
, scissor
.top
};
633 POS bottomLeft
{scissor
.left
, scissor
.bottom
};
634 POS topRight
{scissor
.right
, scissor
.top
};
635 POS bottomRight
{scissor
.right
, scissor
.bottom
};
637 // construct 4 scissor edges in ccw direction
638 ComputeEdgeData(topLeft
, bottomLeft
, rastEdges
[3]);
639 ComputeEdgeData(bottomLeft
, bottomRight
, rastEdges
[4]);
640 ComputeEdgeData(bottomRight
, topRight
, rastEdges
[5]);
641 ComputeEdgeData(topRight
, topLeft
, rastEdges
[6]);
643 vEdgeFix16
[3] = _mm256_set1_pd((rastEdges
[3].a
* (x
- scissor
.left
)) + (rastEdges
[3].b
* (y
- scissor
.top
)));
644 vEdgeFix16
[4] = _mm256_set1_pd((rastEdges
[4].a
* (x
- scissor
.left
)) + (rastEdges
[4].b
* (y
- scissor
.bottom
)));
645 vEdgeFix16
[5] = _mm256_set1_pd((rastEdges
[5].a
* (x
- scissor
.right
)) + (rastEdges
[5].b
* (y
- scissor
.bottom
)));
646 vEdgeFix16
[6] = _mm256_set1_pd((rastEdges
[6].a
* (x
- scissor
.right
)) + (rastEdges
[6].b
* (y
- scissor
.top
)));
650 //////////////////////////////////////////////////////////////////////////
651 /// @brief Primary function template for TrivialRejectTest. Should
652 /// never be called, but TemplateUnroller instantiates a few unused values,
653 /// so it calls a runtime assert instead of a static_assert.
654 template <typename ValidEdgeMaskT
>
655 INLINE
bool TrivialRejectTest(const int, const int, const int)
657 SWR_ASSERT(0, "Primary templated function should never be called");
661 //////////////////////////////////////////////////////////////////////////
662 /// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
663 /// and edge 1 for trivial coverage reject
665 INLINE
bool TrivialRejectTest
<E0E1ValidT
>(const int mask0
, const int mask1
, const int)
667 return (!(mask0
&& mask1
)) ? true : false;
670 //////////////////////////////////////////////////////////////////////////
671 /// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
672 /// and edge 2 for trivial coverage reject
674 INLINE
bool TrivialRejectTest
<E0E2ValidT
>(const int mask0
, const int, const int mask2
)
676 return (!(mask0
&& mask2
)) ? true : false;
679 //////////////////////////////////////////////////////////////////////////
680 /// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
681 /// and edge 2 for trivial coverage reject
683 INLINE
bool TrivialRejectTest
<E1E2ValidT
>(const int, const int mask1
, const int mask2
)
685 return (!(mask1
&& mask2
)) ? true : false;
688 //////////////////////////////////////////////////////////////////////////
689 /// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
690 /// primitive edges for trivial coverage reject
692 INLINE
bool TrivialRejectTest
<AllEdgesValidT
>(const int mask0
, const int mask1
, const int mask2
)
694 return (!(mask0
&& mask1
&& mask2
)) ? true : false;;
697 //////////////////////////////////////////////////////////////////////////
698 /// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
699 /// point, so return false and rasterize against conservative BBox
701 INLINE
bool TrivialRejectTest
<NoEdgesValidT
>(const int, const int, const int)
706 //////////////////////////////////////////////////////////////////////////
707 /// @brief Primary function template for TrivialAcceptTest. Always returns
708 /// false, since it will only be called for degenerate tris, and as such
709 /// will never cover the entire raster tile
710 template <typename ValidEdgeMaskT
>
711 INLINE
bool TrivialAcceptTest(const int, const int, const int)
716 //////////////////////////////////////////////////////////////////////////
717 /// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all
718 /// edge masks for a fully covered raster tile
720 INLINE
bool TrivialAcceptTest
<AllEdgesValidT
>(const int mask0
, const int mask1
, const int mask2
)
722 return ((mask0
& mask1
& mask2
) == 0xf);
725 template <typename RT
>
726 void RasterizeTriangle(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t macroTile
, void* pDesc
)
728 const TRIANGLE_WORK_DESC
&workDesc
= *((TRIANGLE_WORK_DESC
*)pDesc
);
729 #if KNOB_ENABLE_TOSS_POINTS
730 if (KNOB_TOSS_BIN_TRIS
)
735 RDTSC_START(BERasterizeTriangle
);
737 RDTSC_START(BETriangleSetup
);
738 const API_STATE
&state
= GetApiState(pDC
);
739 const SWR_RASTSTATE
&rastState
= state
.rastState
;
740 const BACKEND_FUNCS
& backendFuncs
= pDC
->pState
->backendFuncs
;
742 OSALIGNSIMD(SWR_TRIANGLE_DESC
) triDesc
;
743 triDesc
.pUserClipBuffer
= workDesc
.pUserClipBuffer
;
745 __m128 vX
, vY
, vZ
, vRecipW
;
747 // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
748 // eg: vX = [x0 x1 x2 dc]
749 vX
= _mm_load_ps(workDesc
.pTriBuffer
);
750 vY
= _mm_load_ps(workDesc
.pTriBuffer
+ 4);
751 vZ
= _mm_load_ps(workDesc
.pTriBuffer
+ 8);
752 vRecipW
= _mm_load_ps(workDesc
.pTriBuffer
+ 12);
754 // convert to fixed point
755 static_assert(std::is_same
<typename
RT::PrecisionT
, FixedPointTraits
<Fixed_16_8
>>::value
, "Rasterizer expects 16.8 fixed point precision");
756 __m128i vXi
= fpToFixedPoint(vX
);
757 __m128i vYi
= fpToFixedPoint(vY
);
759 // quantize floating point position to fixed point precision
760 // to prevent attribute creep around the triangle vertices
761 vX
= _mm_mul_ps(_mm_cvtepi32_ps(vXi
), _mm_set1_ps(1.0f
/ FIXED_POINT_SCALE
));
762 vY
= _mm_mul_ps(_mm_cvtepi32_ps(vYi
), _mm_set1_ps(1.0f
/ FIXED_POINT_SCALE
));
764 // triangle setup - A and B edge equation coefs
766 triangleSetupAB(vX
, vY
, vA
, vB
);
769 triangleSetupABInt(vXi
, vYi
, vAi
, vBi
);
772 float det
= calcDeterminantInt(vAi
, vBi
);
774 // Verts in Pixel Coordinate Space at this point
775 // Det > 0 = CW winding order
776 // Convert CW triangles to CCW
779 vA
= _mm_mul_ps(vA
, _mm_set1_ps(-1));
780 vB
= _mm_mul_ps(vB
, _mm_set1_ps(-1));
781 vAi
= _mm_mullo_epi32(vAi
, _mm_set1_epi32(-1));
782 vBi
= _mm_mullo_epi32(vBi
, _mm_set1_epi32(-1));
787 // Finish triangle setup - C edge coef
788 triangleSetupC(vX
, vY
, vA
, vB
, vC
);
790 if(RT::ValidEdgeMaskT::value
!= ALL_EDGES_VALID
)
792 // If we have degenerate edge(s) to rasterize, set I and J coefs
793 // to 0 for constant interpolation of attributes
801 // Degenerate triangles have no area
802 triDesc
.recipDet
= 0.0f
;
806 // only extract coefs for 2 of the barycentrics; the 3rd can be
807 // determined from the barycentric equation:
808 // i + j + k = 1 <=> k = 1 - j - i
809 _MM_EXTRACT_FLOAT(triDesc
.I
[0], vA
, 1);
810 _MM_EXTRACT_FLOAT(triDesc
.I
[1], vB
, 1);
811 _MM_EXTRACT_FLOAT(triDesc
.I
[2], vC
, 1);
812 _MM_EXTRACT_FLOAT(triDesc
.J
[0], vA
, 2);
813 _MM_EXTRACT_FLOAT(triDesc
.J
[1], vB
, 2);
814 _MM_EXTRACT_FLOAT(triDesc
.J
[2], vC
, 2);
816 // compute recipDet, used to calculate barycentric i and j in the backend
817 triDesc
.recipDet
= 1.0f
/det
;
820 OSALIGNSIMD(float) oneOverW
[4];
821 _mm_store_ps(oneOverW
, vRecipW
);
822 triDesc
.OneOverW
[0] = oneOverW
[0] - oneOverW
[2];
823 triDesc
.OneOverW
[1] = oneOverW
[1] - oneOverW
[2];
824 triDesc
.OneOverW
[2] = oneOverW
[2];
826 // calculate perspective correct coefs per vertex attrib
827 float* pPerspAttribs
= perspAttribsTLS
;
828 float* pAttribs
= workDesc
.pAttribs
;
829 triDesc
.pPerspAttribs
= pPerspAttribs
;
830 triDesc
.pAttribs
= pAttribs
;
831 float *pRecipW
= workDesc
.pTriBuffer
+ 12;
832 triDesc
.pRecipW
= pRecipW
;
833 __m128 vOneOverWV0
= _mm_broadcast_ss(pRecipW
);
834 __m128 vOneOverWV1
= _mm_broadcast_ss(pRecipW
+=1);
835 __m128 vOneOverWV2
= _mm_broadcast_ss(pRecipW
+=1);
836 for(uint32_t i
= 0; i
< workDesc
.numAttribs
; i
++)
838 __m128 attribA
= _mm_load_ps(pAttribs
);
839 __m128 attribB
= _mm_load_ps(pAttribs
+=4);
840 __m128 attribC
= _mm_load_ps(pAttribs
+=4);
843 attribA
= _mm_mul_ps(attribA
, vOneOverWV0
);
844 attribB
= _mm_mul_ps(attribB
, vOneOverWV1
);
845 attribC
= _mm_mul_ps(attribC
, vOneOverWV2
);
847 _mm_store_ps(pPerspAttribs
, attribA
);
848 _mm_store_ps(pPerspAttribs
+=4, attribB
);
849 _mm_store_ps(pPerspAttribs
+=4, attribC
);
854 // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
855 OSALIGNSIMD(float) a
[4];
857 triDesc
.Z
[0] = a
[0] - a
[2];
858 triDesc
.Z
[1] = a
[1] - a
[2];
862 triDesc
.Z
[2] += ComputeDepthBias(&rastState
, &triDesc
, workDesc
.pTriBuffer
+ 8);
864 // Calc bounding box of triangle
865 OSALIGNSIMD(BBOX
) bbox
;
866 calcBoundingBoxInt(vXi
, vYi
, bbox
);
868 if(RT::ValidEdgeMaskT::value
!= ALL_EDGES_VALID
)
870 // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
871 bbox
.left
--; bbox
.right
++; bbox
.top
--; bbox
.bottom
++;
872 SWR_ASSERT(state
.scissorInFixedPoint
.left
>= 0 && state
.scissorInFixedPoint
.top
>= 0,
873 "Conservative rast degenerate handling requires a valid scissor rect");
876 // Intersect with scissor/viewport
877 OSALIGNSIMD(BBOX
) intersect
;
878 intersect
.left
= std::max(bbox
.left
, state
.scissorInFixedPoint
.left
);
879 intersect
.right
= std::min(bbox
.right
- 1, state
.scissorInFixedPoint
.right
);
880 intersect
.top
= std::max(bbox
.top
, state
.scissorInFixedPoint
.top
);
881 intersect
.bottom
= std::min(bbox
.bottom
- 1, state
.scissorInFixedPoint
.bottom
);
883 triDesc
.triFlags
= workDesc
.triFlags
;
885 // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
886 uint32_t macroX
, macroY
;
887 MacroTileMgr::getTileIndices(macroTile
, macroX
, macroY
);
888 int32_t macroBoxLeft
= macroX
* KNOB_MACROTILE_X_DIM_FIXED
;
889 int32_t macroBoxRight
= macroBoxLeft
+ KNOB_MACROTILE_X_DIM_FIXED
- 1;
890 int32_t macroBoxTop
= macroY
* KNOB_MACROTILE_Y_DIM_FIXED
;
891 int32_t macroBoxBottom
= macroBoxTop
+ KNOB_MACROTILE_Y_DIM_FIXED
- 1;
893 intersect
.left
= std::max(intersect
.left
, macroBoxLeft
);
894 intersect
.top
= std::max(intersect
.top
, macroBoxTop
);
895 intersect
.right
= std::min(intersect
.right
, macroBoxRight
);
896 intersect
.bottom
= std::min(intersect
.bottom
, macroBoxBottom
);
898 SWR_ASSERT(intersect
.left
<= intersect
.right
&& intersect
.top
<= intersect
.bottom
&& intersect
.left
>= 0 && intersect
.right
>= 0 && intersect
.top
>= 0 && intersect
.bottom
>= 0);
900 RDTSC_STOP(BETriangleSetup
, 0, pDC
->drawId
);
902 // update triangle desc
903 uint32_t minTileX
= intersect
.left
>> (KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
904 uint32_t minTileY
= intersect
.top
>> (KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
905 uint32_t maxTileX
= intersect
.right
>> (KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
906 uint32_t maxTileY
= intersect
.bottom
>> (KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
907 uint32_t numTilesX
= maxTileX
- minTileX
+ 1;
908 uint32_t numTilesY
= maxTileY
- minTileY
+ 1;
910 if (numTilesX
== 0 || numTilesY
== 0)
912 RDTSC_EVENT(BEEmptyTriangle
, 1, 0);
913 RDTSC_STOP(BERasterizeTriangle
, 1, 0);
917 RDTSC_START(BEStepSetup
);
919 // Step to pixel center of top-left pixel of the triangle bbox
920 // Align intersect bbox (top/left) to raster tile's (top/left).
921 int32_t x
= AlignDown(intersect
.left
, (FIXED_POINT_SCALE
* KNOB_TILE_X_DIM
));
922 int32_t y
= AlignDown(intersect
.top
, (FIXED_POINT_SCALE
* KNOB_TILE_Y_DIM
));
924 // convenience typedef
925 typedef typename
RT::NumRasterSamplesT NumRasterSamplesT
;
927 // single sample rasterization evaluates edges at pixel center,
928 // multisample evaluates edges UL pixel corner and steps to each sample position
929 if(std::is_same
<NumRasterSamplesT
, SingleSampleT
>::value
)
931 // Add 0.5, in fixed point, to offset to pixel center
932 x
+= (FIXED_POINT_SCALE
/ 2);
933 y
+= (FIXED_POINT_SCALE
/ 2);
936 __m128i vTopLeftX
= _mm_set1_epi32(x
);
937 __m128i vTopLeftY
= _mm_set1_epi32(y
);
939 // evaluate edge equations at top-left pixel using 64bit math
941 // line = Ax + By + C
944 // we know x0 and y0 are on the line; plug them in:
946 // plug C back into line equation:
947 // line = Ax - By - Ax0 - By0
948 // line = A(x - x0) + B(y - y0)
949 // dX = (x-x0), dY = (y-y0)
950 // so all this simplifies to
951 // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
953 __m128i vDeltaX
= _mm_sub_epi32(vTopLeftX
, vXi
);
954 __m128i vDeltaY
= _mm_sub_epi32(vTopLeftY
, vYi
);
956 // evaluate A(dx) and B(dY) for all points
957 __m256d vAipd
= _mm256_cvtepi32_pd(vAi
);
958 __m256d vBipd
= _mm256_cvtepi32_pd(vBi
);
959 __m256d vDeltaXpd
= _mm256_cvtepi32_pd(vDeltaX
);
960 __m256d vDeltaYpd
= _mm256_cvtepi32_pd(vDeltaY
);
962 __m256d vAiDeltaXFix16
= _mm256_mul_pd(vAipd
, vDeltaXpd
);
963 __m256d vBiDeltaYFix16
= _mm256_mul_pd(vBipd
, vDeltaYpd
);
964 __m256d vEdge
= _mm256_add_pd(vAiDeltaXFix16
, vBiDeltaYFix16
);
966 // apply and edge adjustments(top-left, crast, etc)
967 adjustEdgesFix16
<RT
>(vAi
, vBi
, vEdge
);
969 // broadcast respective edge results to all lanes
970 double* pEdge
= (double*)&vEdge
;
971 __m256d vEdgeFix16
[7];
972 vEdgeFix16
[0] = _mm256_set1_pd(pEdge
[0]);
973 vEdgeFix16
[1] = _mm256_set1_pd(pEdge
[1]);
974 vEdgeFix16
[2] = _mm256_set1_pd(pEdge
[2]);
976 OSALIGNSIMD(int32_t) aAi
[4], aBi
[4];
977 _mm_store_si128((__m128i
*)aAi
, vAi
);
978 _mm_store_si128((__m128i
*)aBi
, vBi
);
979 EDGE rastEdges
[RT::NumEdgesT::value
];
981 // Compute and store triangle edge data
982 ComputeEdgeData(aAi
[0], aBi
[0], rastEdges
[0]);
983 ComputeEdgeData(aAi
[1], aBi
[1], rastEdges
[1]);
984 ComputeEdgeData(aAi
[2], aBi
[2], rastEdges
[2]);
986 // Compute and store triangle edge data if scissor needs to rasterized
987 ComputeScissorEdges
<typename
RT::RasterizeScissorEdgesT
, typename
RT::IsConservativeT
, RT
>
988 (bbox
, state
.scissorInFixedPoint
, x
, y
, rastEdges
, vEdgeFix16
);
990 // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
991 // used to for testing if entire raster tile is inside a triangle
992 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
994 vEdgeFix16
[e
] = _mm256_add_pd(vEdgeFix16
[e
], rastEdges
[e
].vRasterTileOffsets
);
997 // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
998 // step sample positions to the raster tile bbox of multisample points
999 // min(xSamples),min(ySamples) ------ max(xSamples),min(ySamples)
1002 // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples)
1003 __m256d vEdgeTileBbox
[3];
1004 if (NumRasterSamplesT::value
> 1)
1006 __m128i vTileSampleBBoxXh
= RT::MT::TileSampleOffsetsX();
1007 __m128i vTileSampleBBoxYh
= RT::MT::TileSampleOffsetsY();
1009 __m256d vTileSampleBBoxXFix8
= _mm256_cvtepi32_pd(vTileSampleBBoxXh
);
1010 __m256d vTileSampleBBoxYFix8
= _mm256_cvtepi32_pd(vTileSampleBBoxYh
);
1012 // step edge equation tests from Tile
1013 // used to for testing if entire raster tile is inside a triangle
1014 for (uint32_t e
= 0; e
< 3; ++e
)
1016 __m256d vResultAxFix16
= _mm256_mul_pd(_mm256_set1_pd(rastEdges
[e
].a
), vTileSampleBBoxXFix8
);
1017 __m256d vResultByFix16
= _mm256_mul_pd(_mm256_set1_pd(rastEdges
[e
].b
), vTileSampleBBoxYFix8
);
1018 vEdgeTileBbox
[e
] = _mm256_add_pd(vResultAxFix16
, vResultByFix16
);
1022 RDTSC_STOP(BEStepSetup
, 0, pDC
->drawId
);
1024 uint32_t tY
= minTileY
;
1025 uint32_t tX
= minTileX
;
1026 uint32_t maxY
= maxTileY
;
1027 uint32_t maxX
= maxTileX
;
1029 RenderOutputBuffers renderBuffers
, currentRenderBufferRow
;
1030 GetRenderHotTiles
<RT::MT::numSamples
>(pDC
, macroTile
, minTileX
, minTileY
, renderBuffers
, triDesc
.triFlags
.renderTargetArrayIndex
);
1031 currentRenderBufferRow
= renderBuffers
;
1033 // rasterize and generate coverage masks per sample
1034 for (uint32_t tileY
= tY
; tileY
<= maxY
; ++tileY
)
1036 __m256d vStartOfRowEdge
[RT::NumEdgesT::value
];
1037 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1039 vStartOfRowEdge
[e
] = vEdgeFix16
[e
];
1042 for (uint32_t tileX
= tX
; tileX
<= maxX
; ++tileX
)
1044 triDesc
.anyCoveredSamples
= 0;
1046 // is the corner of the edge outside of the raster tile? (vEdge < 0)
1047 int mask0
, mask1
, mask2
;
1048 UpdateEdgeMasks
<NumRasterSamplesT
>(vEdgeTileBbox
, vEdgeFix16
, mask0
, mask1
, mask2
);
1050 for (uint32_t sampleNum
= 0; sampleNum
< NumRasterSamplesT::value
; sampleNum
++)
1052 // trivial reject, at least one edge has all 4 corners of raster tile outside
1053 bool trivialReject
= TrivialRejectTest
<typename
RT::ValidEdgeMaskT
>(mask0
, mask1
, mask2
);
1057 // trivial accept mask
1058 triDesc
.coverageMask
[sampleNum
] = 0xffffffffffffffffULL
;
1059 if (TrivialAcceptTest
<typename
RT::ValidEdgeMaskT
>(mask0
, mask1
, mask2
))
1061 triDesc
.anyCoveredSamples
= triDesc
.coverageMask
[sampleNum
];
1062 // trivial accept, all 4 corners of all 3 edges are negative
1063 // i.e. raster tile completely inside triangle
1064 RDTSC_EVENT(BETrivialAccept
, 1, 0);
1068 __m256d vEdgeAtSample
[RT::NumEdgesT::value
];
1069 if(std::is_same
<NumRasterSamplesT
, SingleSampleT
>::value
)
1071 // should get optimized out for single sample case (global value numbering or copy propagation)
1072 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1074 vEdgeAtSample
[e
] = vEdgeFix16
[e
];
1079 __m128i vSampleOffsetXh
= RT::MT::vXi(sampleNum
);
1080 __m128i vSampleOffsetYh
= RT::MT::vYi(sampleNum
);
1081 __m256d vSampleOffsetX
= _mm256_cvtepi32_pd(vSampleOffsetXh
);
1082 __m256d vSampleOffsetY
= _mm256_cvtepi32_pd(vSampleOffsetYh
);
1084 // step edge equation tests from UL tile corner to pixel sample position
1085 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1087 __m256d vResultAxFix16
= _mm256_mul_pd(_mm256_set1_pd(rastEdges
[e
].a
), vSampleOffsetX
);
1088 __m256d vResultByFix16
= _mm256_mul_pd(_mm256_set1_pd(rastEdges
[e
].b
), vSampleOffsetY
);
1089 vEdgeAtSample
[e
] = _mm256_add_pd(vResultAxFix16
, vResultByFix16
);
1090 vEdgeAtSample
[e
] = _mm256_add_pd(vEdgeFix16
[e
], vEdgeAtSample
[e
]);
1094 double startQuadEdges
[RT::NumEdgesT::value
];
1095 const __m256i vLane0Mask
= _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
1096 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1098 _mm256_maskstore_pd(&startQuadEdges
[e
], vLane0Mask
, vEdgeAtSample
[e
]);
1101 // not trivial accept or reject, must rasterize full tile
1102 RDTSC_START(BERasterizePartial
);
1103 triDesc
.coverageMask
[sampleNum
] = rasterizePartialTile
<RT::NumEdgesT::value
, typename
RT::ValidEdgeMaskT
>(pDC
, startQuadEdges
, rastEdges
);
1104 RDTSC_STOP(BERasterizePartial
, 0, 0);
1106 triDesc
.anyCoveredSamples
|= triDesc
.coverageMask
[sampleNum
];
1111 // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything
1112 if(NumRasterSamplesT::value
> 1)
1114 triDesc
.coverageMask
[sampleNum
] = 0;
1116 RDTSC_EVENT(BETrivialReject
, 1, 0);
1120 #if KNOB_ENABLE_TOSS_POINTS
1123 gToss
= triDesc
.coverageMask
[0];
1127 if(triDesc
.anyCoveredSamples
)
1129 // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
1130 // copy conservative coverage result to all samples
1131 if(RT::IsConservativeT::value
)
1133 auto copyCoverage
= [&](int sample
){triDesc
.coverageMask
[sample
] = triDesc
.coverageMask
[0]; };
1134 UnrollerL
<1, RT::MT::numSamples
, 1>::step(copyCoverage
);
1137 RDTSC_START(BEPixelBackend
);
1138 backendFuncs
.pfnBackend(pDC
, workerId
, tileX
<< KNOB_TILE_X_DIM_SHIFT
, tileY
<< KNOB_TILE_Y_DIM_SHIFT
, triDesc
, renderBuffers
);
1139 RDTSC_STOP(BEPixelBackend
, 0, 0);
1142 // step to the next tile in X
1143 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1145 vEdgeFix16
[e
] = _mm256_add_pd(vEdgeFix16
[e
], _mm256_set1_pd(rastEdges
[e
].stepRasterTileX
));
1147 StepRasterTileX
<RT
>(state
.psState
.numRenderTargets
, renderBuffers
);
1150 // step to the next tile in Y
1151 for (uint32_t e
= 0; e
< RT::NumEdgesT::value
; ++e
)
1153 vEdgeFix16
[e
] = _mm256_add_pd(vStartOfRowEdge
[e
], _mm256_set1_pd(rastEdges
[e
].stepRasterTileY
));
1155 StepRasterTileY
<RT
>(state
.psState
.numRenderTargets
, renderBuffers
, currentRenderBufferRow
);
1158 RDTSC_STOP(BERasterizeTriangle
, 1, 0);
1161 void RasterizeTriPoint(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void* pData
)
1163 const TRIANGLE_WORK_DESC
& workDesc
= *(const TRIANGLE_WORK_DESC
*)pData
;
1164 const SWR_RASTSTATE
& rastState
= pDC
->pState
->state
.rastState
;
1165 const SWR_BACKEND_STATE
& backendState
= pDC
->pState
->state
.backendState
;
1167 bool isPointSpriteTexCoordEnabled
= backendState
.pointSpriteTexCoordMask
!= 0;
1169 // load point vertex
1170 float x
= *workDesc
.pTriBuffer
;
1171 float y
= *(workDesc
.pTriBuffer
+ 1);
1172 float z
= *(workDesc
.pTriBuffer
+ 2);
1174 // create a copy of the triangle buffer to write our adjusted vertices to
1175 OSALIGNSIMD(float) newTriBuffer
[4 * 4];
1176 TRIANGLE_WORK_DESC newWorkDesc
= workDesc
;
1177 newWorkDesc
.pTriBuffer
= &newTriBuffer
[0];
1179 // create a copy of the attrib buffer to write our adjusted attribs to
1180 OSALIGNSIMD(float) newAttribBuffer
[4 * 3 * KNOB_NUM_ATTRIBUTES
];
1181 newWorkDesc
.pAttribs
= &newAttribBuffer
[0];
1183 newWorkDesc
.pUserClipBuffer
= workDesc
.pUserClipBuffer
;
1184 newWorkDesc
.numAttribs
= workDesc
.numAttribs
;
1185 newWorkDesc
.triFlags
= workDesc
.triFlags
;
1187 // construct two tris by bloating point by point size
1188 float halfPointSize
= workDesc
.triFlags
.pointSize
* 0.5f
;
1189 float lowerX
= x
- halfPointSize
;
1190 float upperX
= x
+ halfPointSize
;
1191 float lowerY
= y
- halfPointSize
;
1192 float upperY
= y
+ halfPointSize
;
1195 float *pBuf
= &newTriBuffer
[0];
1204 _mm_store_ps(pBuf
, _mm_set1_ps(z
));
1205 _mm_store_ps(pBuf
+=4, _mm_set1_ps(1.0f
));
1207 // setup triangle rasterizer function
1208 PFN_WORK_FUNC pfnTriRast
;
1209 // for center sample pattern, all samples are at pixel center; calculate coverage
1210 // once at center and broadcast the results in the backend
1211 uint32_t sampleCount
= (rastState
.samplePattern
== SWR_MSAA_STANDARD_PATTERN
) ? rastState
.sampleCount
: SWR_MULTISAMPLE_1X
;
1212 // conservative rast not supported for points/lines
1213 pfnTriRast
= GetRasterizerFunc(sampleCount
, false, SWR_INPUT_COVERAGE_NONE
, ALL_EDGES_VALID
, (rastState
.scissorEnable
> 0));
1215 // overwrite texcoords for point sprites
1216 if (isPointSpriteTexCoordEnabled
)
1218 // copy original attribs
1219 memcpy(&newAttribBuffer
[0], workDesc
.pAttribs
, 4 * 3 * workDesc
.numAttribs
* sizeof(float));
1220 newWorkDesc
.pAttribs
= &newAttribBuffer
[0];
1222 // overwrite texcoord for point sprites
1223 uint32_t texCoordMask
= backendState
.pointSpriteTexCoordMask
;
1224 DWORD texCoordAttrib
= 0;
1226 while (_BitScanForward(&texCoordAttrib
, texCoordMask
))
1228 texCoordMask
&= ~(1 << texCoordAttrib
);
1229 __m128
* pTexAttrib
= (__m128
*)&newAttribBuffer
[0] + 3 * texCoordAttrib
;
1230 if (rastState
.pointSpriteTopOrigin
)
1232 pTexAttrib
[0] = _mm_set_ps(1, 0, 0, 0);
1233 pTexAttrib
[1] = _mm_set_ps(1, 0, 1, 0);
1234 pTexAttrib
[2] = _mm_set_ps(1, 0, 1, 1);
1238 pTexAttrib
[0] = _mm_set_ps(1, 0, 1, 0);
1239 pTexAttrib
[1] = _mm_set_ps(1, 0, 0, 0);
1240 pTexAttrib
[2] = _mm_set_ps(1, 0, 0, 1);
1246 // no texcoord overwrite, can reuse the attrib buffer from frontend
1247 newWorkDesc
.pAttribs
= workDesc
.pAttribs
;
1250 pfnTriRast(pDC
, workerId
, macroTile
, (void*)&newWorkDesc
);
1253 pBuf
= &newTriBuffer
[0];
1263 if (isPointSpriteTexCoordEnabled
)
1265 uint32_t texCoordMask
= backendState
.pointSpriteTexCoordMask
;
1266 DWORD texCoordAttrib
= 0;
1268 while (_BitScanForward(&texCoordAttrib
, texCoordMask
))
1270 texCoordMask
&= ~(1 << texCoordAttrib
);
1271 __m128
* pTexAttrib
= (__m128
*)&newAttribBuffer
[0] + 3 * texCoordAttrib
;
1272 if (rastState
.pointSpriteTopOrigin
)
1274 pTexAttrib
[0] = _mm_set_ps(1, 0, 0, 0);
1275 pTexAttrib
[1] = _mm_set_ps(1, 0, 1, 1);
1276 pTexAttrib
[2] = _mm_set_ps(1, 0, 0, 1);
1281 pTexAttrib
[0] = _mm_set_ps(1, 0, 1, 0);
1282 pTexAttrib
[1] = _mm_set_ps(1, 0, 0, 1);
1283 pTexAttrib
[2] = _mm_set_ps(1, 0, 1, 1);
1288 pfnTriRast(pDC
, workerId
, macroTile
, (void*)&newWorkDesc
);
1291 void RasterizeSimplePoint(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void* pData
)
1293 #if KNOB_ENABLE_TOSS_POINTS
1294 if (KNOB_TOSS_BIN_TRIS
)
1300 const TRIANGLE_WORK_DESC
& workDesc
= *(const TRIANGLE_WORK_DESC
*)pData
;
1301 const BACKEND_FUNCS
& backendFuncs
= pDC
->pState
->backendFuncs
;
1303 // map x,y relative offsets from start of raster tile to bit position in
1304 // coverage mask for the point
1305 static const uint32_t coverageMap
[8][8] = {
1306 { 0, 1, 4, 5, 8, 9, 12, 13 },
1307 { 2, 3, 6, 7, 10, 11, 14, 15 },
1308 { 16, 17, 20, 21, 24, 25, 28, 29 },
1309 { 18, 19, 22, 23, 26, 27, 30, 31 },
1310 { 32, 33, 36, 37, 40, 41, 44, 45 },
1311 { 34, 35, 38, 39, 42, 43, 46, 47 },
1312 { 48, 49, 52, 53, 56, 57, 60, 61 },
1313 { 50, 51, 54, 55, 58, 59, 62, 63 }
1316 OSALIGNSIMD(SWR_TRIANGLE_DESC
) triDesc
;
1318 // pull point information from triangle buffer
1319 // @todo use structs for readability
1320 uint32_t tileAlignedX
= *(uint32_t*)workDesc
.pTriBuffer
;
1321 uint32_t tileAlignedY
= *(uint32_t*)(workDesc
.pTriBuffer
+ 1);
1322 float z
= *(workDesc
.pTriBuffer
+ 2);
1324 // construct triangle descriptor for point
1325 // no interpolation, set up i,j for constant interpolation of z and attribs
1326 // @todo implement an optimized backend that doesn't require triangle information
1328 // compute coverage mask from x,y packed into the coverageMask flag
1329 // mask indices by the maximum valid index for x/y of coveragemap.
1330 uint32_t tX
= workDesc
.triFlags
.coverageMask
& 0x7;
1331 uint32_t tY
= (workDesc
.triFlags
.coverageMask
>> 4) & 0x7;
1332 // todo: multisample points?
1333 triDesc
.coverageMask
[0] = 1ULL << coverageMap
[tY
][tX
];
1335 // no persp divide needed for points
1336 triDesc
.pAttribs
= triDesc
.pPerspAttribs
= workDesc
.pAttribs
;
1337 triDesc
.triFlags
= workDesc
.triFlags
;
1338 triDesc
.recipDet
= 1.0f
;
1339 triDesc
.OneOverW
[0] = triDesc
.OneOverW
[1] = triDesc
.OneOverW
[2] = 1.0f
;
1340 triDesc
.I
[0] = triDesc
.I
[1] = triDesc
.I
[2] = 0.0f
;
1341 triDesc
.J
[0] = triDesc
.J
[1] = triDesc
.J
[2] = 0.0f
;
1342 triDesc
.Z
[0] = triDesc
.Z
[1] = triDesc
.Z
[2] = z
;
1344 RenderOutputBuffers renderBuffers
;
1345 GetRenderHotTiles(pDC
, macroTile
, tileAlignedX
>> KNOB_TILE_X_DIM_SHIFT
, tileAlignedY
>> KNOB_TILE_Y_DIM_SHIFT
,
1346 renderBuffers
, triDesc
.triFlags
.renderTargetArrayIndex
);
1348 RDTSC_START(BEPixelBackend
);
1349 backendFuncs
.pfnBackend(pDC
, workerId
, tileAlignedX
, tileAlignedY
, triDesc
, renderBuffers
);
1350 RDTSC_STOP(BEPixelBackend
, 0, 0);
1353 // Get pointers to hot tile memory for color RT, depth, stencil
1354 template <uint32_t numSamples
>
1355 void GetRenderHotTiles(DRAW_CONTEXT
*pDC
, uint32_t macroID
, uint32_t tileX
, uint32_t tileY
, RenderOutputBuffers
&renderBuffers
, uint32_t renderTargetArrayIndex
)
1357 const API_STATE
& state
= GetApiState(pDC
);
1358 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1361 MacroTileMgr::getTileIndices(macroID
, mx
, my
);
1362 tileX
-= KNOB_MACROTILE_X_DIM_IN_TILES
* mx
;
1363 tileY
-= KNOB_MACROTILE_Y_DIM_IN_TILES
* my
;
1365 // compute tile offset for active hottile buffers
1366 const uint32_t pitch
= KNOB_MACROTILE_X_DIM
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
/ 8;
1367 uint32_t offset
= ComputeTileOffset2D
<TilingTraits
<SWR_TILE_SWRZ
, FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
> >(pitch
, tileX
, tileY
);
1370 unsigned long rtSlot
= 0;
1371 uint32_t colorHottileEnableMask
= state
.colorHottileEnable
;
1372 while(_BitScanForward(&rtSlot
, colorHottileEnableMask
))
1374 HOTTILE
*pColor
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroID
, (SWR_RENDERTARGET_ATTACHMENT
)(SWR_ATTACHMENT_COLOR0
+ rtSlot
), true,
1375 numSamples
, renderTargetArrayIndex
);
1376 pColor
->state
= HOTTILE_DIRTY
;
1377 renderBuffers
.pColor
[rtSlot
] = pColor
->pBuffer
+ offset
;
1379 colorHottileEnableMask
&= ~(1 << rtSlot
);
1381 if(state
.depthHottileEnable
)
1383 const uint32_t pitch
= KNOB_MACROTILE_X_DIM
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
/ 8;
1384 uint32_t offset
= ComputeTileOffset2D
<TilingTraits
<SWR_TILE_SWRZ
, FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
> >(pitch
, tileX
, tileY
);
1386 HOTTILE
*pDepth
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroID
, SWR_ATTACHMENT_DEPTH
, true,
1387 numSamples
, renderTargetArrayIndex
);
1388 pDepth
->state
= HOTTILE_DIRTY
;
1389 SWR_ASSERT(pDepth
->pBuffer
!= nullptr);
1390 renderBuffers
.pDepth
= pDepth
->pBuffer
+ offset
;
1392 if(state
.stencilHottileEnable
)
1394 const uint32_t pitch
= KNOB_MACROTILE_X_DIM
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
/ 8;
1395 uint32_t offset
= ComputeTileOffset2D
<TilingTraits
<SWR_TILE_SWRZ
, FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
> >(pitch
, tileX
, tileY
);
1397 HOTTILE
* pStencil
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroID
, SWR_ATTACHMENT_STENCIL
, true,
1398 numSamples
, renderTargetArrayIndex
);
1399 pStencil
->state
= HOTTILE_DIRTY
;
1400 SWR_ASSERT(pStencil
->pBuffer
!= nullptr);
1401 renderBuffers
.pStencil
= pStencil
->pBuffer
+ offset
;
1405 template <typename RT
>
1406 INLINE
void StepRasterTileX(uint32_t NumRT
, RenderOutputBuffers
&buffers
)
1408 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
1410 buffers
.pColor
[rt
] += RT::colorRasterTileStep
;
1413 buffers
.pDepth
+= RT::depthRasterTileStep
;
1414 buffers
.pStencil
+= RT::stencilRasterTileStep
;
1417 template <typename RT
>
1418 INLINE
void StepRasterTileY(uint32_t NumRT
, RenderOutputBuffers
&buffers
, RenderOutputBuffers
&startBufferRow
)
1420 for(uint32_t rt
= 0; rt
< NumRT
; ++rt
)
1422 startBufferRow
.pColor
[rt
] += RT::colorRasterTileRowStep
;
1423 buffers
.pColor
[rt
] = startBufferRow
.pColor
[rt
];
1425 startBufferRow
.pDepth
+= RT::depthRasterTileRowStep
;
1426 buffers
.pDepth
= startBufferRow
.pDepth
;
1428 startBufferRow
.pStencil
+= RT::stencilRasterTileRowStep
;
1429 buffers
.pStencil
= startBufferRow
.pStencil
;
1432 void RasterizeLine(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
1434 const TRIANGLE_WORK_DESC
&workDesc
= *((TRIANGLE_WORK_DESC
*)pData
);
1435 #if KNOB_ENABLE_TOSS_POINTS
1436 if (KNOB_TOSS_BIN_TRIS
)
1442 // bloat line to two tris and call the triangle rasterizer twice
1443 RDTSC_START(BERasterizeLine
);
1445 const API_STATE
&state
= GetApiState(pDC
);
1446 const SWR_RASTSTATE
&rastState
= state
.rastState
;
1448 // macrotile dimensioning
1449 uint32_t macroX
, macroY
;
1450 MacroTileMgr::getTileIndices(macroTile
, macroX
, macroY
);
1451 int32_t macroBoxLeft
= macroX
* KNOB_MACROTILE_X_DIM_FIXED
;
1452 int32_t macroBoxRight
= macroBoxLeft
+ KNOB_MACROTILE_X_DIM_FIXED
- 1;
1453 int32_t macroBoxTop
= macroY
* KNOB_MACROTILE_Y_DIM_FIXED
;
1454 int32_t macroBoxBottom
= macroBoxTop
+ KNOB_MACROTILE_Y_DIM_FIXED
- 1;
1456 // create a copy of the triangle buffer to write our adjusted vertices to
1457 OSALIGNSIMD(float) newTriBuffer
[4 * 4];
1458 TRIANGLE_WORK_DESC newWorkDesc
= workDesc
;
1459 newWorkDesc
.pTriBuffer
= &newTriBuffer
[0];
1461 // create a copy of the attrib buffer to write our adjusted attribs to
1462 OSALIGNSIMD(float) newAttribBuffer
[4 * 3 * KNOB_NUM_ATTRIBUTES
];
1463 newWorkDesc
.pAttribs
= &newAttribBuffer
[0];
1465 const __m128 vBloat0
= _mm_set_ps(0.5f
, -0.5f
, -0.5f
, 0.5f
);
1466 const __m128 vBloat1
= _mm_set_ps(0.5f
, 0.5f
, 0.5f
, -0.5f
);
1468 __m128 vX
, vY
, vZ
, vRecipW
;
1470 vX
= _mm_load_ps(workDesc
.pTriBuffer
);
1471 vY
= _mm_load_ps(workDesc
.pTriBuffer
+ 4);
1472 vZ
= _mm_load_ps(workDesc
.pTriBuffer
+ 8);
1473 vRecipW
= _mm_load_ps(workDesc
.pTriBuffer
+ 12);
1476 // v0,v1 -> v0,v0,v1
1477 __m128 vXa
= _mm_shuffle_ps(vX
, vX
, _MM_SHUFFLE(1, 1, 0, 0));
1478 __m128 vYa
= _mm_shuffle_ps(vY
, vY
, _MM_SHUFFLE(1, 1, 0, 0));
1479 __m128 vZa
= _mm_shuffle_ps(vZ
, vZ
, _MM_SHUFFLE(1, 1, 0, 0));
1480 __m128 vRecipWa
= _mm_shuffle_ps(vRecipW
, vRecipW
, _MM_SHUFFLE(1, 1, 0, 0));
1482 __m128 vLineWidth
= _mm_set1_ps(pDC
->pState
->state
.rastState
.lineWidth
);
1483 __m128 vAdjust
= _mm_mul_ps(vLineWidth
, vBloat0
);
1484 if (workDesc
.triFlags
.yMajor
)
1486 vXa
= _mm_add_ps(vAdjust
, vXa
);
1490 vYa
= _mm_add_ps(vAdjust
, vYa
);
1493 // Store triangle description for rasterizer
1494 _mm_store_ps((float*)&newTriBuffer
[0], vXa
);
1495 _mm_store_ps((float*)&newTriBuffer
[4], vYa
);
1496 _mm_store_ps((float*)&newTriBuffer
[8], vZa
);
1497 _mm_store_ps((float*)&newTriBuffer
[12], vRecipWa
);
1499 // binner bins 3 edges for lines as v0, v1, v1
1500 // tri0 needs v0, v0, v1
1501 for (uint32_t a
= 0; a
< workDesc
.numAttribs
; ++a
)
1503 __m128 vAttrib0
= _mm_load_ps(&workDesc
.pAttribs
[a
*12 + 0]);
1504 __m128 vAttrib1
= _mm_load_ps(&workDesc
.pAttribs
[a
*12 + 4]);
1506 _mm_store_ps((float*)&newAttribBuffer
[a
*12 + 0], vAttrib0
);
1507 _mm_store_ps((float*)&newAttribBuffer
[a
*12 + 4], vAttrib0
);
1508 _mm_store_ps((float*)&newAttribBuffer
[a
*12 + 8], vAttrib1
);
1511 // Store user clip distances for triangle 0
1512 float newClipBuffer
[3 * 8];
1513 uint32_t numClipDist
= _mm_popcnt_u32(state
.rastState
.clipDistanceMask
);
1516 newWorkDesc
.pUserClipBuffer
= newClipBuffer
;
1518 float* pOldBuffer
= workDesc
.pUserClipBuffer
;
1519 float* pNewBuffer
= newClipBuffer
;
1520 for (uint32_t i
= 0; i
< numClipDist
; ++i
)
1522 // read barycentric coeffs from binner
1523 float a
= *(pOldBuffer
++);
1524 float b
= *(pOldBuffer
++);
1526 // reconstruct original clip distance at vertices
1530 // construct triangle barycentrics
1531 *(pNewBuffer
++) = c0
- c1
;
1532 *(pNewBuffer
++) = c0
- c1
;
1533 *(pNewBuffer
++) = c1
;
1537 // setup triangle rasterizer function
1538 PFN_WORK_FUNC pfnTriRast
;
1539 uint32_t sampleCount
= (rastState
.samplePattern
== SWR_MSAA_STANDARD_PATTERN
) ? rastState
.sampleCount
: SWR_MULTISAMPLE_1X
;
1540 // conservative rast not supported for points/lines
1541 pfnTriRast
= GetRasterizerFunc(sampleCount
, false, SWR_INPUT_COVERAGE_NONE
, ALL_EDGES_VALID
, (rastState
.scissorEnable
> 0));
1543 // make sure this macrotile intersects the triangle
1544 __m128i vXai
= fpToFixedPoint(vXa
);
1545 __m128i vYai
= fpToFixedPoint(vYa
);
1546 OSALIGNSIMD(BBOX
) bboxA
;
1547 calcBoundingBoxInt(vXai
, vYai
, bboxA
);
1549 if (!(bboxA
.left
> macroBoxRight
||
1550 bboxA
.left
> state
.scissorInFixedPoint
.right
||
1551 bboxA
.right
- 1 < macroBoxLeft
||
1552 bboxA
.right
- 1 < state
.scissorInFixedPoint
.left
||
1553 bboxA
.top
> macroBoxBottom
||
1554 bboxA
.top
> state
.scissorInFixedPoint
.bottom
||
1555 bboxA
.bottom
- 1 < macroBoxTop
||
1556 bboxA
.bottom
- 1 < state
.scissorInFixedPoint
.top
)) {
1557 // rasterize triangle
1558 pfnTriRast(pDC
, workerId
, macroTile
, (void*)&newWorkDesc
);
1562 // v0,v1 -> v1,v1,v0
1563 vXa
= _mm_shuffle_ps(vX
, vX
, _MM_SHUFFLE(1, 0, 1, 1));
1564 vYa
= _mm_shuffle_ps(vY
, vY
, _MM_SHUFFLE(1, 0, 1, 1));
1565 vZa
= _mm_shuffle_ps(vZ
, vZ
, _MM_SHUFFLE(1, 0, 1, 1));
1566 vRecipWa
= _mm_shuffle_ps(vRecipW
, vRecipW
, _MM_SHUFFLE(1, 0, 1, 1));
1568 vAdjust
= _mm_mul_ps(vLineWidth
, vBloat1
);
1569 if (workDesc
.triFlags
.yMajor
)
1571 vXa
= _mm_add_ps(vAdjust
, vXa
);
1575 vYa
= _mm_add_ps(vAdjust
, vYa
);
1578 // Store triangle description for rasterizer
1579 _mm_store_ps((float*)&newTriBuffer
[0], vXa
);
1580 _mm_store_ps((float*)&newTriBuffer
[4], vYa
);
1581 _mm_store_ps((float*)&newTriBuffer
[8], vZa
);
1582 _mm_store_ps((float*)&newTriBuffer
[12], vRecipWa
);
1584 // binner bins 3 edges for lines as v0, v1, v1
1585 // tri1 needs v1, v1, v0
1586 for (uint32_t a
= 0; a
< workDesc
.numAttribs
; ++a
)
1588 __m128 vAttrib0
= _mm_load_ps(&workDesc
.pAttribs
[a
* 12 + 0]);
1589 __m128 vAttrib1
= _mm_load_ps(&workDesc
.pAttribs
[a
* 12 + 4]);
1591 _mm_store_ps((float*)&newAttribBuffer
[a
* 12 + 0], vAttrib1
);
1592 _mm_store_ps((float*)&newAttribBuffer
[a
* 12 + 4], vAttrib1
);
1593 _mm_store_ps((float*)&newAttribBuffer
[a
* 12 + 8], vAttrib0
);
1596 // store user clip distance for triangle 1
1599 float* pOldBuffer
= workDesc
.pUserClipBuffer
;
1600 float* pNewBuffer
= newClipBuffer
;
1601 for (uint32_t i
= 0; i
< numClipDist
; ++i
)
1603 // read barycentric coeffs from binner
1604 float a
= *(pOldBuffer
++);
1605 float b
= *(pOldBuffer
++);
1607 // reconstruct original clip distance at vertices
1611 // construct triangle barycentrics
1612 *(pNewBuffer
++) = c1
- c0
;
1613 *(pNewBuffer
++) = c1
- c0
;
1614 *(pNewBuffer
++) = c0
;
1618 vXai
= fpToFixedPoint(vXa
);
1619 vYai
= fpToFixedPoint(vYa
);
1620 calcBoundingBoxInt(vXai
, vYai
, bboxA
);
1622 if (!(bboxA
.left
> macroBoxRight
||
1623 bboxA
.left
> state
.scissorInFixedPoint
.right
||
1624 bboxA
.right
- 1 < macroBoxLeft
||
1625 bboxA
.right
- 1 < state
.scissorInFixedPoint
.left
||
1626 bboxA
.top
> macroBoxBottom
||
1627 bboxA
.top
> state
.scissorInFixedPoint
.bottom
||
1628 bboxA
.bottom
- 1 < macroBoxTop
||
1629 bboxA
.bottom
- 1 < state
.scissorInFixedPoint
.top
)) {
1630 // rasterize triangle
1631 pfnTriRast(pDC
, workerId
, macroTile
, (void*)&newWorkDesc
);
1634 RDTSC_STOP(BERasterizeLine
, 1, 0);
1637 struct RasterizerChooser
1639 typedef PFN_WORK_FUNC FuncType
;
1641 template <typename
... ArgsB
>
1642 static FuncType
GetFunc()
1644 return RasterizeTriangle
<RasterizerTraits
<ArgsB
...>>;
1648 // Selector for correct templated RasterizeTriangle function
1649 PFN_WORK_FUNC
GetRasterizerFunc(
1650 uint32_t numSamples
,
1651 bool IsConservative
,
1652 uint32_t InputCoverage
,
1653 uint32_t EdgeEnable
,
1654 bool RasterizeScissorEdges
1657 return TemplateArgUnroller
<RasterizerChooser
>::GetFunc(
1658 IntArg
<SWR_MULTISAMPLE_1X
,SWR_MULTISAMPLE_TYPE_COUNT
-1>{numSamples
},
1660 IntArg
<SWR_INPUT_COVERAGE_NONE
, SWR_INPUT_COVERAGE_COUNT
-1>{InputCoverage
},
1661 IntArg
<0, VALID_TRI_EDGE_COUNT
-1>{EdgeEnable
},
1662 RasterizeScissorEdges
);