3c5d73466e206ea5657e8bf9ce2b544367219610
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / rasterizer.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file rasterizer.cpp
24 *
25 * @brief Implementation for the rasterizer.
26 *
27 ******************************************************************************/
28
#include <algorithm>
#include <cstring>
#include <vector>

#include "rasterizer.h"
#include "rdtsc_core.h"
#include "backend.h"
#include "utils.h"
#include "frontend.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
39
// Forward declarations of function templates used by the rasterizer. Only the
// declarations are visible here; the definitions live elsewhere.
// numSamples defaults to 1 when the caller does not specify it.
40 template <uint32_t numSamples = 1>
41 void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex);
// NOTE(review): presumably advances the render output buffers by one raster tile in X - confirm against the definition.
42 template <typename RT>
43 void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers);
// NOTE(review): presumably advances to the next raster tile row, restarting from startBufferRow - confirm against the definition.
44 template <typename RT>
45 void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow);
46
47 #define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
48 const __m256d gMaskToVecpd[] =
49 {
50 MASKTOVEC(0, 0, 0, 0),
51 MASKTOVEC(0, 0, 0, 1),
52 MASKTOVEC(0, 0, 1, 0),
53 MASKTOVEC(0, 0, 1, 1),
54 MASKTOVEC(0, 1, 0, 0),
55 MASKTOVEC(0, 1, 0, 1),
56 MASKTOVEC(0, 1, 1, 0),
57 MASKTOVEC(0, 1, 1, 1),
58 MASKTOVEC(1, 0, 0, 0),
59 MASKTOVEC(1, 0, 0, 1),
60 MASKTOVEC(1, 0, 1, 0),
61 MASKTOVEC(1, 0, 1, 1),
62 MASKTOVEC(1, 1, 0, 0),
63 MASKTOVEC(1, 1, 0, 1),
64 MASKTOVEC(1, 1, 1, 0),
65 MASKTOVEC(1, 1, 1, 1),
66 };
67
// Integer 2D position.
struct POS
{
    int32_t x;  // horizontal coordinate
    int32_t y;  // vertical coordinate
};
72
73 struct EDGE
74 {
75 double a, b; // a, b edge coefficients in fix8
76 double stepQuadX; // step to adjacent horizontal quad in fix16
77 double stepQuadY; // step to adjacent vertical quad in fix16
78 double stepRasterTileX; // step to adjacent horizontal raster tile in fix16
79 double stepRasterTileY; // step to adjacent vertical raster tile in fix16
80
81 __m256d vQuadOffsets; // offsets for 4 samples of a quad
82 __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
83 };
84
85 //////////////////////////////////////////////////////////////////////////
86 /// @brief rasterize a raster tile partially covered by the triangle by sweeping
///        a 2x2 pixel quad over the tile and testing every edge at each quad
87 /// @param pDC - draw context; not referenced anywhere in this function body
88 /// @param startEdges - one evaluated edge value per edge; vQuadOffsets is added
///        to it to obtain the 4 per-sample values of the first quad
89 /// @param pRastEdges - per-edge data supplying vQuadOffsets, stepQuadX and stepQuadY
90 /// @return coverage mask built from one 4-bit group per quad, quads in row-major order.
///         A bit is set only when the sign bit is set in every edge mask that is
///         combined for that sample.
91 template<uint32_t NumEdges, typename EdgeMaskT>
92 INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges)
93 {
94 uint64_t coverageMask = 0;
95
96 __m256d vEdges[NumEdges];
97 __m256d vStepX[NumEdges];
98 __m256d vStepY[NumEdges];
99
100 for (uint32_t e = 0; e < NumEdges; ++e)
101 {
102 // Step to the pixel sample locations of the 1st quad
103 vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);
104
105 // compute step to next quad (mul by 2 in x and y direction)
106 vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
107 vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
108 }
109
110 // fast unrolled version for 8x8 tile
111 #if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
112 int edgeMask[NumEdges];
113 uint64_t mask;
114
// Per-edge operations handed to UnrollerLMask through the macros below.
// NOTE(review): UnrollerLMask presumably calls the lambda once per edge index in
// [begin, end) that is enabled in EdgeMaskT::value - confirm against its definition.
// eval:   sign bits of the 4 sample values of edge e
// update: AND edge e into the running quad mask
// incx / incy / decx: move edge e to the neighbouring quad
115 auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);};
116 auto update_lambda = [&](int e){mask &= edgeMask[e];};
117 auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);};
118 auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);};
119 auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);};
120
// NOTE: the helper macros defined below are not #undef'd in this function.
121 // evaluate which pixels in the quad are covered
122 #define EVAL \
123 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
124
125 // update coverage mask: start from edge 0, AND in the remaining edges, then
// place the 4-bit quad result at bit position 'bit'
126 #define UPDATE_MASK(bit) \
127 mask = edgeMask[0]; \
128 UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
129 coverageMask |= (mask << bit);
130
131 // step in the +x direction to the next quad
132 #define INCX \
133 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
134
135 // step in the +y direction to the next quad
136 #define INCY \
137 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
138
139 // step in the -x direction to the next quad
140 #define DECX \
141 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
142
143 // sweep 2x2 quad back and forth through the raster tile,
144 // computing coverage masks for the entire tile
145
146 // raster tile
147 // 0 1 2 3 4 5 6 7
148 // x x
149 // x x ------------------>
150 // x x |
151 // <-----------------x x V
152 // ..
153
// Even quad rows are walked left to right, odd quad rows right to left, so the
// bit offsets of odd rows are visited in descending order. The resulting bit
// layout is still row-major: quad (qx, qy) lands at bit 4 * (qy * 4 + qx).
154 // row 0
155 EVAL;
156 UPDATE_MASK(0);
157 INCX;
158 EVAL;
159 UPDATE_MASK(4);
160 INCX;
161 EVAL;
162 UPDATE_MASK(8);
163 INCX;
164 EVAL;
165 UPDATE_MASK(12);
166 INCY;
167
168 //row 1
169 EVAL;
170 UPDATE_MASK(28);
171 DECX;
172 EVAL;
173 UPDATE_MASK(24);
174 DECX;
175 EVAL;
176 UPDATE_MASK(20);
177 DECX;
178 EVAL;
179 UPDATE_MASK(16);
180 INCY;
181
182 // row 2
183 EVAL;
184 UPDATE_MASK(32);
185 INCX;
186 EVAL;
187 UPDATE_MASK(36);
188 INCX;
189 EVAL;
190 UPDATE_MASK(40);
191 INCX;
192 EVAL;
193 UPDATE_MASK(44);
194 INCY;
195
196 // row 3
197 EVAL;
198 UPDATE_MASK(60);
199 DECX;
200 EVAL;
201 UPDATE_MASK(56);
202 DECX;
203 EVAL;
204 UPDATE_MASK(52);
205 DECX;
206 EVAL;
207 UPDATE_MASK(48);
208 #else
// generic path: always sweeps left to right, every edge is tested, and each
// quad row restarts from the saved start-of-row edge values
209 uint32_t bit = 0;
210 for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y)
211 {
212 __m256d vStartOfRowEdge[NumEdges];
213 for (uint32_t e = 0; e < NumEdges; ++e)
214 {
215 vStartOfRowEdge[e] = vEdges[e];
216 }
217
218 for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x)
219 {
220 int edgeMask[NumEdges];
221 for (uint32_t e = 0; e < NumEdges; ++e)
222 {
223 edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
224 }
225
// a sample is covered only when its sign bit is set for every edge
226 uint64_t mask = edgeMask[0];
227 for (uint32_t e = 1; e < NumEdges; ++e)
228 {
229 mask &= edgeMask[e];
230 }
231 coverageMask |= (mask << bit);
232
233 // step to the next quad in the x direction
234 for (uint32_t e = 0; e < NumEdges; ++e)
235 {
236 vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
237 }
// each quad contributes 4 bits to the coverage mask
238 bit+=4;
239 }
240
241 // step to the next row of quads
242 for (uint32_t e = 0; e < NumEdges; ++e)
243 {
244 vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
245 }
246 }
247 #endif
248 return coverageMask;
249
250 }
251 // Top left rule:
252 // Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
253 // Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge
254 // Top left: a sample is in if it is a top or left edge.
255 // Out: !(horizontal && above) = !horizontal && below
256 // Out: !horizontal && left = !(!horizontal && left) = horizontal and right
257 INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge)
258 {
259 // if vA < 0, vC--
260 // if vA == 0 && vB < 0, vC--
261
262 __m256d vEdgeOut = vEdge;
263 __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));
264
265 // if vA < 0 (line is not horizontal and below)
266 int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));
267
268 // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
269 __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
270 int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
271 msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));
272
273 // if either of these are true and we're on the line (edge == 0), bump it outside the line
274 vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
275 }
276
//////////////////////////////////////////////////////////////////////////
/// @brief Number of fractional bits by which the result of the manh
/// calculation exceeds the edge precision, derived from compile time traits.
/// @tparam RT: rasterizer traits providing PrecisionT, ConservativePrecisionT
///             and EdgePrecisionT, each with a BitsT::value
/// @return (precision bits + conservative precision bits) - edge precision bits;
///         compilation fails if that difference would be negative
template<typename RT>
constexpr int64_t ManhToEdgePrecisionAdjust()
{
    using ManhBitsT = std::integral_constant<int64_t, RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value>;
    using EdgeBitsT = std::integral_constant<int64_t, RT::EdgePrecisionT::BitsT::value>;

    static_assert(ManhBitsT::value >= EdgeBitsT::value,
                  "Inadequate precision of result of manh calculation ");
    return ManhBitsT::value - EdgeBitsT::value;
}
287
288 //////////////////////////////////////////////////////////////////////////
289 /// @struct adjustEdgeConservative
290 /// @brief Primary template definition used for partially specializing
291 /// the adjustEdgeConservative function. This struct should never
292 /// be instantiated.
293 /// @tparam RT: rasterizer traits
294 /// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
295 template <typename RT, typename ConservativeEdgeOffsetT>
296 struct adjustEdgeConservative
297 {
298 //////////////////////////////////////////////////////////////////////////
299 /// @brief Performs calculations to adjust each edge of a triangle away
300 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
301 /// direction.
302 ///
303 /// Uncertainty regions arise from fixed point rounding, which
304 /// can snap a vertex +/- by min fixed point value.
305 /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
306 /// This allows the rasterizer to test for coverage only at the pixel center,
307 /// instead of having to test individual pixel corners for conservative coverage
308 INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
309 {
310 // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away
311 // from the pixel center (in the direction of the edge normal A/B)
312
313 // edge = Ax + Bx + C - (manh/e)
314 // manh = manhattan distance = abs(A) + abs(B)
315 // e = absolute rounding error from snapping from float to fixed point precision
316
317 // 'fixed point' multiply (in double to be avx1 friendly)
318 // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
319 __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
320 __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
321 _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));
322
323 static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
324 "Inadequate precision of result of manh calculation ");
325
326 // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision
327 // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right
328 manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5));
329
330 // move the edge away from the pixel center by the required conservative precision + 1/2 pixel
331 // this allows the rasterizer to do a single conservative coverage test to see if the primitive
332 // intersects the pixel at all
333 vEdge = _mm256_sub_pd(vEdge, manh);
334 };
335 };
336
337 //////////////////////////////////////////////////////////////////////////
338 /// @brief adjustEdgeConservative specialization where no edge offset is needed
339 template <typename RT>
340 struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>>
341 {
342 INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {};
343 };
344
345 //////////////////////////////////////////////////////////////////////////
346 /// @brief calculates the distance a degenerate BBox needs to be adjusted
347 /// for conservative rast based on compile time trait values
348 template<typename RT>
349 constexpr int64_t ConservativeScissorOffset()
350 {
351 static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, "Rasterizer precision > conservative precision");
352 // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges
353 typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1> DegenerateEdgeOffsetT;
354 // 1/2 pixel edge offset + conservative offset - degenerateTriangle
355 return RT::ConservativeEdgeOffsetT::value - (DegenerateEdgeOffsetT::value << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
356 }
357
358 //////////////////////////////////////////////////////////////////////////
359 /// @brief Performs calculations to adjust each a vector of evaluated edges out
360 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
361 /// direction.
362 template <typename RT>
363 INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge)
364 {
365 int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
366 int64_t manh = ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> ManhToEdgePrecisionAdjust<RT>();
367 vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh));
368 };
369
370 //////////////////////////////////////////////////////////////////////////
371 /// @brief Performs calculations to adjust each a scalar evaluated edge out
372 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
373 /// direction.
374 template <typename RT, typename OffsetT>
375 INLINE double adjustScalarEdge(const double a, const double b, const double Edge)
376 {
377 int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
378 int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>();
379 return (Edge - manh);
380 };
381
382 //////////////////////////////////////////////////////////////////////////
383 /// @brief Perform any needed adjustments to evaluated triangle edges
384 template <typename RT, typename EdgeOffsetT>
385 struct adjustEdgesFix16
386 {
387 INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
388 {
389 static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
390 "Edge equation expected to be in x.16 fixed point");
391
392 static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled");
393
394 // need to apply any edge offsets before applying the top-left rule
395 adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge);
396
397 adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
398 }
399 };
400
401 //////////////////////////////////////////////////////////////////////////
402 /// @brief Perform top left adjustments to evaluated triangle edges
403 template <typename RT>
404 struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>>
405 {
406 INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
407 {
408 adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
409 }
410 };
411
412 // max(abs(dz/dx), abs(dz,dy)
413 INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
414 {
415 /*
416 // evaluate i,j at (0,0)
417 float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
418 float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
419
420 // evaluate i,j at (1,0)
421 float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
422 float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
423
424 // compute dz/dx
425 float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
426 float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
427 float dzdx = abs(d10 - d00);
428
429 // evaluate i,j at (0,1)
430 float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
431 float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
432
433 float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
434 float dzdy = abs(d01 - d00);
435 */
436
437 // optimized version of above
438 float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0]));
439 float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1]));
440
441 return std::max(dzdx, dzdy);
442 }
443
444 INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
445 {
446 if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
447 {
448 return (1.0f / (1 << 24));
449 }
450 else if (pState->depthFormat == R16_UNORM)
451 {
452 return (1.0f / (1 << 16));
453 }
454 else
455 {
456 SWR_ASSERT(pState->depthFormat == R32_FLOAT);
457
458 // for f32 depth, factor = 2^(exponent(max(abs(z) - 23)
459 float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
460 uint32_t zMaxInt = *(uint32_t*)&zMax;
461 zMaxInt &= 0x7f800000;
462 zMax = *(float*)&zMaxInt;
463
464 return zMax * (1.0f / (1 << 23));
465 }
466 }
467
468 INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
469 {
470 if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0)
471 {
472 return 0.0f;
473 }
474
475 float scale = pState->slopeScaledDepthBias;
476 if (scale != 0.0f)
477 {
478 scale *= ComputeMaxDepthSlope(pTri);
479 }
480
481 float bias = pState->depthBias;
482 if (!pState->depthBiasPreAdjusted)
483 {
484 bias *= ComputeBiasFactor(pState, pTri, z);
485 }
486 bias += scale;
487
488 if (pState->depthBiasClamp > 0.0f)
489 {
490 bias = std::min(bias, pState->depthBiasClamp);
491 }
492 else if (pState->depthBiasClamp < 0.0f)
493 {
494 bias = std::max(bias, pState->depthBiasClamp);
495 }
496
497 return bias;
498 }
499
500 // Prevent DCE by writing coverage mask from rasterizer to volatile
501 #if KNOB_ENABLE_TOSS_POINTS
502 __declspec(thread) volatile uint64_t gToss;
503 #endif
504
505 static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
506 // try to avoid _chkstk insertions; make this thread local
507 static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * KNOB_NUM_ATTRIBUTES * componentsPerAttrib];
508
509 INLINE
510 void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
511 {
512 edge.a = a;
513 edge.b = b;
514
515 // compute constant steps to adjacent quads
516 edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE));
517 edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE));
518
519 // compute constant steps to adjacent raster tiles
520 edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE));
521 edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE));
522
523 // compute quad offsets
524 const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
525 const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);
526
527 __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8);
528 __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8);
529 edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16);
530
531 // compute raster tile offsets
532 const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0);
533 const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0);
534
535 __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8);
536 __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8);
537 edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16);
538 }
539
540 INLINE
541 void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
542 {
543 ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge);
544 }
545
546 //////////////////////////////////////////////////////////////////////////
547 /// @brief Primary template definition used for partially specializing
548 /// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel
549 /// corner to sample position, and test for coverage
550 /// @tparam sampleCount: multisample count
551 template <typename NumSamplesT>
552 INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
553 int32_t &mask0, int32_t &mask1, int32_t &mask2)
554 {
555 __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
556 // evaluate edge equations at the tile multisample bounding box
557 vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
558 vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
559 vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
560 mask0 = _mm256_movemask_pd(vSampleBboxTest0);
561 mask1 = _mm256_movemask_pd(vSampleBboxTest1);
562 mask2 = _mm256_movemask_pd(vSampleBboxTest2);
563 }
564
565 //////////////////////////////////////////////////////////////////////////
566 /// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
567 /// when only rasterizing a single coverage test point
568 template <>
569 INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* vEdgeFix16,
570 int32_t &mask0, int32_t &mask1, int32_t &mask2)
571 {
572 mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
573 mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
574 mask2 = _mm256_movemask_pd(vEdgeFix16[2]);
575 }
576
577 //////////////////////////////////////////////////////////////////////////
578 /// @struct ComputeScissorEdges
579 /// @brief Primary template definition. Allows the function to be generically
580 /// called. When paired with below specializations, will result in an empty
581 /// inlined function if scissor is not enabled
582 /// @tparam RasterScissorEdgesT: is scissor enabled?
583 /// @tparam IsConservativeT: is conservative rast enabled?
584 /// @tparam RT: rasterizer traits
585 template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT>
586 struct ComputeScissorEdges
587 {
588 INLINE ComputeScissorEdges(const BBOX &triBBox, const BBOX &scissorBBox, const int32_t x, const int32_t y,
589 EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){};
590 };
591
592 //////////////////////////////////////////////////////////////////////////
593 /// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
594 /// specialization. Instantiated when conservative rast and scissor are enabled
595 template <typename RT>
596 struct ComputeScissorEdges<std::true_type, std::true_type, RT>
597 {
598 //////////////////////////////////////////////////////////////////////////
599 /// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
600 /// evaluate edge equations and offset them away from pixel center.
601 INLINE ComputeScissorEdges(const BBOX &triBBox, const BBOX &scissorBBox, const int32_t x, const int32_t y,
602 EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
603 {
604 // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
605 BBOX scissor;
606 scissor.left = std::max(triBBox.left, scissorBBox.left);
607 scissor.right = std::min(triBBox.right, scissorBBox.right);
608 scissor.top = std::max(triBBox.top, scissorBBox.top);
609 scissor.bottom = std::min(triBBox.bottom, scissorBBox.bottom);
610
611 POS topLeft{scissor.left, scissor.top};
612 POS bottomLeft{scissor.left, scissor.bottom};
613 POS topRight{scissor.right, scissor.top};
614 POS bottomRight{scissor.right, scissor.bottom};
615
616 // construct 4 scissor edges in ccw direction
617 ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
618 ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
619 ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
620 ComputeEdgeData(topRight, topLeft, rastEdges[6]);
621
622 vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.left)) + (rastEdges[3].b * (y - scissor.top)));
623 vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.left)) + (rastEdges[4].b * (y - scissor.bottom)));
624 vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.right)) + (rastEdges[5].b * (y - scissor.bottom)));
625 vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.right)) + (rastEdges[6].b * (y - scissor.top)));
626
627 // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing
628 adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]);
629 adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]);
630 adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]);
631 adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]);
632 }
633 };
634
635 //////////////////////////////////////////////////////////////////////////
636 /// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
637 /// specialization. Instantiated when scissor is enabled and conservative rast
638 /// is disabled.
639 template <typename RT>
640 struct ComputeScissorEdges<std::true_type, std::false_type, RT>
641 {
642 //////////////////////////////////////////////////////////////////////////
643 /// @brief Compute scissor edge vectors and evaluate edge equations
644 INLINE ComputeScissorEdges(const BBOX &, const BBOX &scissorBBox, const int32_t x, const int32_t y,
645 EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
646 {
647 const BBOX &scissor = scissorBBox;
648 POS topLeft{scissor.left, scissor.top};
649 POS bottomLeft{scissor.left, scissor.bottom};
650 POS topRight{scissor.right, scissor.top};
651 POS bottomRight{scissor.right, scissor.bottom};
652
653 // construct 4 scissor edges in ccw direction
654 ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
655 ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
656 ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
657 ComputeEdgeData(topRight, topLeft, rastEdges[6]);
658
659 vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.left)) + (rastEdges[3].b * (y - scissor.top)));
660 vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.left)) + (rastEdges[4].b * (y - scissor.bottom)));
661 vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.right)) + (rastEdges[5].b * (y - scissor.bottom)));
662 vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.right)) + (rastEdges[6].b * (y - scissor.top)));
663 }
664 };
665
666 //////////////////////////////////////////////////////////////////////////
667 /// @brief Primary function template for TrivialRejectTest. Should
668 /// never be called, but TemplateUnroller instantiates a few unused values,
669 /// so it calls a runtime assert instead of a static_assert.
670 template <typename ValidEdgeMaskT>
671 INLINE bool TrivialRejectTest(const int, const int, const int)
672 {
673 SWR_ASSERT(0, "Primary templated function should never be called");
674 return false;
675 };
676
677 //////////////////////////////////////////////////////////////////////////
678 /// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
679 /// and edge 1 for trivial coverage reject
680 template <>
681 INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int)
682 {
683 return (!(mask0 && mask1)) ? true : false;
684 };
685
686 //////////////////////////////////////////////////////////////////////////
687 /// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
688 /// and edge 2 for trivial coverage reject
689 template <>
690 INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2)
691 {
692 return (!(mask0 && mask2)) ? true : false;
693 };
694
695 //////////////////////////////////////////////////////////////////////////
696 /// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
697 /// and edge 2 for trivial coverage reject
698 template <>
699 INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2)
700 {
701 return (!(mask1 && mask2)) ? true : false;
702 };
703
704 //////////////////////////////////////////////////////////////////////////
705 /// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
706 /// primitive edges for trivial coverage reject
707 template <>
708 INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
709 {
710 return (!(mask0 && mask1 && mask2)) ? true : false;;
711 };
712
713 //////////////////////////////////////////////////////////////////////////
714 /// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
715 /// point, so return false and rasterize against conservative BBox
716 template <>
717 INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int)
718 {
719 return false;
720 };
721
722 //////////////////////////////////////////////////////////////////////////
723 /// @brief Primary function template for TrivialAcceptTest. Always returns
724 /// false, since it will only be called for degenerate tris, and as such
725 /// will never cover the entire raster tile
726 template <typename ValidEdgeMaskT>
727 INLINE bool TrivialAcceptTest(const int, const int, const int)
728 {
729 return false;
730 };
731
732 //////////////////////////////////////////////////////////////////////////
733 /// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all
734 /// edge masks for a fully covered raster tile
735 template <>
736 INLINE bool TrivialAcceptTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
737 {
738 return ((mask0 & mask1 & mask2) == 0xf);
739 };
740
741 //////////////////////////////////////////////////////////////////////////
742 /// @brief Primary function template for GenerateSVInnerCoverage. Results
743 /// in an empty function call if SVInnerCoverage isn't requested
744 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
745 struct GenerateSVInnerCoverage
746 {
747 INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, EDGE*, double*, uint64_t &){};
748 };
749
750 //////////////////////////////////////////////////////////////////////////
751 /// @brief Specialization of GenerateSVInnerCoverage where all edges
752 /// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated
753 /// edge values from OuterConservative to InnerConservative and rasterizes.
754 template <typename RT>
755 struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
756 {
757 INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, EDGE* pRastEdges, double* pStartQuadEdges, uint64_t &innerCoverageMask)
758 {
759 double startQuadEdgesAdj[RT::NumEdgesT::value];
760 for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
761 {
762 startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
763 }
764
765 // not trivial accept or reject, must rasterize full tile
766 RDTSC_START(BERasterizePartial);
767 innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges);
768 RDTSC_STOP(BERasterizePartial, 0, 0);
769 }
770 };
771
772 //////////////////////////////////////////////////////////////////////////
773 /// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
774 /// in an empty function call if SVInnerCoverage isn't requested
775 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
776 struct UpdateEdgeMasksInnerConservative
777 {
778 INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*,
779 const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){};
780 };
781
782 //////////////////////////////////////////////////////////////////////////
783 /// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
784 /// are non-degenerate and SVInnerCoverage is requested. Offsets the edges
785 /// evaluated at raster tile corners to inner conservative position and
786 /// updates edge masks
787 template <typename RT>
788 struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
789 {
790 INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
791 const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2)
792 {
793 __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]};
794
795 // instead of keeping 2 copies of evaluated edges around, just compensate for the outer
796 // conservative evaluated edge when adjusting the edge in for inner conservative tests
797 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[0]);
798 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[1]);
799 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[2]);
800
801 UpdateEdgeMasks<typename RT::NumRasterSamplesT>(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
802 }
803 };
804
805 //////////////////////////////////////////////////////////////////////////
806 /// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage
807 /// is requested but at least one edge is degenerate. Since a degenerate triangle cannot
808 /// cover an entire raster tile, set mask0 to 0 to force it down the
809 /// rastierizePartialTile path
810 template <typename RT, typename ValidEdgeMaskT>
811 struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
812 {
813 INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*,
814 const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &)
815 {
816 // set one mask to zero to force the triangle down the rastierizePartialTile path
817 mask0 = 0;
818 }
819 };
820
//////////////////////////////////////////////////////////////////////////
/// @brief Rasterizes one triangle work item against a single macrotile.
///        Sets up the edge equations and the SWR_TRIANGLE_DESC, clamps the
///        triangle bbox to the scissor rect and to the macrotile, then walks
///        the covered raster tiles, computing a coverage mask per sample
///        and calling the pixel backend for tiles with any covered sample.
/// @tparam RT - rasterizer traits; supplies the sample count, edge count,
///        valid edge mask, conservative rast and input coverage behavior
/// @param pDC - draw context providing API state and the backend functions
/// @param workerId - passed through to the pixel backend
/// @param macroTile - id of the macrotile being rasterized
/// @param pDesc - pointer to the TRIANGLE_WORK_DESC for this triangle
821 template <typename RT>
822 void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
823 {
824 const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
825 #if KNOB_ENABLE_TOSS_POINTS
826 if (KNOB_TOSS_BIN_TRIS)
827 {
828 return;
829 }
830 #endif
831 RDTSC_START(BERasterizeTriangle);
832
833 RDTSC_START(BETriangleSetup);
834 const API_STATE &state = GetApiState(pDC);
835 const SWR_RASTSTATE &rastState = state.rastState;
836 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
837
838 OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
839 triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
840
841 __m128 vX, vY, vZ, vRecipW;
842
843 // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
844 // eg: vX = [x0 x1 x2 dc]
845 vX = _mm_load_ps(workDesc.pTriBuffer);
846 vY = _mm_load_ps(workDesc.pTriBuffer + 4);
847 vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
848 vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
849
850 // convert to fixed point
851 static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, "Rasterizer expects 16.8 fixed point precision");
852 __m128i vXi = fpToFixedPoint(vX);
853 __m128i vYi = fpToFixedPoint(vY);
854
855 // quantize floating point position to fixed point precision
856 // to prevent attribute creep around the triangle vertices
857 vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
858 vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
859
860 // triangle setup - A and B edge equation coefs
861 __m128 vA, vB;
862 triangleSetupAB(vX, vY, vA, vB);
863
864 __m128i vAi, vBi;
865 triangleSetupABInt(vXi, vYi, vAi, vBi);
866
867 // determinant
868 float det = calcDeterminantInt(vAi, vBi);
869
870 // Verts in Pixel Coordinate Space at this point
871 // Det > 0 = CW winding order
872 // Convert CW triangles to CCW
// the float and the integer coefficients are negated together so both
// representations keep describing the same edges
873 if (det > 0.0)
874 {
875 vA = _mm_mul_ps(vA, _mm_set1_ps(-1));
876 vB = _mm_mul_ps(vB, _mm_set1_ps(-1));
877 vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
878 vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
879 det = -det;
880 }
881
882 __m128 vC;
883 // Finish triangle setup - C edge coef
884 triangleSetupC(vX, vY, vA, vB, vC);
885
886 if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
887 {
888 // If we have degenerate edge(s) to rasterize, set I and J coefs
889 // to 0 for constant interpolation of attributes
890 triDesc.I[0] = 0.0f;
891 triDesc.I[1] = 0.0f;
892 triDesc.I[2] = 0.0f;
893 triDesc.J[0] = 0.0f;
894 triDesc.J[1] = 0.0f;
895 triDesc.J[2] = 0.0f;
896
897 // Degenerate triangles have no area
898 triDesc.recipDet = 0.0f;
899 }
900 else
901 {
902 // only extract coefs for 2 of the barycentrics; the 3rd can be
903 // determined from the barycentric equation:
904 // i + j + k = 1 <=> k = 1 - j - i
// I takes lane 1 and J takes lane 2 of the A, B and C coefficient vectors
905 _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
906 _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
907 _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
908 _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
909 _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
910 _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
911
912 // compute recipDet, used to calculate barycentric i and j in the backend
913 triDesc.recipDet = 1.0f/det;
914 }
915
// OneOverW[0] and [1] are stored as differences against vertex 2,
// OneOverW[2] holds vertex 2's value (same layout as triDesc.Z below)
916 OSALIGNSIMD(float) oneOverW[4];
917 _mm_store_ps(oneOverW, vRecipW);
918 triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
919 triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
920 triDesc.OneOverW[2] = oneOverW[2];
921
922 // calculate perspective correct coefs per vertex attrib
923 float* pPerspAttribs = perspAttribsTLS;
924 float* pAttribs = workDesc.pAttribs;
925 triDesc.pPerspAttribs = pPerspAttribs;
926 triDesc.pAttribs = pAttribs;
927 float *pRecipW = workDesc.pTriBuffer + 12;
928 triDesc.pRecipW = pRecipW;
// the local pRecipW is advanced by the broadcasts below; triDesc.pRecipW
// was captured above and still points at the first 1/w value
929 __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
930 __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1);
931 __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1);
// each attribute occupies three consecutive 4-float vectors (one per vertex);
// every vector is scaled by that vertex's 1/w and written to pPerspAttribs
932 for(uint32_t i = 0; i < workDesc.numAttribs; i++)
933 {
934 __m128 attribA = _mm_load_ps(pAttribs);
935 __m128 attribB = _mm_load_ps(pAttribs+=4);
936 __m128 attribC = _mm_load_ps(pAttribs+=4);
937 pAttribs+=4;
938
939 attribA = _mm_mul_ps(attribA, vOneOverWV0);
940 attribB = _mm_mul_ps(attribB, vOneOverWV1);
941 attribC = _mm_mul_ps(attribC, vOneOverWV2);
942
943 _mm_store_ps(pPerspAttribs, attribA);
944 _mm_store_ps(pPerspAttribs+=4, attribB);
945 _mm_store_ps(pPerspAttribs+=4, attribC);
946 pPerspAttribs+=4;
947 }
948
949 // compute bary Z
950 // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
// NOTE(review): the stores below take the deltas against vertex 2 (a[2]),
// not vertex 0 as the formula above is written - confirm which is intended
951 OSALIGNSIMD(float) a[4];
952 _mm_store_ps(a, vZ);
953 triDesc.Z[0] = a[0] - a[2];
954 triDesc.Z[1] = a[1] - a[2];
955 triDesc.Z[2] = a[2];
956
957 // add depth bias
958 triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
959
960 // Calc bounding box of triangle
961 OSALIGNSIMD(BBOX) bbox;
962 calcBoundingBoxInt(vXi, vYi, bbox);
963
964 if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
965 {
966 // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
967 bbox.left--; bbox.right++; bbox.top--; bbox.bottom++;
968 SWR_ASSERT(state.scissorInFixedPoint.left >= 0 && state.scissorInFixedPoint.top >= 0,
969 "Conservative rast degenerate handling requires a valid scissor rect");
970 }
971
972 // Intersect with scissor/viewport
973 OSALIGNSIMD(BBOX) intersect;
974 intersect.left = std::max(bbox.left, state.scissorInFixedPoint.left);
975 intersect.right = std::min(bbox.right - 1, state.scissorInFixedPoint.right);
976 intersect.top = std::max(bbox.top, state.scissorInFixedPoint.top);
977 intersect.bottom = std::min(bbox.bottom - 1, state.scissorInFixedPoint.bottom);
978
979 triDesc.triFlags = workDesc.triFlags;
980
981 // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
982 uint32_t macroX, macroY;
983 MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
984 int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
985 int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
986 int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
987 int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
988
989 intersect.left = std::max(intersect.left, macroBoxLeft);
990 intersect.top = std::max(intersect.top, macroBoxTop);
991 intersect.right = std::min(intersect.right, macroBoxRight);
992 intersect.bottom = std::min(intersect.bottom, macroBoxBottom);
993
994 SWR_ASSERT(intersect.left <= intersect.right && intersect.top <= intersect.bottom && intersect.left >= 0 && intersect.right >= 0 && intersect.top >= 0 && intersect.bottom >= 0);
995
996 RDTSC_STOP(BETriangleSetup, 0, pDC->drawId);
997
998 // update triangle desc
// convert the fixed point bbox to raster tile indices
999 uint32_t minTileX = intersect.left >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1000 uint32_t minTileY = intersect.top >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1001 uint32_t maxTileX = intersect.right >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1002 uint32_t maxTileY = intersect.bottom >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1003 uint32_t numTilesX = maxTileX - minTileX + 1;
1004 uint32_t numTilesY = maxTileY - minTileY + 1;
1005
// the counts are unsigned: a count is 0 only when the max tile index is
// exactly one below the min tile index
1006 if (numTilesX == 0 || numTilesY == 0)
1007 {
1008 RDTSC_EVENT(BEEmptyTriangle, 1, 0);
1009 RDTSC_STOP(BERasterizeTriangle, 1, 0);
1010 return;
1011 }
1012
1013 RDTSC_START(BEStepSetup);
1014
1015 // Step to pixel center of top-left pixel of the triangle bbox
1016 // Align intersect bbox (top/left) to raster tile's (top/left).
1017 int32_t x = AlignDown(intersect.left, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
1018 int32_t y = AlignDown(intersect.top, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));
1019
1020 // convenience typedef
1021 typedef typename RT::NumRasterSamplesT NumRasterSamplesT;
1022
1023 // single sample rasterization evaluates edges at pixel center,
1024 // multisample evaluates edges UL pixel corner and steps to each sample position
1025 if(std::is_same<NumRasterSamplesT, SingleSampleT>::value)
1026 {
1027 // Add 0.5, in fixed point, to offset to pixel center
1028 x += (FIXED_POINT_SCALE / 2);
1029 y += (FIXED_POINT_SCALE / 2);
1030 }
1031
1032 __m128i vTopLeftX = _mm_set1_epi32(x);
1033 __m128i vTopLeftY = _mm_set1_epi32(y);
1034
1035 // evaluate edge equations at top-left pixel using 64bit math
1036 //
1037 // line = Ax + By + C
1038 // solving for C:
1039 // C = -Ax - By
1040 // we know x0 and y0 are on the line; plug them in:
1041 // C = -Ax0 - By0
1042 // plug C back into line equation:
1043 // line = Ax + By - Ax0 - By0
1044 // line = A(x - x0) + B(y - y0)
1045 // dX = (x-x0), dY = (y-y0)
1046 // so all this simplifies to
1047 // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
1048
1049 __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
1050 __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
1051
1052 // evaluate A(dx) and B(dY) for all points
// the 32-bit integers are widened to doubles before multiplying
1053 __m256d vAipd = _mm256_cvtepi32_pd(vAi);
1054 __m256d vBipd = _mm256_cvtepi32_pd(vBi);
1055 __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
1056 __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
1057
1058 __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
1059 __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
1060 __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
1061
1062 // apply any edge adjustments(top-left, crast, etc)
1063 adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);
1064
1065 // broadcast respective edge results to all lanes
// only entries 0..2 of the 7-entry array are written here; the array is also
// handed to ComputeScissorEdges below, which presumably fills the scissor
// edge entries when RT rasterizes them - TODO confirm
1066 double* pEdge = (double*)&vEdge;
1067 __m256d vEdgeFix16[7];
1068 vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
1069 vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
1070 vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);
1071
1072 OSALIGNSIMD(int32_t) aAi[4], aBi[4];
1073 _mm_store_si128((__m128i*)aAi, vAi);
1074 _mm_store_si128((__m128i*)aBi, vBi);
1075 EDGE rastEdges[RT::NumEdgesT::value];
1076
1077 // Compute and store triangle edge data
1078 ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
1079 ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
1080 ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);
1081
1082 // Compute and store triangle edge data if scissor needs to be rasterized
1083 ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
1084 (bbox, state.scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
1085
1086 // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
1087 // used for testing if entire raster tile is inside a triangle
1088 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1089 {
1090 vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
1091 }
1092
1093 // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
1094 // step sample positions to the raster tile bbox of multisample points
1095 // min(xSamples),min(ySamples) ------ max(xSamples),max(ySamples)
1096 // | |
1097 // | |
1098 // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples)
// vEdgeTileBbox is only written when NumRasterSamplesT::value > 1
1099 __m256d vEdgeTileBbox[3];
1100 if (NumRasterSamplesT::value > 1)
1101 {
1102 __m128i vTileSampleBBoxXh = RT::MT::TileSampleOffsetsX();
1103 __m128i vTileSampleBBoxYh = RT::MT::TileSampleOffsetsY();
1104
1105 __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
1106 __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
1107
1108 // step edge equation tests from Tile
1109 // used for testing if entire raster tile is inside a triangle
1110 for (uint32_t e = 0; e < 3; ++e)
1111 {
1112 __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
1113 __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
1114 vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
1115
1116 // adjust for msaa tile bbox edges outward for conservative rast, if enabled
1117 adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdgeTileBbox[e]);
1118 }
1119 }
1120
1121 RDTSC_STOP(BEStepSetup, 0, pDC->drawId);
1122
1123 uint32_t tY = minTileY;
1124 uint32_t tX = minTileX;
1125 uint32_t maxY = maxTileY;
1126 uint32_t maxX = maxTileX;
1127
// renderBuffers is stepped tile by tile; currentRenderBufferRow keeps the
// start of the current tile row and is passed to StepRasterTileY
1128 RenderOutputBuffers renderBuffers, currentRenderBufferRow;
1129 GetRenderHotTiles<RT::MT::numSamples>(pDC, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
1130 currentRenderBufferRow = renderBuffers;
1131
1132 // rasterize and generate coverage masks per sample
1133 for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
1134 {
// remember the edge values at the start of this row; the Y step at the
// bottom of the loop is applied to these saved values
1135 __m256d vStartOfRowEdge[RT::NumEdgesT::value];
1136 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1137 {
1138 vStartOfRowEdge[e] = vEdgeFix16[e];
1139 }
1140
1141 for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
1142 {
1143 triDesc.anyCoveredSamples = 0;
1144
1145 // is the corner of the edge outside of the raster tile? (vEdge < 0)
1146 int mask0, mask1, mask2;
1147 UpdateEdgeMasks<NumRasterSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);
1148
1149 for (uint32_t sampleNum = 0; sampleNum < NumRasterSamplesT::value; sampleNum++)
1150 {
1151 // trivial reject, at least one edge has all 4 corners of raster tile outside
1152 bool trivialReject = TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);
1153
1154 if (!trivialReject)
1155 {
1156 // trivial accept mask
1157 triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
1158
1159 // Update the raster tile edge masks based on inner conservative edge offsets, if enabled
1160 UpdateEdgeMasksInnerConservative<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>
1161 (vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);
1162
1163 if (TrivialAcceptTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2))
1164 {
1165 // trivial accept, all 4 corners of all 3 edges are negative
1166 // i.e. raster tile completely inside triangle
1167 triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
1168 if(std::is_same<typename RT::InputCoverageT, InnerConservativeCoverageT>::value)
1169 {
1170 triDesc.innerCoverageMask = 0xffffffffffffffffULL;
1171 }
1172 RDTSC_EVENT(BETrivialAccept, 1, 0);
1173 }
1174 else
1175 {
1176 __m256d vEdgeAtSample[RT::NumEdgesT::value];
1177 if(std::is_same<NumRasterSamplesT, SingleSampleT>::value)
1178 {
1179 // should get optimized out for single sample case (global value numbering or copy propagation)
1180 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1181 {
1182 vEdgeAtSample[e] = vEdgeFix16[e];
1183 }
1184 }
1185 else
1186 {
1187 __m128i vSampleOffsetXh = RT::MT::vXi(sampleNum);
1188 __m128i vSampleOffsetYh = RT::MT::vYi(sampleNum);
1189 __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
1190 __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
1191
1192 // step edge equation tests from UL tile corner to pixel sample position
1193 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1194 {
1195 __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
1196 __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
1197 vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
1198 vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
1199 }
1200 }
1201
// the store mask enables only the lowest 64-bit lane, so exactly one
// double per edge is written into startQuadEdges
1202 double startQuadEdges[RT::NumEdgesT::value];
1203 const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
1204 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1205 {
1206 _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
1207 }
1208
1209 // not trivial accept or reject, must rasterize full tile
1210 RDTSC_START(BERasterizePartial);
1211 triDesc.coverageMask[sampleNum] = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdges, rastEdges);
1212 RDTSC_STOP(BERasterizePartial, 0, 0);
1213
1214 triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];
1215
1216 // Output SV InnerCoverage, if needed
1217 GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
1218 }
1219 }
1220 else
1221 {
1222 // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything
1223 if(NumRasterSamplesT::value > 1)
1224 {
1225 triDesc.coverageMask[sampleNum] = 0;
1226 }
1227 RDTSC_EVENT(BETrivialReject, 1, 0);
1228 }
1229 }
1230
// with toss points enabled and KNOB_TOSS_RS set, the backend call is skipped
// and only the first coverage mask is written to gToss
1231 #if KNOB_ENABLE_TOSS_POINTS
1232 if(KNOB_TOSS_RS)
1233 {
1234 gToss = triDesc.coverageMask[0];
1235 }
1236 else
1237 #endif
1238 if(triDesc.anyCoveredSamples)
1239 {
1240 // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
1241 // copy conservative coverage result to all samples
1242 if(RT::IsConservativeT::value)
1243 {
1244 auto copyCoverage = [&](int sample){triDesc.coverageMask[sample] = triDesc.coverageMask[0]; };
1245 UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
1246 }
1247
1248 RDTSC_START(BEPixelBackend);
1249 backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
1250 RDTSC_STOP(BEPixelBackend, 0, 0);
1251 }
1252
1253 // step to the next tile in X
1254 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1255 {
1256 vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
1257 }
1258 StepRasterTileX<RT>(state.psState.numRenderTargets, renderBuffers);
1259 }
1260
1261 // step to the next tile in Y
1262 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1263 {
1264 vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
1265 }
1266 StepRasterTileY<RT>(state.psState.numRenderTargets, renderBuffers, currentRenderBufferRow);
1267 }
1268
1269 RDTSC_STOP(BERasterizeTriangle, 1, 0);
1270 }
1271
1272 void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
1273 {
1274 const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
1275 const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
1276 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1277
1278 bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0;
1279
1280 // load point vertex
1281 float x = *workDesc.pTriBuffer;
1282 float y = *(workDesc.pTriBuffer + 1);
1283 float z = *(workDesc.pTriBuffer + 2);
1284
1285 // create a copy of the triangle buffer to write our adjusted vertices to
1286 OSALIGNSIMD(float) newTriBuffer[4 * 4];
1287 TRIANGLE_WORK_DESC newWorkDesc = workDesc;
1288 newWorkDesc.pTriBuffer = &newTriBuffer[0];
1289
1290 // create a copy of the attrib buffer to write our adjusted attribs to
1291 OSALIGNSIMD(float) newAttribBuffer[4 * 3 * KNOB_NUM_ATTRIBUTES];
1292 newWorkDesc.pAttribs = &newAttribBuffer[0];
1293
1294 newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
1295 newWorkDesc.numAttribs = workDesc.numAttribs;
1296 newWorkDesc.triFlags = workDesc.triFlags;
1297
1298 // construct two tris by bloating point by point size
1299 float halfPointSize = workDesc.triFlags.pointSize * 0.5f;
1300 float lowerX = x - halfPointSize;
1301 float upperX = x + halfPointSize;
1302 float lowerY = y - halfPointSize;
1303 float upperY = y + halfPointSize;
1304
1305 // tri 0
1306 float *pBuf = &newTriBuffer[0];
1307 *pBuf++ = lowerX;
1308 *pBuf++ = lowerX;
1309 *pBuf++ = upperX;
1310 pBuf++;
1311 *pBuf++ = lowerY;
1312 *pBuf++ = upperY;
1313 *pBuf++ = upperY;
1314 pBuf++;
1315 _mm_store_ps(pBuf, _mm_set1_ps(z));
1316 _mm_store_ps(pBuf+=4, _mm_set1_ps(1.0f));
1317
1318 // setup triangle rasterizer function
1319 PFN_WORK_FUNC pfnTriRast;
1320 // for center sample pattern, all samples are at pixel center; calculate coverage
1321 // once at center and broadcast the results in the backend
1322 uint32_t sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
1323 // conservative rast not supported for points/lines
1324 pfnTriRast = GetRasterizerFunc(sampleCount, false, SWR_INPUT_COVERAGE_NONE, ALL_EDGES_VALID, (rastState.scissorEnable > 0));
1325
1326 // overwrite texcoords for point sprites
1327 if (isPointSpriteTexCoordEnabled)
1328 {
1329 // copy original attribs
1330 memcpy(&newAttribBuffer[0], workDesc.pAttribs, 4 * 3 * workDesc.numAttribs * sizeof(float));
1331 newWorkDesc.pAttribs = &newAttribBuffer[0];
1332
1333 // overwrite texcoord for point sprites
1334 uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
1335 DWORD texCoordAttrib = 0;
1336
1337 while (_BitScanForward(&texCoordAttrib, texCoordMask))
1338 {
1339 texCoordMask &= ~(1 << texCoordAttrib);
1340 __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
1341 if (rastState.pointSpriteTopOrigin)
1342 {
1343 pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
1344 pTexAttrib[1] = _mm_set_ps(1, 0, 1, 0);
1345 pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
1346 }
1347 else
1348 {
1349 pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
1350 pTexAttrib[1] = _mm_set_ps(1, 0, 0, 0);
1351 pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
1352 }
1353 }
1354 }
1355 else
1356 {
1357 // no texcoord overwrite, can reuse the attrib buffer from frontend
1358 newWorkDesc.pAttribs = workDesc.pAttribs;
1359 }
1360
1361 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
1362
1363 // tri 1
1364 pBuf = &newTriBuffer[0];
1365 *pBuf++ = lowerX;
1366 *pBuf++ = upperX;
1367 *pBuf++ = upperX;
1368 pBuf++;
1369 *pBuf++ = lowerY;
1370 *pBuf++ = upperY;
1371 *pBuf++ = lowerY;
1372 // z, w unchanged
1373
1374 if (isPointSpriteTexCoordEnabled)
1375 {
1376 uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
1377 DWORD texCoordAttrib = 0;
1378
1379 while (_BitScanForward(&texCoordAttrib, texCoordMask))
1380 {
1381 texCoordMask &= ~(1 << texCoordAttrib);
1382 __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
1383 if (rastState.pointSpriteTopOrigin)
1384 {
1385 pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
1386 pTexAttrib[1] = _mm_set_ps(1, 0, 1, 1);
1387 pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
1388
1389 }
1390 else
1391 {
1392 pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
1393 pTexAttrib[1] = _mm_set_ps(1, 0, 0, 1);
1394 pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
1395 }
1396 }
1397 }
1398
1399 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
1400 }
1401
1402 void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
1403 {
1404 #if KNOB_ENABLE_TOSS_POINTS
1405 if (KNOB_TOSS_BIN_TRIS)
1406 {
1407 return;
1408 }
1409 #endif
1410
1411 const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
1412 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
1413
1414 // map x,y relative offsets from start of raster tile to bit position in
1415 // coverage mask for the point
1416 static const uint32_t coverageMap[8][8] = {
1417 { 0, 1, 4, 5, 8, 9, 12, 13 },
1418 { 2, 3, 6, 7, 10, 11, 14, 15 },
1419 { 16, 17, 20, 21, 24, 25, 28, 29 },
1420 { 18, 19, 22, 23, 26, 27, 30, 31 },
1421 { 32, 33, 36, 37, 40, 41, 44, 45 },
1422 { 34, 35, 38, 39, 42, 43, 46, 47 },
1423 { 48, 49, 52, 53, 56, 57, 60, 61 },
1424 { 50, 51, 54, 55, 58, 59, 62, 63 }
1425 };
1426
1427 OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
1428
1429 // pull point information from triangle buffer
1430 // @todo use structs for readability
1431 uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer;
1432 uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1);
1433 float z = *(workDesc.pTriBuffer + 2);
1434
1435 // construct triangle descriptor for point
1436 // no interpolation, set up i,j for constant interpolation of z and attribs
1437 // @todo implement an optimized backend that doesn't require triangle information
1438
1439 // compute coverage mask from x,y packed into the coverageMask flag
1440 // mask indices by the maximum valid index for x/y of coveragemap.
1441 uint32_t tX = workDesc.triFlags.coverageMask & 0x7;
1442 uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7;
1443 // todo: multisample points?
1444 triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX];
1445
1446 // no persp divide needed for points
1447 triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
1448 triDesc.triFlags = workDesc.triFlags;
1449 triDesc.recipDet = 1.0f;
1450 triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f;
1451 triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f;
1452 triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f;
1453 triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z;
1454
1455 RenderOutputBuffers renderBuffers;
1456 GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT,
1457 renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
1458
1459 RDTSC_START(BEPixelBackend);
1460 backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
1461 RDTSC_STOP(BEPixelBackend, 0, 0);
1462 }
1463
1464 // Get pointers to hot tile memory for color RT, depth, stencil
1465 template <uint32_t numSamples>
1466 void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex)
1467 {
1468 const API_STATE& state = GetApiState(pDC);
1469 SWR_CONTEXT *pContext = pDC->pContext;
1470
1471 uint32_t mx, my;
1472 MacroTileMgr::getTileIndices(macroID, mx, my);
1473 tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
1474 tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;
1475
1476 // compute tile offset for active hottile buffers
1477 const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
1478 uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1479 offset*=numSamples;
1480
1481 unsigned long rtSlot = 0;
1482 uint32_t colorHottileEnableMask = state.colorHottileEnable;
1483 while(_BitScanForward(&rtSlot, colorHottileEnableMask))
1484 {
1485 HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true,
1486 numSamples, renderTargetArrayIndex);
1487 pColor->state = HOTTILE_DIRTY;
1488 renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
1489
1490 colorHottileEnableMask &= ~(1 << rtSlot);
1491 }
1492 if(state.depthHottileEnable)
1493 {
1494 const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
1495 uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1496 offset*=numSamples;
1497 HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true,
1498 numSamples, renderTargetArrayIndex);
1499 pDepth->state = HOTTILE_DIRTY;
1500 SWR_ASSERT(pDepth->pBuffer != nullptr);
1501 renderBuffers.pDepth = pDepth->pBuffer + offset;
1502 }
1503 if(state.stencilHottileEnable)
1504 {
1505 const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
1506 uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1507 offset*=numSamples;
1508 HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true,
1509 numSamples, renderTargetArrayIndex);
1510 pStencil->state = HOTTILE_DIRTY;
1511 SWR_ASSERT(pStencil->pBuffer != nullptr);
1512 renderBuffers.pStencil = pStencil->pBuffer + offset;
1513 }
1514 }
1515
1516 template <typename RT>
1517 INLINE void StepRasterTileX(uint32_t NumRT, RenderOutputBuffers &buffers)
1518 {
1519 for(uint32_t rt = 0; rt < NumRT; ++rt)
1520 {
1521 buffers.pColor[rt] += RT::colorRasterTileStep;
1522 }
1523
1524 buffers.pDepth += RT::depthRasterTileStep;
1525 buffers.pStencil += RT::stencilRasterTileStep;
1526 }
1527
1528 template <typename RT>
1529 INLINE void StepRasterTileY(uint32_t NumRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow)
1530 {
1531 for(uint32_t rt = 0; rt < NumRT; ++rt)
1532 {
1533 startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
1534 buffers.pColor[rt] = startBufferRow.pColor[rt];
1535 }
1536 startBufferRow.pDepth += RT::depthRasterTileRowStep;
1537 buffers.pDepth = startBufferRow.pDepth;
1538
1539 startBufferRow.pStencil += RT::stencilRasterTileRowStep;
1540 buffers.pStencil = startBufferRow.pStencil;
1541 }
1542
1543 void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
1544 {
1545 const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pData);
1546 #if KNOB_ENABLE_TOSS_POINTS
1547 if (KNOB_TOSS_BIN_TRIS)
1548 {
1549 return;
1550 }
1551 #endif
1552
1553 // bloat line to two tris and call the triangle rasterizer twice
1554 RDTSC_START(BERasterizeLine);
1555
1556 const API_STATE &state = GetApiState(pDC);
1557 const SWR_RASTSTATE &rastState = state.rastState;
1558
1559 // macrotile dimensioning
1560 uint32_t macroX, macroY;
1561 MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
1562 int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
1563 int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
1564 int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
1565 int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
1566
1567 // create a copy of the triangle buffer to write our adjusted vertices to
1568 OSALIGNSIMD(float) newTriBuffer[4 * 4];
1569 TRIANGLE_WORK_DESC newWorkDesc = workDesc;
1570 newWorkDesc.pTriBuffer = &newTriBuffer[0];
1571
1572 // create a copy of the attrib buffer to write our adjusted attribs to
1573 OSALIGNSIMD(float) newAttribBuffer[4 * 3 * KNOB_NUM_ATTRIBUTES];
1574 newWorkDesc.pAttribs = &newAttribBuffer[0];
1575
1576 const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f);
1577 const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f);
1578
1579 __m128 vX, vY, vZ, vRecipW;
1580
1581 vX = _mm_load_ps(workDesc.pTriBuffer);
1582 vY = _mm_load_ps(workDesc.pTriBuffer + 4);
1583 vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
1584 vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
1585
1586 // triangle 0
1587 // v0,v1 -> v0,v0,v1
1588 __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0));
1589 __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0));
1590 __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0));
1591 __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0));
1592
1593 __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth);
1594 __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0);
1595 if (workDesc.triFlags.yMajor)
1596 {
1597 vXa = _mm_add_ps(vAdjust, vXa);
1598 }
1599 else
1600 {
1601 vYa = _mm_add_ps(vAdjust, vYa);
1602 }
1603
1604 // Store triangle description for rasterizer
1605 _mm_store_ps((float*)&newTriBuffer[0], vXa);
1606 _mm_store_ps((float*)&newTriBuffer[4], vYa);
1607 _mm_store_ps((float*)&newTriBuffer[8], vZa);
1608 _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
1609
1610 // binner bins 3 edges for lines as v0, v1, v1
1611 // tri0 needs v0, v0, v1
1612 for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
1613 {
1614 __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a*12 + 0]);
1615 __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a*12 + 4]);
1616
1617 _mm_store_ps((float*)&newAttribBuffer[a*12 + 0], vAttrib0);
1618 _mm_store_ps((float*)&newAttribBuffer[a*12 + 4], vAttrib0);
1619 _mm_store_ps((float*)&newAttribBuffer[a*12 + 8], vAttrib1);
1620 }
1621
1622 // Store user clip distances for triangle 0
1623 float newClipBuffer[3 * 8];
1624 uint32_t numClipDist = _mm_popcnt_u32(state.rastState.clipDistanceMask);
1625 if (numClipDist)
1626 {
1627 newWorkDesc.pUserClipBuffer = newClipBuffer;
1628
1629 float* pOldBuffer = workDesc.pUserClipBuffer;
1630 float* pNewBuffer = newClipBuffer;
1631 for (uint32_t i = 0; i < numClipDist; ++i)
1632 {
1633 // read barycentric coeffs from binner
1634 float a = *(pOldBuffer++);
1635 float b = *(pOldBuffer++);
1636
1637 // reconstruct original clip distance at vertices
1638 float c0 = a + b;
1639 float c1 = b;
1640
1641 // construct triangle barycentrics
1642 *(pNewBuffer++) = c0 - c1;
1643 *(pNewBuffer++) = c0 - c1;
1644 *(pNewBuffer++) = c1;
1645 }
1646 }
1647
1648 // setup triangle rasterizer function
1649 PFN_WORK_FUNC pfnTriRast;
1650 uint32_t sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
1651 // conservative rast not supported for points/lines
1652 pfnTriRast = GetRasterizerFunc(sampleCount, false, SWR_INPUT_COVERAGE_NONE, ALL_EDGES_VALID, (rastState.scissorEnable > 0));
1653
1654 // make sure this macrotile intersects the triangle
1655 __m128i vXai = fpToFixedPoint(vXa);
1656 __m128i vYai = fpToFixedPoint(vYa);
1657 OSALIGNSIMD(BBOX) bboxA;
1658 calcBoundingBoxInt(vXai, vYai, bboxA);
1659
1660 if (!(bboxA.left > macroBoxRight ||
1661 bboxA.left > state.scissorInFixedPoint.right ||
1662 bboxA.right - 1 < macroBoxLeft ||
1663 bboxA.right - 1 < state.scissorInFixedPoint.left ||
1664 bboxA.top > macroBoxBottom ||
1665 bboxA.top > state.scissorInFixedPoint.bottom ||
1666 bboxA.bottom - 1 < macroBoxTop ||
1667 bboxA.bottom - 1 < state.scissorInFixedPoint.top)) {
1668 // rasterize triangle
1669 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
1670 }
1671
1672 // triangle 1
1673 // v0,v1 -> v1,v1,v0
1674 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1));
1675 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1));
1676 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1));
1677 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1));
1678
1679 vAdjust = _mm_mul_ps(vLineWidth, vBloat1);
1680 if (workDesc.triFlags.yMajor)
1681 {
1682 vXa = _mm_add_ps(vAdjust, vXa);
1683 }
1684 else
1685 {
1686 vYa = _mm_add_ps(vAdjust, vYa);
1687 }
1688
1689 // Store triangle description for rasterizer
1690 _mm_store_ps((float*)&newTriBuffer[0], vXa);
1691 _mm_store_ps((float*)&newTriBuffer[4], vYa);
1692 _mm_store_ps((float*)&newTriBuffer[8], vZa);
1693 _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
1694
1695 // binner bins 3 edges for lines as v0, v1, v1
1696 // tri1 needs v1, v1, v0
1697 for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
1698 {
1699 __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
1700 __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
1701
1702 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1);
1703 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1);
1704 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0);
1705 }
1706
1707 // store user clip distance for triangle 1
1708 if (numClipDist)
1709 {
1710 float* pOldBuffer = workDesc.pUserClipBuffer;
1711 float* pNewBuffer = newClipBuffer;
1712 for (uint32_t i = 0; i < numClipDist; ++i)
1713 {
1714 // read barycentric coeffs from binner
1715 float a = *(pOldBuffer++);
1716 float b = *(pOldBuffer++);
1717
1718 // reconstruct original clip distance at vertices
1719 float c0 = a + b;
1720 float c1 = b;
1721
1722 // construct triangle barycentrics
1723 *(pNewBuffer++) = c1 - c0;
1724 *(pNewBuffer++) = c1 - c0;
1725 *(pNewBuffer++) = c0;
1726 }
1727 }
1728
1729 vXai = fpToFixedPoint(vXa);
1730 vYai = fpToFixedPoint(vYa);
1731 calcBoundingBoxInt(vXai, vYai, bboxA);
1732
1733 if (!(bboxA.left > macroBoxRight ||
1734 bboxA.left > state.scissorInFixedPoint.right ||
1735 bboxA.right - 1 < macroBoxLeft ||
1736 bboxA.right - 1 < state.scissorInFixedPoint.left ||
1737 bboxA.top > macroBoxBottom ||
1738 bboxA.top > state.scissorInFixedPoint.bottom ||
1739 bboxA.bottom - 1 < macroBoxTop ||
1740 bboxA.bottom - 1 < state.scissorInFixedPoint.top)) {
1741 // rasterize triangle
1742 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
1743 }
1744
1745 RDTSC_STOP(BERasterizeLine, 1, 0);
1746 }
1747
1748 struct RasterizerChooser
1749 {
1750 typedef PFN_WORK_FUNC FuncType;
1751
1752 template <typename... ArgsB>
1753 static FuncType GetFunc()
1754 {
1755 return RasterizeTriangle<RasterizerTraits<ArgsB...>>;
1756 }
1757 };
1758
1759 // Selector for correct templated RasterizeTriangle function
1760 PFN_WORK_FUNC GetRasterizerFunc(
1761 uint32_t numSamples,
1762 bool IsConservative,
1763 uint32_t InputCoverage,
1764 uint32_t EdgeEnable,
1765 bool RasterizeScissorEdges
1766 )
1767 {
1768 return TemplateArgUnroller<RasterizerChooser>::GetFunc(
1769 IntArg<SWR_MULTISAMPLE_1X,SWR_MULTISAMPLE_TYPE_COUNT-1>{numSamples},
1770 IsConservative,
1771 IntArg<SWR_INPUT_COVERAGE_NONE, SWR_INPUT_COVERAGE_COUNT-1>{InputCoverage},
1772 IntArg<0, VALID_TRI_EDGE_COUNT-1>{EdgeEnable},
1773 RasterizeScissorEdges);
1774 }