c5ef072de39dca559153cdc43eb4d3363ca4566a
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / rasterizer.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file rasterizer.cpp
24 *
25 * @brief Implementation for the rasterizer.
26 *
27 ******************************************************************************/
28
#include <algorithm>
#include <cstring>
#include <vector>

#include "rasterizer.h"
#include "rdtsc_core.h"
#include "backend.h"
#include "utils.h"
#include "frontend.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
39
40 template <uint32_t numSamples = 1>
41 void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex);
42 template <typename RT>
43 void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers);
44 template <typename RT>
45 void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow);
46
// Builds the initializer for one 4-lane double vector from four 0/1 flags.
// The flags are listed most-significant first (i3..i0) but stored lane 0
// first, and each flag is negated: 1 becomes -1 (sign bit set), 0 stays 0.
// Arguments are parenthesized so an expression argument negates as a whole.
#define MASKTOVEC(i3,i2,i1,i0) {-(i0),-(i1),-(i2),-(i3)}

// Lookup table indexed by a 4-bit mask: lane n of entry m has its sign bit
// set exactly when bit n of m is set. Used as the selector for
// _mm256_blendv_pd, which picks per lane based on the sign bit.
const __m256d gMaskToVecpd[] =
{
    MASKTOVEC(0, 0, 0, 0),  // 0x0
    MASKTOVEC(0, 0, 0, 1),  // 0x1
    MASKTOVEC(0, 0, 1, 0),  // 0x2
    MASKTOVEC(0, 0, 1, 1),  // 0x3
    MASKTOVEC(0, 1, 0, 0),  // 0x4
    MASKTOVEC(0, 1, 0, 1),  // 0x5
    MASKTOVEC(0, 1, 1, 0),  // 0x6
    MASKTOVEC(0, 1, 1, 1),  // 0x7
    MASKTOVEC(1, 0, 0, 0),  // 0x8
    MASKTOVEC(1, 0, 0, 1),  // 0x9
    MASKTOVEC(1, 0, 1, 0),  // 0xa
    MASKTOVEC(1, 0, 1, 1),  // 0xb
    MASKTOVEC(1, 1, 0, 0),  // 0xc
    MASKTOVEC(1, 1, 0, 1),  // 0xd
    MASKTOVEC(1, 1, 1, 0),  // 0xe
    MASKTOVEC(1, 1, 1, 1),  // 0xf
};
67
// Integer 2D position. ComputeEdgeData() takes two of these as the end
// points of an edge.
struct POS
{
    int32_t x;
    int32_t y;
};
72
// Per-edge data precomputed by ComputeEdgeData() and consumed while
// stepping an edge equation across quads and raster tiles.
struct EDGE
{
    double a;                   // 'a' edge coefficient in fix8
    double b;                   // 'b' edge coefficient in fix8
    double stepQuadX;           // step to adjacent horizontal quad in fix16
    double stepQuadY;           // step to adjacent vertical quad in fix16
    double stepRasterTileX;     // step to adjacent horizontal raster tile in fix16
    double stepRasterTileY;     // step to adjacent vertical raster tile in fix16

    __m256d vQuadOffsets;       // offsets for 4 samples of a quad
    __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
};
84
85 //////////////////////////////////////////////////////////////////////////
86 /// @brief rasterize a raster tile partially covered by the triangle
87 /// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster tile
88 /// @param vA, vB - A & B coefs for each edge of the triangle (Ax + Bx + C)
89 /// @param vStepQuad0-2 - edge equations evaluated at the UL corners of the 2x2 pixel quad.
90 /// Used to step between quads when sweeping over the raster tile.
91 template<uint32_t NumEdges, typename EdgeMaskT>
92 INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges)
93 {
94 uint64_t coverageMask = 0;
95
96 __m256d vEdges[NumEdges];
97 __m256d vStepX[NumEdges];
98 __m256d vStepY[NumEdges];
99
100 for (uint32_t e = 0; e < NumEdges; ++e)
101 {
102 // Step to the pixel sample locations of the 1st quad
103 vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);
104
105 // compute step to next quad (mul by 2 in x and y direction)
106 vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
107 vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
108 }
109
110 // fast unrolled version for 8x8 tile
111 #if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
112 int edgeMask[NumEdges];
113 uint64_t mask;
114
115 auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);};
116 auto update_lambda = [&](int e){mask &= edgeMask[e];};
117 auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);};
118 auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);};
119 auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);};
120
121 // evaluate which pixels in the quad are covered
122 #define EVAL \
123 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
124
125 // update coverage mask
126 #define UPDATE_MASK(bit) \
127 mask = edgeMask[0]; \
128 UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
129 coverageMask |= (mask << bit);
130
131 // step in the +x direction to the next quad
132 #define INCX \
133 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
134
135 // step in the +y direction to the next quad
136 #define INCY \
137 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
138
139 // step in the -x direction to the next quad
140 #define DECX \
141 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
142
143 // sweep 2x2 quad back and forth through the raster tile,
144 // computing coverage masks for the entire tile
145
146 // raster tile
147 // 0 1 2 3 4 5 6 7
148 // x x
149 // x x ------------------>
150 // x x |
151 // <-----------------x x V
152 // ..
153
154 // row 0
155 EVAL;
156 UPDATE_MASK(0);
157 INCX;
158 EVAL;
159 UPDATE_MASK(4);
160 INCX;
161 EVAL;
162 UPDATE_MASK(8);
163 INCX;
164 EVAL;
165 UPDATE_MASK(12);
166 INCY;
167
168 //row 1
169 EVAL;
170 UPDATE_MASK(28);
171 DECX;
172 EVAL;
173 UPDATE_MASK(24);
174 DECX;
175 EVAL;
176 UPDATE_MASK(20);
177 DECX;
178 EVAL;
179 UPDATE_MASK(16);
180 INCY;
181
182 // row 2
183 EVAL;
184 UPDATE_MASK(32);
185 INCX;
186 EVAL;
187 UPDATE_MASK(36);
188 INCX;
189 EVAL;
190 UPDATE_MASK(40);
191 INCX;
192 EVAL;
193 UPDATE_MASK(44);
194 INCY;
195
196 // row 3
197 EVAL;
198 UPDATE_MASK(60);
199 DECX;
200 EVAL;
201 UPDATE_MASK(56);
202 DECX;
203 EVAL;
204 UPDATE_MASK(52);
205 DECX;
206 EVAL;
207 UPDATE_MASK(48);
208 #else
209 uint32_t bit = 0;
210 for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y)
211 {
212 __m256d vStartOfRowEdge[NumEdges];
213 for (uint32_t e = 0; e < NumEdges; ++e)
214 {
215 vStartOfRowEdge[e] = vEdges[e];
216 }
217
218 for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x)
219 {
220 int edgeMask[NumEdges];
221 for (uint32_t e = 0; e < NumEdges; ++e)
222 {
223 edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
224 }
225
226 uint64_t mask = edgeMask[0];
227 for (uint32_t e = 1; e < NumEdges; ++e)
228 {
229 mask &= edgeMask[e];
230 }
231 coverageMask |= (mask << bit);
232
233 // step to the next pixel in the x
234 for (uint32_t e = 0; e < NumEdges; ++e)
235 {
236 vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
237 }
238 bit+=4;
239 }
240
241 // step to the next row
242 for (uint32_t e = 0; e < NumEdges; ++e)
243 {
244 vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
245 }
246 }
247 #endif
248 return coverageMask;
249
250 }
251 // Top left rule:
252 // Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
253 // Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge
254 // Top left: a sample is in if it is a top or left edge.
255 // Out: !(horizontal && above) = !horizontal && below
256 // Out: !horizontal && left = !(!horizontal && left) = horizontal and right
257 INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge)
258 {
259 // if vA < 0, vC--
260 // if vA == 0 && vB < 0, vC--
261
262 __m256d vEdgeOut = vEdge;
263 __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));
264
265 // if vA < 0 (line is not horizontal and below)
266 int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));
267
268 // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
269 __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
270 int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
271 msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));
272
273 // if either of these are true and we're on the line (edge == 0), bump it outside the line
274 vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
275 }
276
//////////////////////////////////////////////////////////////////////////
/// @brief Compile-time difference, in bits, between the precision of the
///        manh product (rasterizer precision bits + conservative precision
///        bits) and the edge precision.
/// @tparam RT - rasterizer traits providing PrecisionT, ConservativePrecisionT
///         and EdgePrecisionT, each with a BitsT::value
/// @return (precision bits + conservative bits) - edge bits
template<typename RT>
constexpr int64_t ManhToEdgePrecisionAdjust()
{
    using RastBitsT = typename RT::PrecisionT::BitsT;
    using ConservativeBitsT = typename RT::ConservativePrecisionT::BitsT;
    using EdgeBitsT = typename RT::EdgePrecisionT::BitsT;

    // the manh result must carry at least as many fractional bits as an edge
    static_assert(RastBitsT::value + ConservativeBitsT::value >= EdgeBitsT::value,
                  "Inadequate precision of result of manh calculation ");

    return ((RastBitsT::value + ConservativeBitsT::value) - EdgeBitsT::value);
}
287
288 //////////////////////////////////////////////////////////////////////////
289 /// @struct adjustEdgeConservative
290 /// @brief Primary template definition used for partially specializing
291 /// the adjustEdgeConservative function. This struct should never
292 /// be instantiated.
293 /// @tparam RT: rasterizer traits
294 /// @tparam IsConservativeT: is conservative rast enabled?
295 template <typename RT, typename IsConservativeT>
296 struct adjustEdgeConservative
297 {
298 INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) = delete;
299 };
300
301 //////////////////////////////////////////////////////////////////////////
302 /// @brief adjustEdgeConservative<RT, std::true_type> specialization
303 /// of adjustEdgeConservative. Used for conservative rasterization specific
304 /// edge adjustments
305 template <typename RT>
306 struct adjustEdgeConservative<RT, std::true_type>
307 {
308 //////////////////////////////////////////////////////////////////////////
309 /// @brief Performs calculations to adjust each edge of a triangle away
310 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
311 /// direction.
312 ///
313 /// Uncertainty regions arise from fixed point rounding, which
314 /// can snap a vertex +/- by min fixed point value.
315 /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
316 /// This allows the rasterizer to test for coverage only at the pixel center,
317 /// instead of having to test individual pixel corners for conservative coverage
318 INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
319 {
320 // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away
321 // from the pixel center (in the direction of the edge normal A/B)
322
323 // edge = Ax + Bx + C - (manh/e)
324 // manh = manhattan distance = abs(A) + abs(B)
325 // e = absolute rounding error from snapping from float to fixed point precision
326
327 // 'fixed point' multiply (in double to be avx1 friendly)
328 // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
329 __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
330 __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(RT::ConservativeEdgeOffsetT::value)),
331 _mm256_mul_pd(vBai, _mm256_set1_pd(RT::ConservativeEdgeOffsetT::value)));
332
333 static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
334 "Inadequate precision of result of manh calculation ");
335
336 // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision
337 // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right
338 manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5));
339
340 // move the edge away from the pixel center by the required conservative precision + 1/2 pixel
341 // this allows the rasterizer to do a single conservative coverage test to see if the primitive
342 // intersects the pixel at all
343 vEdge = _mm256_sub_pd(vEdge, manh);
344 };
345 };
346
347 //////////////////////////////////////////////////////////////////////////
348 /// @brief adjustEdgeConservative<RT, std::false_type> specialization
349 /// of adjustEdgeConservative. Allows code to be generically called; when
350 /// IsConservativeT trait is disabled this inlines an empty function, which
351 /// should get optimized out.
352 template <typename RT>
353 struct adjustEdgeConservative<RT, std::false_type>
354 {
355 INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge){};
356 };
357
358 //////////////////////////////////////////////////////////////////////////
359 /// @brief calculates the distance a degenerate BBox needs to be adjusted
360 /// for conservative rast based on compile time trait values
361 template<typename RT>
362 constexpr int64_t ConservativeScissorOffset()
363 {
364 static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, "Rasterizer precision > conservative precision");
365 // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges
366 typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1> DegenerateEdgeOffsetT;
367 // 1/2 pixel edge offset + conservative offset - degenerateTriangle
368 return RT::ConservativeEdgeOffsetT::value - (DegenerateEdgeOffsetT::value << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
369 }
370
371 //////////////////////////////////////////////////////////////////////////
372 /// @brief Performs calculations to adjust each a scalar edge out
373 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
374 /// direction.
375 template <typename RT>
376 INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge)
377 {
378 int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
379 int64_t manh = ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> ManhToEdgePrecisionAdjust<RT>();
380 vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh));
381 };
382
383 //////////////////////////////////////////////////////////////////////////
384 /// @brief Perform any needed adjustments to evaluated triangle edges
385 template <typename RT>
386 INLINE void adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
387 {
388 static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
389 "Edge equation expected to be in x.16 fixed point");
390 // need to offset the edge before applying the top-left rule
391 adjustEdgeConservative<RT, typename RT::IsConservativeT>(vAi, vBi, vEdge);
392
393 adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
394 }
395
396 // max(abs(dz/dx), abs(dz,dy)
397 INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
398 {
399 /*
400 // evaluate i,j at (0,0)
401 float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
402 float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
403
404 // evaluate i,j at (1,0)
405 float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
406 float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
407
408 // compute dz/dx
409 float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
410 float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
411 float dzdx = abs(d10 - d00);
412
413 // evaluate i,j at (0,1)
414 float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
415 float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
416
417 float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
418 float dzdy = abs(d01 - d00);
419 */
420
421 // optimized version of above
422 float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0]));
423 float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1]));
424
425 return std::max(dzdx, dzdy);
426 }
427
428 INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
429 {
430 if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
431 {
432 return (1.0f / (1 << 24));
433 }
434 else if (pState->depthFormat == R16_UNORM)
435 {
436 return (1.0f / (1 << 16));
437 }
438 else
439 {
440 SWR_ASSERT(pState->depthFormat == R32_FLOAT);
441
442 // for f32 depth, factor = 2^(exponent(max(abs(z) - 23)
443 float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
444 uint32_t zMaxInt = *(uint32_t*)&zMax;
445 zMaxInt &= 0x7f800000;
446 zMax = *(float*)&zMaxInt;
447
448 return zMax * (1.0f / (1 << 23));
449 }
450 }
451
452 INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
453 {
454 if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0)
455 {
456 return 0.0f;
457 }
458
459 float scale = pState->slopeScaledDepthBias;
460 if (scale != 0.0f)
461 {
462 scale *= ComputeMaxDepthSlope(pTri);
463 }
464
465 float bias = pState->depthBias;
466 if (!pState->depthBiasPreAdjusted)
467 {
468 bias *= ComputeBiasFactor(pState, pTri, z);
469 }
470 bias += scale;
471
472 if (pState->depthBiasClamp > 0.0f)
473 {
474 bias = std::min(bias, pState->depthBiasClamp);
475 }
476 else if (pState->depthBiasClamp < 0.0f)
477 {
478 bias = std::max(bias, pState->depthBiasClamp);
479 }
480
481 return bias;
482 }
483
484 // Prevent DCE by writing coverage mask from rasterizer to volatile
485 #if KNOB_ENABLE_TOSS_POINTS
486 __declspec(thread) volatile uint64_t gToss;
487 #endif
488
489 static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
490 // try to avoid _chkstk insertions; make this thread local
491 static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * KNOB_NUM_ATTRIBUTES * componentsPerAttrib];
492
493 INLINE
494 void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
495 {
496 edge.a = a;
497 edge.b = b;
498
499 // compute constant steps to adjacent quads
500 edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE));
501 edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE));
502
503 // compute constant steps to adjacent raster tiles
504 edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE));
505 edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE));
506
507 // compute quad offsets
508 const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
509 const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);
510
511 __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8);
512 __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8);
513 edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16);
514
515 // compute raster tile offsets
516 const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0);
517 const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0);
518
519 __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8);
520 __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8);
521 edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16);
522 }
523
524 INLINE
525 void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
526 {
527 ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge);
528 }
529
530 //////////////////////////////////////////////////////////////////////////
531 /// @brief Primary template definition used for partially specializing
532 /// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel
533 /// corner to sample position, and test for coverage
534 /// @tparam sampleCount: multisample count
535 template <typename NumSamplesT>
536 INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d (&vEdgeFix16)[7],
537 int32_t &mask0, int32_t &mask1, int32_t &mask2)
538 {
539 __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
540 // evaluate edge equations at the tile multisample bounding box
541 vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
542 vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
543 vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
544 mask0 = _mm256_movemask_pd(vSampleBboxTest0);
545 mask1 = _mm256_movemask_pd(vSampleBboxTest1);
546 mask2 = _mm256_movemask_pd(vSampleBboxTest2);
547 }
548
549 //////////////////////////////////////////////////////////////////////////
550 /// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
551 /// when only rasterizing a single coverage test point
552 template <>
553 INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d (&vEdgeFix16)[7],
554 int32_t &mask0, int32_t &mask1, int32_t &mask2)
555 {
556 mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
557 mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
558 mask2 = _mm256_movemask_pd(vEdgeFix16[2]);
559 }
560
561 //////////////////////////////////////////////////////////////////////////
562 /// @struct ComputeScissorEdges
563 /// @brief Primary template definition. Allows the function to be generically
564 /// called. When paired with below specializations, will result in an empty
565 /// inlined function if scissor is not enabled
566 /// @tparam RasterScissorEdgesT: is scissor enabled?
567 /// @tparam IsConservativeT: is conservative rast enabled?
568 /// @tparam RT: rasterizer traits
569 template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT>
570 struct ComputeScissorEdges
571 {
572 INLINE ComputeScissorEdges(const BBOX &triBBox, const BBOX &scissorBBox, const int32_t x, const int32_t y,
573 EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){};
574 };
575
576 //////////////////////////////////////////////////////////////////////////
577 /// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
578 /// specialization. Instantiated when conservative rast and scissor are enabled
579 template <typename RT>
580 struct ComputeScissorEdges<std::true_type, std::true_type, RT>
581 {
582 //////////////////////////////////////////////////////////////////////////
583 /// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
584 /// evaluate edge equations and offset them away from pixel center.
585 INLINE ComputeScissorEdges(const BBOX &triBBox, const BBOX &scissorBBox, const int32_t x, const int32_t y,
586 EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
587 {
588 // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
589 BBOX scissor;
590 scissor.left = std::max(triBBox.left, scissorBBox.left);
591 scissor.right = std::min(triBBox.right, scissorBBox.right);
592 scissor.top = std::max(triBBox.top, scissorBBox.top);
593 scissor.bottom = std::min(triBBox.bottom, scissorBBox.bottom);
594
595 POS topLeft{scissor.left, scissor.top};
596 POS bottomLeft{scissor.left, scissor.bottom};
597 POS topRight{scissor.right, scissor.top};
598 POS bottomRight{scissor.right, scissor.bottom};
599
600 // construct 4 scissor edges in ccw direction
601 ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
602 ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
603 ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
604 ComputeEdgeData(topRight, topLeft, rastEdges[6]);
605
606 vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.left)) + (rastEdges[3].b * (y - scissor.top)));
607 vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.left)) + (rastEdges[4].b * (y - scissor.bottom)));
608 vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.right)) + (rastEdges[5].b * (y - scissor.bottom)));
609 vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.right)) + (rastEdges[6].b * (y - scissor.top)));
610
611 // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing
612 adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]);
613 adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]);
614 adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]);
615 adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]);
616 }
617 };
618
619 //////////////////////////////////////////////////////////////////////////
620 /// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
621 /// specialization. Instantiated when scissor is enabled and conservative rast
622 /// is disabled.
623 template <typename RT>
624 struct ComputeScissorEdges<std::true_type, std::false_type, RT>
625 {
626 //////////////////////////////////////////////////////////////////////////
627 /// @brief Compute scissor edge vectors and evaluate edge equations
628 INLINE ComputeScissorEdges(const BBOX &, const BBOX &scissorBBox, const int32_t x, const int32_t y,
629 EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
630 {
631 const BBOX &scissor = scissorBBox;
632 POS topLeft{scissor.left, scissor.top};
633 POS bottomLeft{scissor.left, scissor.bottom};
634 POS topRight{scissor.right, scissor.top};
635 POS bottomRight{scissor.right, scissor.bottom};
636
637 // construct 4 scissor edges in ccw direction
638 ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
639 ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
640 ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
641 ComputeEdgeData(topRight, topLeft, rastEdges[6]);
642
643 vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.left)) + (rastEdges[3].b * (y - scissor.top)));
644 vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.left)) + (rastEdges[4].b * (y - scissor.bottom)));
645 vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.right)) + (rastEdges[5].b * (y - scissor.bottom)));
646 vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.right)) + (rastEdges[6].b * (y - scissor.top)));
647 }
648 };
649
650 //////////////////////////////////////////////////////////////////////////
651 /// @brief Primary function template for TrivialRejectTest. Should
652 /// never be called, but TemplateUnroller instantiates a few unused values,
653 /// so it calls a runtime assert instead of a static_assert.
654 template <typename ValidEdgeMaskT>
655 INLINE bool TrivialRejectTest(const int, const int, const int)
656 {
657 SWR_ASSERT(0, "Primary templated function should never be called");
658 return false;
659 };
660
661 //////////////////////////////////////////////////////////////////////////
662 /// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
663 /// and edge 1 for trivial coverage reject
664 template <>
665 INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int)
666 {
667 return (!(mask0 && mask1)) ? true : false;
668 };
669
670 //////////////////////////////////////////////////////////////////////////
671 /// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
672 /// and edge 2 for trivial coverage reject
673 template <>
674 INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2)
675 {
676 return (!(mask0 && mask2)) ? true : false;
677 };
678
679 //////////////////////////////////////////////////////////////////////////
680 /// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
681 /// and edge 2 for trivial coverage reject
682 template <>
683 INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2)
684 {
685 return (!(mask1 && mask2)) ? true : false;
686 };
687
688 //////////////////////////////////////////////////////////////////////////
689 /// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
690 /// primitive edges for trivial coverage reject
691 template <>
692 INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
693 {
694 return (!(mask0 && mask1 && mask2)) ? true : false;;
695 };
696
697 //////////////////////////////////////////////////////////////////////////
698 /// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
699 /// point, so return false and rasterize against conservative BBox
700 template <>
701 INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int)
702 {
703 return false;
704 };
705
706 //////////////////////////////////////////////////////////////////////////
707 /// @brief Primary function template for TrivialAcceptTest. Always returns
708 /// false, since it will only be called for degenerate tris, and as such
709 /// will never cover the entire raster tile
710 template <typename ValidEdgeMaskT>
711 INLINE bool TrivialAcceptTest(const int, const int, const int)
712 {
713 return false;
714 };
715
716 //////////////////////////////////////////////////////////////////////////
717 /// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all
718 /// edge masks for a fully covered raster tile
719 template <>
720 INLINE bool TrivialAcceptTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
721 {
722 return ((mask0 & mask1 & mask2) == 0xf);
723 };
724
//////////////////////////////////////////////////////////////////////////
/// @brief Rasterizes one binned triangle within a single macrotile. Performs
///        triangle setup (edge equations, barycentric coefs, perspective
///        attribs, Z), intersects the triangle bbox with the scissor and the
///        macrotile, then walks the covered raster tiles generating a coverage
///        mask per sample and calling the pixel backend for tiles that have
///        any covered samples.
/// @tparam RT - rasterizer traits; this function reads PrecisionT,
///         ValidEdgeMaskT, NumEdgesT, NumRasterSamplesT, IsConservativeT,
///         RasterizeScissorEdgesT and MT from it.
/// @param pDC - draw context
/// @param workerId - passed through unchanged to the pixel backend
/// @param macroTile - macrotile id, converted to x/y via MacroTileMgr::getTileIndices
/// @param pDesc - pointer to the TRIANGLE_WORK_DESC for this triangle
725 template <typename RT>
726 void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
727 {
728 const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
// toss point: skip all rasterization work (returns before any RDTSC bucket is started)
729 #if KNOB_ENABLE_TOSS_POINTS
730 if (KNOB_TOSS_BIN_TRIS)
731 {
732 return;
733 }
734 #endif
735 RDTSC_START(BERasterizeTriangle);
736
737 RDTSC_START(BETriangleSetup);
738 const API_STATE &state = GetApiState(pDC);
739 const SWR_RASTSTATE &rastState = state.rastState;
740 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
741
742 OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
743 triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
744
745 __m128 vX, vY, vZ, vRecipW;
746
747 // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
748 // eg: vX = [x0 x1 x2 dc]
// 16 floats total: x at +0, y at +4, z at +8, 1/w at +12
749 vX = _mm_load_ps(workDesc.pTriBuffer);
750 vY = _mm_load_ps(workDesc.pTriBuffer + 4);
751 vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
752 vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
753
754 // convert to fixed point
755 static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, "Rasterizer expects 16.8 fixed point precision");
756 __m128i vXi = fpToFixedPoint(vX);
757 __m128i vYi = fpToFixedPoint(vY);
758
759 // quantize floating point position to fixed point precision
760 // to prevent attribute creep around the triangle vertices
761 vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
762 vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
763
764 // triangle setup - A and B edge equation coefs
// computed twice: in float (vA/vB, used for the barycentric coefs below) and
// in fixed point (vAi/vBi, used for the edge evaluation during rasterization)
765 __m128 vA, vB;
766 triangleSetupAB(vX, vY, vA, vB);
767
768 __m128i vAi, vBi;
769 triangleSetupABInt(vXi, vYi, vAi, vBi);
770
771 // determinant
772 float det = calcDeterminantInt(vAi, vBi);
773
774 // Verts in Pixel Coordinate Space at this point
775 // Det > 0 = CW winding order
776 // Convert CW triangles to CCW
// done by negating both the float and the integer edge coefs plus the determinant
777 if (det > 0.0)
778 {
779 vA = _mm_mul_ps(vA, _mm_set1_ps(-1));
780 vB = _mm_mul_ps(vB, _mm_set1_ps(-1));
781 vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
782 vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
783 det = -det;
784 }
785
786 __m128 vC;
787 // Finish triangle setup - C edge coef
788 triangleSetupC(vX, vY, vA, vB, vC);
789
790 if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
791 {
792 // If we have degenerate edge(s) to rasterize, set I and J coefs
793 // to 0 for constant interpolation of attributes
794 triDesc.I[0] = 0.0f;
795 triDesc.I[1] = 0.0f;
796 triDesc.I[2] = 0.0f;
797 triDesc.J[0] = 0.0f;
798 triDesc.J[1] = 0.0f;
799 triDesc.J[2] = 0.0f;
800
801 // Degenerate triangles have no area
802 triDesc.recipDet = 0.0f;
803 }
804 else
805 {
806 // only extract coefs for 2 of the barycentrics; the 3rd can be
807 // determined from the barycentric equation:
808 // i + j + k = 1 <=> k = 1 - j - i
// I takes lane 1 and J takes lane 2 of the A, B and C coefficient vectors
809 _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
810 _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
811 _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
812 _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
813 _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
814 _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
815
816 // compute recipDet, used to calculate barycentric i and j in the backend
817 triDesc.recipDet = 1.0f/det;
818 }
819
// store 1/w as deltas relative to vertex 2 plus the vertex 2 value itself
// (same layout as triDesc.Z further down)
820 OSALIGNSIMD(float) oneOverW[4];
821 _mm_store_ps(oneOverW, vRecipW);
822 triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
823 triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
824 triDesc.OneOverW[2] = oneOverW[2];
825
826 // calculate perspective correct coefs per vertex attrib
827 float* pPerspAttribs = perspAttribsTLS;
828 float* pAttribs = workDesc.pAttribs;
829 triDesc.pPerspAttribs = pPerspAttribs;
830 triDesc.pAttribs = pAttribs;
831 float *pRecipW = workDesc.pTriBuffer + 12;
832 triDesc.pRecipW = pRecipW;
// broadcast the 1/w of vertex 0, 1 and 2 (pRecipW is advanced by one float per load)
833 __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
834 __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1);
835 __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1);
// each attribute is 3 vertices x 4 floats (12 floats); every vertex's vector is
// scaled by that vertex's 1/w and written to the perspAttribsTLS scratch buffer
836 for(uint32_t i = 0; i < workDesc.numAttribs; i++)
837 {
838 __m128 attribA = _mm_load_ps(pAttribs);
839 __m128 attribB = _mm_load_ps(pAttribs+=4);
840 __m128 attribC = _mm_load_ps(pAttribs+=4);
841 pAttribs+=4;
842
843 attribA = _mm_mul_ps(attribA, vOneOverWV0);
844 attribB = _mm_mul_ps(attribB, vOneOverWV1);
845 attribC = _mm_mul_ps(attribC, vOneOverWV2);
846
847 _mm_store_ps(pPerspAttribs, attribA);
848 _mm_store_ps(pPerspAttribs+=4, attribB);
849 _mm_store_ps(pPerspAttribs+=4, attribC);
850 pPerspAttribs+=4;
851 }
852
853 // compute bary Z
854 // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
// NOTE(review): the code below stores the deltas relative to vertex 2
// (Z[0] = z0 - z2, Z[1] = z1 - z2, Z[2] = z2), not vertex 0 as the formula
// above is written - confirm which vertex the backend treats as the base
855 OSALIGNSIMD(float) a[4];
856 _mm_store_ps(a, vZ);
857 triDesc.Z[0] = a[0] - a[2];
858 triDesc.Z[1] = a[1] - a[2];
859 triDesc.Z[2] = a[2];
860
861 // add depth bias
862 triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
863
864 // Calc bounding box of triangle
865 OSALIGNSIMD(BBOX) bbox;
866 calcBoundingBoxInt(vXi, vYi, bbox);
867
868 if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
869 {
870 // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
871 bbox.left--; bbox.right++; bbox.top--; bbox.bottom++;
872 SWR_ASSERT(state.scissorInFixedPoint.left >= 0 && state.scissorInFixedPoint.top >= 0,
873 "Conservative rast degenerate handling requires a valid scissor rect");
874 }
875
876 // Intersect with scissor/viewport
// NOTE(review): the "- 1" on right/bottom suggests the bbox right/bottom are
// exclusive while the scissor bounds are inclusive - confirm against calcBoundingBoxInt
877 OSALIGNSIMD(BBOX) intersect;
878 intersect.left = std::max(bbox.left, state.scissorInFixedPoint.left);
879 intersect.right = std::min(bbox.right - 1, state.scissorInFixedPoint.right);
880 intersect.top = std::max(bbox.top, state.scissorInFixedPoint.top);
881 intersect.bottom = std::min(bbox.bottom - 1, state.scissorInFixedPoint.bottom);
882
883 triDesc.triFlags = workDesc.triFlags;
884
885 // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
// macrotile extents are in fixed point with inclusive right/bottom (hence the - 1)
886 uint32_t macroX, macroY;
887 MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
888 int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
889 int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
890 int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
891 int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
892
893 intersect.left = std::max(intersect.left, macroBoxLeft);
894 intersect.top = std::max(intersect.top, macroBoxTop);
895 intersect.right = std::min(intersect.right, macroBoxRight);
896 intersect.bottom = std::min(intersect.bottom, macroBoxBottom);
897
898 SWR_ASSERT(intersect.left <= intersect.right && intersect.top <= intersect.bottom && intersect.left >= 0 && intersect.right >= 0 && intersect.top >= 0 && intersect.bottom >= 0);
899
900 RDTSC_STOP(BETriangleSetup, 0, pDC->drawId);
901
902 // update triangle desc
// convert the fixed point intersection to inclusive raster tile indices
903 uint32_t minTileX = intersect.left >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
904 uint32_t minTileY = intersect.top >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
905 uint32_t maxTileX = intersect.right >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
906 uint32_t maxTileY = intersect.bottom >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
907 uint32_t numTilesX = maxTileX - minTileX + 1;
908 uint32_t numTilesY = maxTileY - minTileY + 1;
909
// the counts are unsigned, so they are zero only when maxTile == minTile - 1
910 if (numTilesX == 0 || numTilesY == 0)
911 {
912 RDTSC_EVENT(BEEmptyTriangle, 1, 0);
913 RDTSC_STOP(BERasterizeTriangle, 1, 0);
914 return;
915 }
916
917 RDTSC_START(BEStepSetup);
918
919 // Step to pixel center of top-left pixel of the triangle bbox
920 // Align intersect bbox (top/left) to raster tile's (top/left).
921 int32_t x = AlignDown(intersect.left, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
922 int32_t y = AlignDown(intersect.top, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));
923
924 // convenience typedef
925 typedef typename RT::NumRasterSamplesT NumRasterSamplesT;
926
927 // single sample rasterization evaluates edges at pixel center,
928 // multisample evaluates edges UL pixel corner and steps to each sample position
929 if(std::is_same<NumRasterSamplesT, SingleSampleT>::value)
930 {
931 // Add 0.5, in fixed point, to offset to pixel center
932 x += (FIXED_POINT_SCALE / 2);
933 y += (FIXED_POINT_SCALE / 2);
934 }
935
936 __m128i vTopLeftX = _mm_set1_epi32(x);
937 __m128i vTopLeftY = _mm_set1_epi32(y);
938
939 // evaluate edge equations at top-left pixel using 64bit math
940 //
941 // line = Ax + By + C
942 // solving for C:
943 // C = -Ax - By
944 // we know x0 and y0 are on the line; plug them in:
945 // C = -Ax0 - By0
946 // plug C back into line equation:
947 // line = Ax + By - Ax0 - By0
948 // line = A(x - x0) + B(y - y0)
949 // dX = (x-x0), dY = (y-y0)
950 // so all this simplifies to
951 // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
952
953 __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
954 __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
955
956 // evaluate A(dx) and B(dY) for all points
// the 32-bit integers are widened to doubles before multiplying
957 __m256d vAipd = _mm256_cvtepi32_pd(vAi);
958 __m256d vBipd = _mm256_cvtepi32_pd(vBi);
959 __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
960 __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
961
962 __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
963 __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
964 __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
965
966 // apply any edge adjustments(top-left, crast, etc)
967 adjustEdgesFix16<RT>(vAi, vBi, vEdge);
968
969 // broadcast respective edge results to all lanes
// NOTE(review): the array holds 7 entries - presumably 3 triangle edges plus
// scissor edges filled in by ComputeScissorEdges; only entries below
// RT::NumEdgesT::value are read in the loops that follow - confirm
970 double* pEdge = (double*)&vEdge;
971 __m256d vEdgeFix16[7];
972 vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
973 vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
974 vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);
975
976 OSALIGNSIMD(int32_t) aAi[4], aBi[4];
977 _mm_store_si128((__m128i*)aAi, vAi);
978 _mm_store_si128((__m128i*)aBi, vBi);
979 EDGE rastEdges[RT::NumEdgesT::value];
980
981 // Compute and store triangle edge data
982 ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
983 ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
984 ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);
985
986 // Compute and store triangle edge data if scissor needs to be rasterized
987 ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
988 (bbox, state.scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
989
990 // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
991 // used for testing if entire raster tile is inside a triangle
992 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
993 {
994 vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
995 }
996
997 // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
998 // step sample positions to the raster tile bbox of multisample points
999 // min(xSamples),min(ySamples) ------ max(xSamples),min(ySamples)
1000 // | |
1001 // | |
1002 // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples)
// vEdgeTileBbox is only written when rasterizing more than one sample
1003 __m256d vEdgeTileBbox[3];
1004 if (NumRasterSamplesT::value > 1)
1005 {
1006 __m128i vTileSampleBBoxXh = RT::MT::TileSampleOffsetsX();
1007 __m128i vTileSampleBBoxYh = RT::MT::TileSampleOffsetsY();
1008
1009 __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
1010 __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
1011
1012 // step edge equation tests from Tile
1013 // used for testing if entire raster tile is inside a triangle
1014 for (uint32_t e = 0; e < 3; ++e)
1015 {
1016 __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
1017 __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
1018 vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
1019 }
1020 }
1021
1022 RDTSC_STOP(BEStepSetup, 0, pDC->drawId);
1023
1024 uint32_t tY = minTileY;
1025 uint32_t tX = minTileX;
1026 uint32_t maxY = maxTileY;
1027 uint32_t maxX = maxTileX;
1028
// renderBuffers tracks the current tile; currentRenderBufferRow keeps the
// pointers for the start of the current tile row (see StepRasterTileY)
1029 RenderOutputBuffers renderBuffers, currentRenderBufferRow;
1030 GetRenderHotTiles<RT::MT::numSamples>(pDC, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
1031 currentRenderBufferRow = renderBuffers;
1032
1033 // rasterize and generate coverage masks per sample
1034 for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
1035 {
// remember the edge values at the first tile of this row so the Y step
// at the bottom of the loop can restart from them
1036 __m256d vStartOfRowEdge[RT::NumEdgesT::value];
1037 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1038 {
1039 vStartOfRowEdge[e] = vEdgeFix16[e];
1040 }
1041
1042 for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
1043 {
1044 triDesc.anyCoveredSamples = 0;
1045
1046 // is the corner of the edge outside of the raster tile? (vEdge < 0)
1047 int mask0, mask1, mask2;
1048 UpdateEdgeMasks<NumRasterSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);
1049
1050 for (uint32_t sampleNum = 0; sampleNum < NumRasterSamplesT::value; sampleNum++)
1051 {
1052 // trivial reject, at least one edge has all 4 corners of raster tile outside
1053 bool trivialReject = TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);
1054
1055 if (!trivialReject)
1056 {
1057 // trivial accept mask
1058 triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
1059 if (TrivialAcceptTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2))
1060 {
// the mask was just set to all ones, so this marks every sample as covered
1061 triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
1062 // trivial accept, all 4 corners of all 3 edges are negative
1063 // i.e. raster tile completely inside triangle
1064 RDTSC_EVENT(BETrivialAccept, 1, 0);
1065 }
1066 else
1067 {
1068 __m256d vEdgeAtSample[RT::NumEdgesT::value];
1069 if(std::is_same<NumRasterSamplesT, SingleSampleT>::value)
1070 {
1071 // should get optimized out for single sample case (global value numbering or copy propagation)
1072 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1073 {
1074 vEdgeAtSample[e] = vEdgeFix16[e];
1075 }
1076 }
1077 else
1078 {
1079 __m128i vSampleOffsetXh = RT::MT::vXi(sampleNum);
1080 __m128i vSampleOffsetYh = RT::MT::vYi(sampleNum);
1081 __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
1082 __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
1083
1084 // step edge equation tests from UL tile corner to pixel sample position
1085 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1086 {
1087 __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
1088 __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
1089 vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
1090 vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
1091 }
1092 }
1093
// the mask enables only the lowest 64-bit element, so each maskstore
// writes just lane 0 of vEdgeAtSample[e] into startQuadEdges[e]
1094 double startQuadEdges[RT::NumEdgesT::value];
1095 const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
1096 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1097 {
1098 _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
1099 }
1100
1101 // not trivial accept or reject, must rasterize full tile
1102 RDTSC_START(BERasterizePartial);
1103 triDesc.coverageMask[sampleNum] = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdges, rastEdges);
1104 RDTSC_STOP(BERasterizePartial, 0, 0);
1105
1106 triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];
1107 }
1108 }
1109 else
1110 {
1111 // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything
1112 if(NumRasterSamplesT::value > 1)
1113 {
1114 triDesc.coverageMask[sampleNum] = 0;
1115 }
1116 RDTSC_EVENT(BETrivialReject, 1, 0);
1117 }
1118 }
1119
// with toss points compiled in, the "if(triDesc.anyCoveredSamples)" below is
// the else-branch of the KNOB_TOSS_RS test; otherwise it stands on its own
1120 #if KNOB_ENABLE_TOSS_POINTS
1121 if(KNOB_TOSS_RS)
1122 {
1123 gToss = triDesc.coverageMask[0];
1124 }
1125 else
1126 #endif
1127 if(triDesc.anyCoveredSamples)
1128 {
1129 // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
1130 // copy conservative coverage result to all samples
1131 if(RT::IsConservativeT::value)
1132 {
1133 auto copyCoverage = [&](int sample){triDesc.coverageMask[sample] = triDesc.coverageMask[0]; };
1134 UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
1135 }
1136
// the backend receives the tile origin in pixels (tile index << tile dim shift)
1137 RDTSC_START(BEPixelBackend);
1138 backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
1139 RDTSC_STOP(BEPixelBackend, 0, 0);
1140 }
1141
1142 // step to the next tile in X
1143 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1144 {
1145 vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
1146 }
1147 StepRasterTileX<RT>(state.psState.numRenderTargets, renderBuffers);
1148 }
1149
1150 // step to the next tile in Y
// restart from the edge values saved at the beginning of this row
1151 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1152 {
1153 vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
1154 }
1155 StepRasterTileY<RT>(state.psState.numRenderTargets, renderBuffers, currentRenderBufferRow);
1156 }
1157
1158 RDTSC_STOP(BERasterizeTriangle, 1, 0);
1159 }
1160
1161 void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
1162 {
1163 const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
1164 const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
1165 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1166
1167 bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0;
1168
1169 // load point vertex
1170 float x = *workDesc.pTriBuffer;
1171 float y = *(workDesc.pTriBuffer + 1);
1172 float z = *(workDesc.pTriBuffer + 2);
1173
1174 // create a copy of the triangle buffer to write our adjusted vertices to
1175 OSALIGNSIMD(float) newTriBuffer[4 * 4];
1176 TRIANGLE_WORK_DESC newWorkDesc = workDesc;
1177 newWorkDesc.pTriBuffer = &newTriBuffer[0];
1178
1179 // create a copy of the attrib buffer to write our adjusted attribs to
1180 OSALIGNSIMD(float) newAttribBuffer[4 * 3 * KNOB_NUM_ATTRIBUTES];
1181 newWorkDesc.pAttribs = &newAttribBuffer[0];
1182
1183 newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
1184 newWorkDesc.numAttribs = workDesc.numAttribs;
1185 newWorkDesc.triFlags = workDesc.triFlags;
1186
1187 // construct two tris by bloating point by point size
1188 float halfPointSize = workDesc.triFlags.pointSize * 0.5f;
1189 float lowerX = x - halfPointSize;
1190 float upperX = x + halfPointSize;
1191 float lowerY = y - halfPointSize;
1192 float upperY = y + halfPointSize;
1193
1194 // tri 0
1195 float *pBuf = &newTriBuffer[0];
1196 *pBuf++ = lowerX;
1197 *pBuf++ = lowerX;
1198 *pBuf++ = upperX;
1199 pBuf++;
1200 *pBuf++ = lowerY;
1201 *pBuf++ = upperY;
1202 *pBuf++ = upperY;
1203 pBuf++;
1204 _mm_store_ps(pBuf, _mm_set1_ps(z));
1205 _mm_store_ps(pBuf+=4, _mm_set1_ps(1.0f));
1206
1207 // setup triangle rasterizer function
1208 PFN_WORK_FUNC pfnTriRast;
1209 // for center sample pattern, all samples are at pixel center; calculate coverage
1210 // once at center and broadcast the results in the backend
1211 uint32_t sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
1212 // conservative rast not supported for points/lines
1213 pfnTriRast = GetRasterizerFunc(sampleCount, false, SWR_INPUT_COVERAGE_NONE, ALL_EDGES_VALID, (rastState.scissorEnable > 0));
1214
1215 // overwrite texcoords for point sprites
1216 if (isPointSpriteTexCoordEnabled)
1217 {
1218 // copy original attribs
1219 memcpy(&newAttribBuffer[0], workDesc.pAttribs, 4 * 3 * workDesc.numAttribs * sizeof(float));
1220 newWorkDesc.pAttribs = &newAttribBuffer[0];
1221
1222 // overwrite texcoord for point sprites
1223 uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
1224 DWORD texCoordAttrib = 0;
1225
1226 while (_BitScanForward(&texCoordAttrib, texCoordMask))
1227 {
1228 texCoordMask &= ~(1 << texCoordAttrib);
1229 __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
1230 if (rastState.pointSpriteTopOrigin)
1231 {
1232 pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
1233 pTexAttrib[1] = _mm_set_ps(1, 0, 1, 0);
1234 pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
1235 }
1236 else
1237 {
1238 pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
1239 pTexAttrib[1] = _mm_set_ps(1, 0, 0, 0);
1240 pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
1241 }
1242 }
1243 }
1244 else
1245 {
1246 // no texcoord overwrite, can reuse the attrib buffer from frontend
1247 newWorkDesc.pAttribs = workDesc.pAttribs;
1248 }
1249
1250 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
1251
1252 // tri 1
1253 pBuf = &newTriBuffer[0];
1254 *pBuf++ = lowerX;
1255 *pBuf++ = upperX;
1256 *pBuf++ = upperX;
1257 pBuf++;
1258 *pBuf++ = lowerY;
1259 *pBuf++ = upperY;
1260 *pBuf++ = lowerY;
1261 // z, w unchanged
1262
1263 if (isPointSpriteTexCoordEnabled)
1264 {
1265 uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
1266 DWORD texCoordAttrib = 0;
1267
1268 while (_BitScanForward(&texCoordAttrib, texCoordMask))
1269 {
1270 texCoordMask &= ~(1 << texCoordAttrib);
1271 __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
1272 if (rastState.pointSpriteTopOrigin)
1273 {
1274 pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
1275 pTexAttrib[1] = _mm_set_ps(1, 0, 1, 1);
1276 pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
1277
1278 }
1279 else
1280 {
1281 pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
1282 pTexAttrib[1] = _mm_set_ps(1, 0, 0, 1);
1283 pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
1284 }
1285 }
1286 }
1287
1288 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
1289 }
1290
1291 void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
1292 {
1293 #if KNOB_ENABLE_TOSS_POINTS
1294 if (KNOB_TOSS_BIN_TRIS)
1295 {
1296 return;
1297 }
1298 #endif
1299
1300 const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
1301 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
1302
1303 // map x,y relative offsets from start of raster tile to bit position in
1304 // coverage mask for the point
1305 static const uint32_t coverageMap[8][8] = {
1306 { 0, 1, 4, 5, 8, 9, 12, 13 },
1307 { 2, 3, 6, 7, 10, 11, 14, 15 },
1308 { 16, 17, 20, 21, 24, 25, 28, 29 },
1309 { 18, 19, 22, 23, 26, 27, 30, 31 },
1310 { 32, 33, 36, 37, 40, 41, 44, 45 },
1311 { 34, 35, 38, 39, 42, 43, 46, 47 },
1312 { 48, 49, 52, 53, 56, 57, 60, 61 },
1313 { 50, 51, 54, 55, 58, 59, 62, 63 }
1314 };
1315
1316 OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
1317
1318 // pull point information from triangle buffer
1319 // @todo use structs for readability
1320 uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer;
1321 uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1);
1322 float z = *(workDesc.pTriBuffer + 2);
1323
1324 // construct triangle descriptor for point
1325 // no interpolation, set up i,j for constant interpolation of z and attribs
1326 // @todo implement an optimized backend that doesn't require triangle information
1327
1328 // compute coverage mask from x,y packed into the coverageMask flag
1329 // mask indices by the maximum valid index for x/y of coveragemap.
1330 uint32_t tX = workDesc.triFlags.coverageMask & 0x7;
1331 uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7;
1332 // todo: multisample points?
1333 triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX];
1334
1335 // no persp divide needed for points
1336 triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
1337 triDesc.triFlags = workDesc.triFlags;
1338 triDesc.recipDet = 1.0f;
1339 triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f;
1340 triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f;
1341 triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f;
1342 triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z;
1343
1344 RenderOutputBuffers renderBuffers;
1345 GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT,
1346 renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
1347
1348 RDTSC_START(BEPixelBackend);
1349 backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
1350 RDTSC_STOP(BEPixelBackend, 0, 0);
1351 }
1352
1353 // Get pointers to hot tile memory for color RT, depth, stencil
1354 template <uint32_t numSamples>
1355 void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex)
1356 {
1357 const API_STATE& state = GetApiState(pDC);
1358 SWR_CONTEXT *pContext = pDC->pContext;
1359
1360 uint32_t mx, my;
1361 MacroTileMgr::getTileIndices(macroID, mx, my);
1362 tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
1363 tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;
1364
1365 // compute tile offset for active hottile buffers
1366 const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
1367 uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1368 offset*=numSamples;
1369
1370 unsigned long rtSlot = 0;
1371 uint32_t colorHottileEnableMask = state.colorHottileEnable;
1372 while(_BitScanForward(&rtSlot, colorHottileEnableMask))
1373 {
1374 HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true,
1375 numSamples, renderTargetArrayIndex);
1376 pColor->state = HOTTILE_DIRTY;
1377 renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
1378
1379 colorHottileEnableMask &= ~(1 << rtSlot);
1380 }
1381 if(state.depthHottileEnable)
1382 {
1383 const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
1384 uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1385 offset*=numSamples;
1386 HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true,
1387 numSamples, renderTargetArrayIndex);
1388 pDepth->state = HOTTILE_DIRTY;
1389 SWR_ASSERT(pDepth->pBuffer != nullptr);
1390 renderBuffers.pDepth = pDepth->pBuffer + offset;
1391 }
1392 if(state.stencilHottileEnable)
1393 {
1394 const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
1395 uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1396 offset*=numSamples;
1397 HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true,
1398 numSamples, renderTargetArrayIndex);
1399 pStencil->state = HOTTILE_DIRTY;
1400 SWR_ASSERT(pStencil->pBuffer != nullptr);
1401 renderBuffers.pStencil = pStencil->pBuffer + offset;
1402 }
1403 }
1404
1405 template <typename RT>
1406 INLINE void StepRasterTileX(uint32_t NumRT, RenderOutputBuffers &buffers)
1407 {
1408 for(uint32_t rt = 0; rt < NumRT; ++rt)
1409 {
1410 buffers.pColor[rt] += RT::colorRasterTileStep;
1411 }
1412
1413 buffers.pDepth += RT::depthRasterTileStep;
1414 buffers.pStencil += RT::stencilRasterTileStep;
1415 }
1416
1417 template <typename RT>
1418 INLINE void StepRasterTileY(uint32_t NumRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow)
1419 {
1420 for(uint32_t rt = 0; rt < NumRT; ++rt)
1421 {
1422 startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
1423 buffers.pColor[rt] = startBufferRow.pColor[rt];
1424 }
1425 startBufferRow.pDepth += RT::depthRasterTileRowStep;
1426 buffers.pDepth = startBufferRow.pDepth;
1427
1428 startBufferRow.pStencil += RT::stencilRasterTileRowStep;
1429 buffers.pStencil = startBufferRow.pStencil;
1430 }
1431
1432 void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
1433 {
1434 const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pData);
1435 #if KNOB_ENABLE_TOSS_POINTS
1436 if (KNOB_TOSS_BIN_TRIS)
1437 {
1438 return;
1439 }
1440 #endif
1441
1442 // bloat line to two tris and call the triangle rasterizer twice
1443 RDTSC_START(BERasterizeLine);
1444
1445 const API_STATE &state = GetApiState(pDC);
1446 const SWR_RASTSTATE &rastState = state.rastState;
1447
1448 // macrotile dimensioning
1449 uint32_t macroX, macroY;
1450 MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
1451 int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
1452 int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
1453 int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
1454 int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
1455
1456 // create a copy of the triangle buffer to write our adjusted vertices to
1457 OSALIGNSIMD(float) newTriBuffer[4 * 4];
1458 TRIANGLE_WORK_DESC newWorkDesc = workDesc;
1459 newWorkDesc.pTriBuffer = &newTriBuffer[0];
1460
1461 // create a copy of the attrib buffer to write our adjusted attribs to
1462 OSALIGNSIMD(float) newAttribBuffer[4 * 3 * KNOB_NUM_ATTRIBUTES];
1463 newWorkDesc.pAttribs = &newAttribBuffer[0];
1464
1465 const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f);
1466 const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f);
1467
1468 __m128 vX, vY, vZ, vRecipW;
1469
1470 vX = _mm_load_ps(workDesc.pTriBuffer);
1471 vY = _mm_load_ps(workDesc.pTriBuffer + 4);
1472 vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
1473 vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
1474
1475 // triangle 0
1476 // v0,v1 -> v0,v0,v1
1477 __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0));
1478 __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0));
1479 __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0));
1480 __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0));
1481
1482 __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth);
1483 __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0);
1484 if (workDesc.triFlags.yMajor)
1485 {
1486 vXa = _mm_add_ps(vAdjust, vXa);
1487 }
1488 else
1489 {
1490 vYa = _mm_add_ps(vAdjust, vYa);
1491 }
1492
1493 // Store triangle description for rasterizer
1494 _mm_store_ps((float*)&newTriBuffer[0], vXa);
1495 _mm_store_ps((float*)&newTriBuffer[4], vYa);
1496 _mm_store_ps((float*)&newTriBuffer[8], vZa);
1497 _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
1498
1499 // binner bins 3 edges for lines as v0, v1, v1
1500 // tri0 needs v0, v0, v1
1501 for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
1502 {
1503 __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a*12 + 0]);
1504 __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a*12 + 4]);
1505
1506 _mm_store_ps((float*)&newAttribBuffer[a*12 + 0], vAttrib0);
1507 _mm_store_ps((float*)&newAttribBuffer[a*12 + 4], vAttrib0);
1508 _mm_store_ps((float*)&newAttribBuffer[a*12 + 8], vAttrib1);
1509 }
1510
1511 // Store user clip distances for triangle 0
1512 float newClipBuffer[3 * 8];
1513 uint32_t numClipDist = _mm_popcnt_u32(state.rastState.clipDistanceMask);
1514 if (numClipDist)
1515 {
1516 newWorkDesc.pUserClipBuffer = newClipBuffer;
1517
1518 float* pOldBuffer = workDesc.pUserClipBuffer;
1519 float* pNewBuffer = newClipBuffer;
1520 for (uint32_t i = 0; i < numClipDist; ++i)
1521 {
1522 // read barycentric coeffs from binner
1523 float a = *(pOldBuffer++);
1524 float b = *(pOldBuffer++);
1525
1526 // reconstruct original clip distance at vertices
1527 float c0 = a + b;
1528 float c1 = b;
1529
1530 // construct triangle barycentrics
1531 *(pNewBuffer++) = c0 - c1;
1532 *(pNewBuffer++) = c0 - c1;
1533 *(pNewBuffer++) = c1;
1534 }
1535 }
1536
1537 // setup triangle rasterizer function
1538 PFN_WORK_FUNC pfnTriRast;
1539 uint32_t sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
1540 // conservative rast not supported for points/lines
1541 pfnTriRast = GetRasterizerFunc(sampleCount, false, SWR_INPUT_COVERAGE_NONE, ALL_EDGES_VALID, (rastState.scissorEnable > 0));
1542
1543 // make sure this macrotile intersects the triangle
1544 __m128i vXai = fpToFixedPoint(vXa);
1545 __m128i vYai = fpToFixedPoint(vYa);
1546 OSALIGNSIMD(BBOX) bboxA;
1547 calcBoundingBoxInt(vXai, vYai, bboxA);
1548
1549 if (!(bboxA.left > macroBoxRight ||
1550 bboxA.left > state.scissorInFixedPoint.right ||
1551 bboxA.right - 1 < macroBoxLeft ||
1552 bboxA.right - 1 < state.scissorInFixedPoint.left ||
1553 bboxA.top > macroBoxBottom ||
1554 bboxA.top > state.scissorInFixedPoint.bottom ||
1555 bboxA.bottom - 1 < macroBoxTop ||
1556 bboxA.bottom - 1 < state.scissorInFixedPoint.top)) {
1557 // rasterize triangle
1558 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
1559 }
1560
1561 // triangle 1
1562 // v0,v1 -> v1,v1,v0
1563 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1));
1564 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1));
1565 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1));
1566 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1));
1567
1568 vAdjust = _mm_mul_ps(vLineWidth, vBloat1);
1569 if (workDesc.triFlags.yMajor)
1570 {
1571 vXa = _mm_add_ps(vAdjust, vXa);
1572 }
1573 else
1574 {
1575 vYa = _mm_add_ps(vAdjust, vYa);
1576 }
1577
1578 // Store triangle description for rasterizer
1579 _mm_store_ps((float*)&newTriBuffer[0], vXa);
1580 _mm_store_ps((float*)&newTriBuffer[4], vYa);
1581 _mm_store_ps((float*)&newTriBuffer[8], vZa);
1582 _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
1583
1584 // binner bins 3 edges for lines as v0, v1, v1
1585 // tri1 needs v1, v1, v0
1586 for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
1587 {
1588 __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
1589 __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
1590
1591 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1);
1592 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1);
1593 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0);
1594 }
1595
1596 // store user clip distance for triangle 1
1597 if (numClipDist)
1598 {
1599 float* pOldBuffer = workDesc.pUserClipBuffer;
1600 float* pNewBuffer = newClipBuffer;
1601 for (uint32_t i = 0; i < numClipDist; ++i)
1602 {
1603 // read barycentric coeffs from binner
1604 float a = *(pOldBuffer++);
1605 float b = *(pOldBuffer++);
1606
1607 // reconstruct original clip distance at vertices
1608 float c0 = a + b;
1609 float c1 = b;
1610
1611 // construct triangle barycentrics
1612 *(pNewBuffer++) = c1 - c0;
1613 *(pNewBuffer++) = c1 - c0;
1614 *(pNewBuffer++) = c0;
1615 }
1616 }
1617
1618 vXai = fpToFixedPoint(vXa);
1619 vYai = fpToFixedPoint(vYa);
1620 calcBoundingBoxInt(vXai, vYai, bboxA);
1621
1622 if (!(bboxA.left > macroBoxRight ||
1623 bboxA.left > state.scissorInFixedPoint.right ||
1624 bboxA.right - 1 < macroBoxLeft ||
1625 bboxA.right - 1 < state.scissorInFixedPoint.left ||
1626 bboxA.top > macroBoxBottom ||
1627 bboxA.top > state.scissorInFixedPoint.bottom ||
1628 bboxA.bottom - 1 < macroBoxTop ||
1629 bboxA.bottom - 1 < state.scissorInFixedPoint.top)) {
1630 // rasterize triangle
1631 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
1632 }
1633
1634 RDTSC_STOP(BERasterizeLine, 1, 0);
1635 }
1636
1637 struct RasterizerChooser
1638 {
1639 typedef PFN_WORK_FUNC FuncType;
1640
1641 template <typename... ArgsB>
1642 static FuncType GetFunc()
1643 {
1644 return RasterizeTriangle<RasterizerTraits<ArgsB...>>;
1645 }
1646 };
1647
1648 // Selector for correct templated RasterizeTriangle function
1649 PFN_WORK_FUNC GetRasterizerFunc(
1650 uint32_t numSamples,
1651 bool IsConservative,
1652 uint32_t InputCoverage,
1653 uint32_t EdgeEnable,
1654 bool RasterizeScissorEdges
1655 )
1656 {
1657 return TemplateArgUnroller<RasterizerChooser>::GetFunc(
1658 IntArg<SWR_MULTISAMPLE_1X,SWR_MULTISAMPLE_TYPE_COUNT-1>{numSamples},
1659 IsConservative,
1660 IntArg<SWR_INPUT_COVERAGE_NONE, SWR_INPUT_COVERAGE_COUNT-1>{InputCoverage},
1661 IntArg<0, VALID_TRI_EDGE_COUNT-1>{EdgeEnable},
1662 RasterizeScissorEdges);
1663 }