swr/rast: more flexible max attribute slots
[mesa.git] src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file rasterizer.cpp
24 *
25 * @brief Implementation for the rasterizer.
26 *
27 ******************************************************************************/
28
29 #include <vector>
30 #include <algorithm>
31
32 #include "rasterizer.h"
33 #include "rdtsc_core.h"
34 #include "backend.h"
35 #include "utils.h"
36 #include "frontend.h"
37 #include "tilemgr.h"
38 #include "memory/tilingtraits.h"
39
40 template <uint32_t numSamples = 1>
41 void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex);
42 template <typename RT>
43 void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers);
44 template <typename RT>
45 void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow);
46
47 #define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
48 const __m256d gMaskToVecpd[] =
49 {
50 MASKTOVEC(0, 0, 0, 0),
51 MASKTOVEC(0, 0, 0, 1),
52 MASKTOVEC(0, 0, 1, 0),
53 MASKTOVEC(0, 0, 1, 1),
54 MASKTOVEC(0, 1, 0, 0),
55 MASKTOVEC(0, 1, 0, 1),
56 MASKTOVEC(0, 1, 1, 0),
57 MASKTOVEC(0, 1, 1, 1),
58 MASKTOVEC(1, 0, 0, 0),
59 MASKTOVEC(1, 0, 0, 1),
60 MASKTOVEC(1, 0, 1, 0),
61 MASKTOVEC(1, 0, 1, 1),
62 MASKTOVEC(1, 1, 0, 0),
63 MASKTOVEC(1, 1, 0, 1),
64 MASKTOVEC(1, 1, 1, 0),
65 MASKTOVEC(1, 1, 1, 1),
66 };
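// Illustrative helper (an assumption added for documentation purposes, not part of the
// original SWR sources): shows how a 4-bit _mm256_movemask_pd result selects lanes
// through gMaskToVecpd when paired with _mm256_blendv_pd.
INLINE __m256d SelectLanesByMask(const __m256d &vA, const __m256d &vB, int msk)
{
    // lanes whose bit is set in msk are taken from vB, the remaining lanes from vA
    return _mm256_blendv_pd(vA, vB, gMaskToVecpd[msk & 0xf]);
}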
67
68 struct POS
69 {
70 int32_t x, y;
71 };
72
73 struct EDGE
74 {
75 double a, b; // a, b edge coefficients in fix8
76 double stepQuadX; // step to adjacent horizontal quad in fix16
77 double stepQuadY; // step to adjacent vertical quad in fix16
78 double stepRasterTileX; // step to adjacent horizontal raster tile in fix16
79 double stepRasterTileY; // step to adjacent vertical raster tile in fix16
80
81 __m256d vQuadOffsets; // offsets for 4 samples of a quad
82 __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
83 };
84
85 //////////////////////////////////////////////////////////////////////////
86 /// @brief Rasterize a raster tile partially covered by the triangle
87 /// @param pDC - draw context
88 /// @param startEdges - edge equations (Ax + By + C) for each edge, evaluated at the starting sample position of the raster tile
89 /// @param pRastEdges - A & B coefs plus precomputed quad/raster-tile steps for each edge.
90 ///            Used to step between quads when sweeping over the raster tile.
91 template<uint32_t NumEdges, typename EdgeMaskT>
92 INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges)
93 {
94 uint64_t coverageMask = 0;
95
96 __m256d vEdges[NumEdges];
97 __m256d vStepX[NumEdges];
98 __m256d vStepY[NumEdges];
99
100 for (uint32_t e = 0; e < NumEdges; ++e)
101 {
102 // Step to the pixel sample locations of the 1st quad
103 vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);
104
105 // compute step to next quad (mul by 2 in x and y direction)
106 vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
107 vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
108 }
109
110 // fast unrolled version for 8x8 tile
111 #if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
112 int edgeMask[NumEdges];
113 uint64_t mask;
114
115 auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);};
116 auto update_lambda = [&](int e){mask &= edgeMask[e];};
117 auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);};
118 auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);};
119 auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);};
120
121 // evaluate which pixels in the quad are covered
122 #define EVAL \
123 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
124
125 // update coverage mask
126 // if edge 0 is degenerate and will be skipped, init the mask to all covered
127 #define UPDATE_MASK(bit) \
128 if(std::is_same<EdgeMaskT, E1E2ValidT>::value || std::is_same<EdgeMaskT, NoEdgesValidT>::value){\
129 mask = 0xf;\
130 }\
131 else{\
132 mask = edgeMask[0]; \
133 }\
134 UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
135 coverageMask |= (mask << bit);
136
137 // step in the +x direction to the next quad
138 #define INCX \
139 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
140
141 // step in the +y direction to the next quad
142 #define INCY \
143 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
144
145 // step in the -x direction to the next quad
146 #define DECX \
147 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
148
149 // sweep 2x2 quad back and forth through the raster tile,
150 // computing coverage masks for the entire tile
151
152 // raster tile
153 // 0 1 2 3 4 5 6 7
154 // x x
155 // x x ------------------>
156 // x x |
157 // <-----------------x x V
158 // ..
159
160 // row 0
161 EVAL;
162 UPDATE_MASK(0);
163 INCX;
164 EVAL;
165 UPDATE_MASK(4);
166 INCX;
167 EVAL;
168 UPDATE_MASK(8);
169 INCX;
170 EVAL;
171 UPDATE_MASK(12);
172 INCY;
173
174 // row 1
175 EVAL;
176 UPDATE_MASK(28);
177 DECX;
178 EVAL;
179 UPDATE_MASK(24);
180 DECX;
181 EVAL;
182 UPDATE_MASK(20);
183 DECX;
184 EVAL;
185 UPDATE_MASK(16);
186 INCY;
187
188 // row 2
189 EVAL;
190 UPDATE_MASK(32);
191 INCX;
192 EVAL;
193 UPDATE_MASK(36);
194 INCX;
195 EVAL;
196 UPDATE_MASK(40);
197 INCX;
198 EVAL;
199 UPDATE_MASK(44);
200 INCY;
201
202 // row 3
203 EVAL;
204 UPDATE_MASK(60);
205 DECX;
206 EVAL;
207 UPDATE_MASK(56);
208 DECX;
209 EVAL;
210 UPDATE_MASK(52);
211 DECX;
212 EVAL;
213 UPDATE_MASK(48);
214 #else
215 uint32_t bit = 0;
216 for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y)
217 {
218 __m256d vStartOfRowEdge[NumEdges];
219 for (uint32_t e = 0; e < NumEdges; ++e)
220 {
221 vStartOfRowEdge[e] = vEdges[e];
222 }
223
224 for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x)
225 {
226 int edgeMask[NumEdges];
227 for (uint32_t e = 0; e < NumEdges; ++e)
228 {
229 edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
230 }
231
232 uint64_t mask = edgeMask[0];
233 for (uint32_t e = 1; e < NumEdges; ++e)
234 {
235 mask &= edgeMask[e];
236 }
237 coverageMask |= (mask << bit);
238
239 // step to the next pixel in the x
240 for (uint32_t e = 0; e < NumEdges; ++e)
241 {
242 vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
243 }
244 bit+=4;
245 }
246
247 // step to the next row
248 for (uint32_t e = 0; e < NumEdges; ++e)
249 {
250 vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
251 }
252 }
253 #endif
254 return coverageMask;
255
256 }
257 // Top-left rule:
258 // Top: if an edge is horizontal and lies above the other edges in tri pixel space, it is a 'top' edge
259 // Left: if an edge is not horizontal and lies on the left side of the triangle in pixel space, it is a 'left' edge
260 // Top-left: a sample lying exactly on an edge is covered only if that edge is a top or left edge.
261 // Out (not a top edge):  !(horizontal && above) = !horizontal || below
262 // Out (not a left edge): !(!horizontal && left) = horizontal || right
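// Illustrative scalar reference (an assumption, not part of the original SWR sources) of
// the same adjustment performed vectorized below; only samples lying exactly on the edge
// (edge == 0) change coverage as a result of the -1 nudge.
INLINE double adjustTopLeftRuleScalarSketch(int32_t a, int32_t b, double edge)
{
    // non-top, non-left edge: nudge on-edge samples off of the edge
    if ((a < 0) || (a == 0 && b < 0))
    {
        edge -= 1.0;
    }
    return edge;
}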
263 INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge)
264 {
265 // if vA < 0, vC--
266 // if vA == 0 && vB < 0, vC--
267
268 __m256d vEdgeOut = vEdge;
269 __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));
270
271 // if vA < 0 (line is not horizontal and below)
272 int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));
273
274 // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
275 __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
276 int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
277 msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));
278
279 // if either of these are true and we're on the line (edge == 0), bump it outside the line
280 vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
281 }
282
283 //////////////////////////////////////////////////////////////////////////
284 /// @brief calculates difference in precision between the result of manh
285 /// calculation and the edge precision, based on compile time trait values
286 template<typename RT>
287 constexpr int64_t ManhToEdgePrecisionAdjust()
288 {
289 static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
290 "Inadequate precision of result of manh calculation ");
291 return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) - RT::EdgePrecisionT::BitsT::value);
292 }
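// For example (hypothetical trait values, for illustration only): with 8 fractional bits
// of vertex precision, 9 fractional bits of conservative precision and x.16 edge precision,
// the manh product carries 8 + 9 = 17 fractional bits and must be shifted right by
// (8 + 9) - 16 = 1 bit to land in edge precision.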
293
294 //////////////////////////////////////////////////////////////////////////
295 /// @struct adjustEdgeConservative
296 /// @brief Primary template definition used for partially specializing
297 /// the adjustEdgeConservative function. This struct should never
298 /// be instantiated.
299 /// @tparam RT: rasterizer traits
300 /// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
301 template <typename RT, typename ConservativeEdgeOffsetT>
302 struct adjustEdgeConservative
303 {
304 //////////////////////////////////////////////////////////////////////////
305 /// @brief Performs calculations to adjust each edge of a triangle away
306 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
307 /// direction.
308 ///
309 /// Uncertainty regions arise from fixed point rounding, which
310 /// can snap a vertex +/- by min fixed point value.
311 /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
312 /// This allows the rasterizer to test for coverage only at the pixel center,
313 /// instead of having to test individual pixel corners for conservative coverage
314 INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
315 {
316 // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away
317 // from the pixel center (in the direction of the edge normal A/B)
318
319 // edge = Ax + By + C - (manh * e)
320 // manh = manhattan distance = abs(A) + abs(B)
321 // e = absolute rounding error from snapping from float to fixed point precision
322
323 // 'fixed point' multiply (in double to be avx1 friendly)
324 // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
325 __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
326 __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
327 _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));
328
329 static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
330 "Inadequate precision of result of manh calculation ");
331
332 // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision
333 // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right
334 manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5));
335
336 // move the edge away from the pixel center by the required conservative precision + 1/2 pixel
337 // this allows the rasterizer to do a single conservative coverage test to see if the primitive
338 // intersects the pixel at all
339 vEdge = _mm256_sub_pd(vEdge, manh);
340 };
341 };
342
343 //////////////////////////////////////////////////////////////////////////
344 /// @brief adjustEdgeConservative specialization where no edge offset is needed
345 template <typename RT>
346 struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>>
347 {
348 INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {};
349 };
350
351 //////////////////////////////////////////////////////////////////////////
352 /// @brief calculates the distance a degenerate BBox needs to be adjusted
353 /// for conservative rast based on compile time trait values
354 template<typename RT>
355 constexpr int64_t ConservativeScissorOffset()
356 {
357 static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, "Rasterizer precision > conservative precision");
358 // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges
359 typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1> DegenerateEdgeOffsetT;
360 // 1/2 pixel edge offset + conservative offset - degenerateTriangle
361 return RT::ConservativeEdgeOffsetT::value - (DegenerateEdgeOffsetT::value << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
362 }
363
364 //////////////////////////////////////////////////////////////////////////
365 /// @brief Performs calculations to adjust a vector of evaluated edges out
366 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
367 /// direction.
368 template <typename RT>
369 INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge)
370 {
371 int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
372 int64_t manh = ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> ManhToEdgePrecisionAdjust<RT>();
373 vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh));
374 };
375
376 //////////////////////////////////////////////////////////////////////////
377 /// @brief Performs calculations to adjust a scalar evaluated edge out
378 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
379 /// direction.
380 template <typename RT, typename OffsetT>
381 INLINE double adjustScalarEdge(const double a, const double b, const double Edge)
382 {
383 int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
384 int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>();
385 return (Edge - manh);
386 };
387
388 //////////////////////////////////////////////////////////////////////////
389 /// @brief Perform any needed adjustments to evaluated triangle edges
390 template <typename RT, typename EdgeOffsetT>
391 struct adjustEdgesFix16
392 {
393 INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
394 {
395 static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
396 "Edge equation expected to be in x.16 fixed point");
397
398 static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled");
399
400 // need to apply any edge offsets before applying the top-left rule
401 adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge);
402
403 adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
404 }
405 };
406
407 //////////////////////////////////////////////////////////////////////////
408 /// @brief Perform top left adjustments to evaluated triangle edges
409 template <typename RT>
410 struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>>
411 {
412 INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
413 {
414 adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
415 }
416 };
417
418 // max(abs(dz/dx), abs(dz/dy))
419 INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
420 {
421 /*
422 // evaluate i,j at (0,0)
423 float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
424 float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
425
426 // evaluate i,j at (1,0)
427 float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
428 float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
429
430 // compute dz/dx
431 float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
432 float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
433 float dzdx = abs(d10 - d00);
434
435 // evaluate i,j at (0,1)
436 float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
437 float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
438
439 float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
440 float dzdy = abs(d01 - d00);
441 */
442
443 // optimized version of above
444 float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0]));
445 float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1]));
446
447 return std::max(dzdx, dzdy);
448 }
449
450 INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
451 {
452 if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
453 {
454 return (1.0f / (1 << 24));
455 }
456 else if (pState->depthFormat == R16_UNORM)
457 {
458 return (1.0f / (1 << 16));
459 }
460 else
461 {
462 SWR_ASSERT(pState->depthFormat == R32_FLOAT);
463
464 // for f32 depth, factor = 2^(exponent(max(abs(z))) - 23)
465 float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
466 uint32_t zMaxInt = *(uint32_t*)&zMax;
467 zMaxInt &= 0x7f800000;
468 zMax = *(float*)&zMaxInt;
469
470 return zMax * (1.0f / (1 << 23));
471 }
472 }
473
474 INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
475 {
476 if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0)
477 {
478 return 0.0f;
479 }
480
481 float scale = pState->slopeScaledDepthBias;
482 if (scale != 0.0f)
483 {
484 scale *= ComputeMaxDepthSlope(pTri);
485 }
486
487 float bias = pState->depthBias;
488 if (!pState->depthBiasPreAdjusted)
489 {
490 bias *= ComputeBiasFactor(pState, pTri, z);
491 }
492 bias += scale;
493
494 if (pState->depthBiasClamp > 0.0f)
495 {
496 bias = std::min(bias, pState->depthBiasClamp);
497 }
498 else if (pState->depthBiasClamp < 0.0f)
499 {
500 bias = std::max(bias, pState->depthBiasClamp);
501 }
502
503 return bias;
504 }
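// Worked example (hypothetical state values, for illustration only): with an
// R24_UNORM_X8_TYPELESS depth buffer, depthBias = 2, slopeScaledDepthBias = 0 and
// depthBiasPreAdjusted = false, the applied bias is 2 * (1.0f / (1 << 24)), i.e. two
// representable depth steps, clamped against depthBiasClamp when that is non-zero.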
505
506 // Prevent DCE by writing coverage mask from rasterizer to volatile
507 #if KNOB_ENABLE_TOSS_POINTS
508 __declspec(thread) volatile uint64_t gToss;
509 #endif
510
511 static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
512 // try to avoid _chkstk insertions; make this thread local
513 static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib];
514
515 INLINE
516 void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
517 {
518 edge.a = a;
519 edge.b = b;
520
521 // compute constant steps to adjacent quads
522 edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE));
523 edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE));
524
525 // compute constant steps to adjacent raster tiles
526 edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE));
527 edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE));
528
529 // compute quad offsets
530 const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
531 const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);
532
533 __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8);
534 __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8);
535 edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16);
536
537 // compute raster tile offsets
538 const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0);
539 const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0);
540
541 __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8);
542 __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8);
543 edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16);
544 }
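// Note on lane layout (derived from the _mm256_set_pd calls above, which list elements
// from lane 3 down to lane 0): lanes 0..3 of vQuadOffsets hold the edge deltas for the
// quad pixels (0,0), (1,0), (0,1), (1,1), and vRasterTileOffsets likewise holds the
// deltas for the UL, UR, LL, LR corners of the raster tile.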
545
546 INLINE
547 void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
548 {
549 ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge);
550 }
551
552 //////////////////////////////////////////////////////////////////////////
553 /// @brief Primary template definition used for partially specializing
554 /// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel
555 /// corner to sample position, and test for coverage
556 /// @tparam sampleCount: multisample count
557 template <typename NumSamplesT>
558 INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
559 int32_t &mask0, int32_t &mask1, int32_t &mask2)
560 {
561 __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
562 // evaluate edge equations at the tile multisample bounding box
563 vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
564 vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
565 vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
566 mask0 = _mm256_movemask_pd(vSampleBboxTest0);
567 mask1 = _mm256_movemask_pd(vSampleBboxTest1);
568 mask2 = _mm256_movemask_pd(vSampleBboxTest2);
569 }
570
571 //////////////////////////////////////////////////////////////////////////
572 /// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
573 /// when only rasterizing a single coverage test point
574 template <>
575 INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* vEdgeFix16,
576 int32_t &mask0, int32_t &mask1, int32_t &mask2)
577 {
578 mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
579 mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
580 mask2 = _mm256_movemask_pd(vEdgeFix16[2]);
581 }
582
583 //////////////////////////////////////////////////////////////////////////
584 /// @struct ComputeScissorEdges
585 /// @brief Primary template definition. Allows the function to be generically
586 /// called. When paired with below specializations, will result in an empty
587 /// inlined function if scissor is not enabled
588 /// @tparam RasterScissorEdgesT: is scissor enabled?
589 /// @tparam IsConservativeT: is conservative rast enabled?
590 /// @tparam RT: rasterizer traits
591 template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT>
592 struct ComputeScissorEdges
593 {
594 INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
595 EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){};
596 };
597
598 //////////////////////////////////////////////////////////////////////////
599 /// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
600 /// specialization. Instantiated when conservative rast and scissor are enabled
601 template <typename RT>
602 struct ComputeScissorEdges<std::true_type, std::true_type, RT>
603 {
604 //////////////////////////////////////////////////////////////////////////
605 /// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
606 /// evaluate edge equations and offset them away from pixel center.
607 INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
608 EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
609 {
610 // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
611 SWR_RECT scissor;
612 scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin);
613 scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax);
614 scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin);
615 scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax);
616
617 POS topLeft{scissor.xmin, scissor.ymin};
618 POS bottomLeft{scissor.xmin, scissor.ymax};
619 POS topRight{scissor.xmax, scissor.ymin};
620 POS bottomRight{scissor.xmax, scissor.ymax};
621
622 // construct 4 scissor edges in ccw direction
623 ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
624 ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
625 ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
626 ComputeEdgeData(topRight, topLeft, rastEdges[6]);
627
628 vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
629 vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
630 vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
631 vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
632
633 // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing
634 adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]);
635 adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]);
636 adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]);
637 adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]);
638
639 // Upper left rule for scissor
640 vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
641 vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
642 }
643 };
644
645 //////////////////////////////////////////////////////////////////////////
646 /// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
647 /// specialization. Instantiated when scissor is enabled and conservative rast
648 /// is disabled.
649 template <typename RT>
650 struct ComputeScissorEdges<std::true_type, std::false_type, RT>
651 {
652 //////////////////////////////////////////////////////////////////////////
653 /// @brief Compute scissor edge vectors and evaluate edge equations
654 INLINE ComputeScissorEdges(const SWR_RECT &, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
655 EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
656 {
657 const SWR_RECT &scissor = scissorBBox;
658 POS topLeft{scissor.xmin, scissor.ymin};
659 POS bottomLeft{scissor.xmin, scissor.ymax};
660 POS topRight{scissor.xmax, scissor.ymin};
661 POS bottomRight{scissor.xmax, scissor.ymax};
662
663 // construct 4 scissor edges in ccw direction
664 ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
665 ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
666 ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
667 ComputeEdgeData(topRight, topLeft, rastEdges[6]);
668
669 vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
670 vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
671 vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
672 vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
673
674 // Upper left rule for scissor
675 vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
676 vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
677 }
678 };
679
680 //////////////////////////////////////////////////////////////////////////
681 /// @brief Primary function template for TrivialRejectTest. Should
682 /// never be called, but TemplateUnroller instantiates a few unused values,
683 /// so it calls a runtime assert instead of a static_assert.
684 template <typename ValidEdgeMaskT>
685 INLINE bool TrivialRejectTest(const int, const int, const int)
686 {
687 SWR_INVALID("Primary templated function should never be called");
688 return false;
689 };
690
691 //////////////////////////////////////////////////////////////////////////
692 /// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
693 /// and edge 1 for trivial coverage reject
694 template <>
695 INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int)
696 {
697 return (!(mask0 && mask1)) ? true : false;
698 };
699
700 //////////////////////////////////////////////////////////////////////////
701 /// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
702 /// and edge 2 for trivial coverage reject
703 template <>
704 INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2)
705 {
706 return (!(mask0 && mask2)) ? true : false;
707 };
708
709 //////////////////////////////////////////////////////////////////////////
710 /// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
711 /// and edge 2 for trivial coverage reject
712 template <>
713 INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2)
714 {
715 return (!(mask1 && mask2)) ? true : false;
716 };
717
718 //////////////////////////////////////////////////////////////////////////
719 /// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
720 /// primitive edges for trivial coverage reject
721 template <>
722 INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
723 {
724 return (!(mask0 && mask1 && mask2)) ? true : false;
725 };
726
727 //////////////////////////////////////////////////////////////////////////
728 /// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
729 /// point, so return false and rasterize against conservative BBox
730 template <>
731 INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int)
732 {
733 return false;
734 };
735
736 //////////////////////////////////////////////////////////////////////////
737 /// @brief Primary function template for TrivialAcceptTest. Always returns false;
738 /// it is instantiated when scissor edges must be rasterized, and since those edges
739 /// are not evaluated at the raster tile corners the tile can never be trivially accepted here
740 template <typename ScissorEnableT>
741 INLINE bool TrivialAcceptTest(const int, const int, const int)
742 {
743 return false;
744 };
745
746 //////////////////////////////////////////////////////////////////////////
747 /// @brief std::false_type (scissor disabled) specialization of TrivialAcceptTest. Test all
748 /// edge masks for a fully covered raster tile
749 template <>
750 INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2)
751 {
752 return ((mask0 & mask1 & mask2) == 0xf);
753 };
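// A movemask of 0xf means all four lanes (the four raster tile corners) have their sign
// bit set for that edge; ANDing the three edge masks therefore accepts the tile only when
// every corner is inside every edge (negative edge value == inside).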
754
755 //////////////////////////////////////////////////////////////////////////
756 /// @brief Primary function template for GenerateSVInnerCoverage. Results
757 /// in an empty function call if SVInnerCoverage isn't requested
758 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
759 struct GenerateSVInnerCoverage
760 {
761 INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*, uint64_t &){};
762 };
763
764 //////////////////////////////////////////////////////////////////////////
765 /// @brief Specialization of GenerateSVInnerCoverage where all edges
766 /// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated
767 /// edge values from OuterConservative to InnerConservative and rasterizes.
768 template <typename RT>
769 struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
770 {
771 INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, uint32_t workerId, EDGE* pRastEdges, double* pStartQuadEdges, uint64_t &innerCoverageMask)
772 {
773 SWR_CONTEXT *pContext = pDC->pContext;
774
775 double startQuadEdgesAdj[RT::NumEdgesT::value];
776 for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
777 {
778 startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
779 }
780
781 // not trivial accept or reject, must rasterize full tile
782 AR_BEGIN(BERasterizePartial, pDC->drawId);
783 innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges);
784 AR_END(BERasterizePartial, 0);
785 }
786 };
787
788 //////////////////////////////////////////////////////////////////////////
789 /// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
790 /// in an empty function call if SVInnerCoverage isn't requested
791 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
792 struct UpdateEdgeMasksInnerConservative
793 {
794 INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*,
795 const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){};
796 };
797
798 //////////////////////////////////////////////////////////////////////////
799 /// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
800 /// are non-degenerate and SVInnerCoverage is requested. Offsets the edges
801 /// evaluated at raster tile corners to inner conservative position and
802 /// updates edge masks
803 template <typename RT>
804 struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
805 {
806 INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
807 const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2)
808 {
809 __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]};
810
811 // instead of keeping 2 copies of evaluated edges around, just compensate for the outer
812 // conservative evaluated edge when adjusting the edge in for inner conservative tests
813 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[0]);
814 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[1]);
815 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[2]);
816
817 UpdateEdgeMasks<typename RT::NumCoverageSamplesT>(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
818 }
819 };
820
821 //////////////////////////////////////////////////////////////////////////
822 /// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage
823 /// is requested but at least one edge is degenerate. Since a degenerate triangle cannot
824 /// cover an entire raster tile, set mask0 to 0 to force it down the
825 /// rasterizePartialTile path
826 template <typename RT, typename ValidEdgeMaskT>
827 struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
828 {
829 INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*,
830 const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &)
831 {
832 // set one mask to zero to force the triangle down the rasterizePartialTile path
833 mask0 = 0;
834 }
835 };
836
837 template <typename RT>
838 void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
839 {
840 SWR_CONTEXT *pContext = pDC->pContext;
841 const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
842 #if KNOB_ENABLE_TOSS_POINTS
843 if (KNOB_TOSS_BIN_TRIS)
844 {
845 return;
846 }
847 #endif
848 AR_BEGIN(BERasterizeTriangle, pDC->drawId);
849 AR_BEGIN(BETriangleSetup, pDC->drawId);
850
851 const API_STATE &state = GetApiState(pDC);
852 const SWR_RASTSTATE &rastState = state.rastState;
853 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
854
855 OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
856 triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
857
858 __m128 vX, vY, vZ, vRecipW;
859
860 // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
861 // eg: vX = [x0 x1 x2 dc]
862 vX = _mm_load_ps(workDesc.pTriBuffer);
863 vY = _mm_load_ps(workDesc.pTriBuffer + 4);
864 vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
865 vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
866
867 // convert to fixed point
868 static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, "Rasterizer expects 16.8 fixed point precision");
869 __m128i vXi = fpToFixedPoint(vX);
870 __m128i vYi = fpToFixedPoint(vY);
871
872 // quantize floating point position to fixed point precision
873 // to prevent attribute creep around the triangle vertices
874 vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
875 vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
876
877 // triangle setup - A and B edge equation coefs
878 __m128 vA, vB;
879 triangleSetupAB(vX, vY, vA, vB);
880
881 __m128i vAi, vBi;
882 triangleSetupABInt(vXi, vYi, vAi, vBi);
883
884 // determinant
885 float det = calcDeterminantInt(vAi, vBi);
886
887 // Verts in Pixel Coordinate Space at this point
888 // Det > 0 = CW winding order
889 // Convert CW triangles to CCW
890 if (det > 0.0)
891 {
892 vA = _mm_mul_ps(vA, _mm_set1_ps(-1));
893 vB = _mm_mul_ps(vB, _mm_set1_ps(-1));
894 vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
895 vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
896 det = -det;
897 }
898
899 __m128 vC;
900 // Finish triangle setup - C edge coef
901 triangleSetupC(vX, vY, vA, vB, vC);
902
903 if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
904 {
905 // If we have degenerate edge(s) to rasterize, set I and J coefs
906 // to 0 for constant interpolation of attributes
907 triDesc.I[0] = 0.0f;
908 triDesc.I[1] = 0.0f;
909 triDesc.I[2] = 0.0f;
910 triDesc.J[0] = 0.0f;
911 triDesc.J[1] = 0.0f;
912 triDesc.J[2] = 0.0f;
913
914 // Degenerate triangles have no area
915 triDesc.recipDet = 0.0f;
916 }
917 else
918 {
919 // only extract coefs for 2 of the barycentrics; the 3rd can be
920 // determined from the barycentric equation:
921 // i + j + k = 1 <=> k = 1 - j - i
922 _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
923 _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
924 _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
925 _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
926 _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
927 _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
928
929 // compute recipDet, used to calculate barycentric i and j in the backend
930 triDesc.recipDet = 1.0f/det;
931 }
932
933 OSALIGNSIMD(float) oneOverW[4];
934 _mm_store_ps(oneOverW, vRecipW);
935 triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
936 triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
937 triDesc.OneOverW[2] = oneOverW[2];
938
939 // calculate perspective correct coefs per vertex attrib
940 float* pPerspAttribs = perspAttribsTLS;
941 float* pAttribs = workDesc.pAttribs;
942 triDesc.pPerspAttribs = pPerspAttribs;
943 triDesc.pAttribs = pAttribs;
944 float *pRecipW = workDesc.pTriBuffer + 12;
945 triDesc.pRecipW = pRecipW;
946 __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
947 __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1);
948 __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1);
949 for(uint32_t i = 0; i < workDesc.numAttribs; i++)
950 {
951 __m128 attribA = _mm_load_ps(pAttribs);
952 __m128 attribB = _mm_load_ps(pAttribs+=4);
953 __m128 attribC = _mm_load_ps(pAttribs+=4);
954 pAttribs+=4;
955
956 attribA = _mm_mul_ps(attribA, vOneOverWV0);
957 attribB = _mm_mul_ps(attribB, vOneOverWV1);
958 attribC = _mm_mul_ps(attribC, vOneOverWV2);
959
960 _mm_store_ps(pPerspAttribs, attribA);
961 _mm_store_ps(pPerspAttribs+=4, attribB);
962 _mm_store_ps(pPerspAttribs+=4, attribC);
963 pPerspAttribs+=4;
964 }
965
966 // compute bary Z
967 // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
968 OSALIGNSIMD(float) a[4];
969 _mm_store_ps(a, vZ);
970 triDesc.Z[0] = a[0] - a[2];
971 triDesc.Z[1] = a[1] - a[2];
972 triDesc.Z[2] = a[2];
973
974 // add depth bias
975 triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
976
977 // Calc bounding box of triangle
978 OSALIGNSIMD(SWR_RECT) bbox;
979 calcBoundingBoxInt(vXi, vYi, bbox);
980
981 const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
982
983 if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
984 {
985 // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
986 bbox.xmin--; bbox.xmax++; bbox.ymin--; bbox.ymax++;
987 SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
988 "Conservative rast degenerate handling requires a valid scissor rect");
989 }
990
991 // Intersect with scissor/viewport
992 OSALIGNSIMD(SWR_RECT) intersect;
993 intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
994 intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
995 intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
996 intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);
997
998 triDesc.triFlags = workDesc.triFlags;
999
1000 // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
1001 uint32_t macroX, macroY;
1002 MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
1003 int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
1004 int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
1005 int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
1006 int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
1007
1008 intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
1009 intersect.ymin = std::max(intersect.ymin, macroBoxTop);
1010 intersect.xmax = std::min(intersect.xmax, macroBoxRight);
1011 intersect.ymax = std::min(intersect.ymax, macroBoxBottom);
1012
1013 SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && intersect.ymax >= 0);
1014
1015 AR_END(BETriangleSetup, 0);
1016
1017 // update triangle desc
1018 uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1019 uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1020 uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1021 uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1022 uint32_t numTilesX = maxTileX - minTileX + 1;
1023 uint32_t numTilesY = maxTileY - minTileY + 1;
1024
1025 if (numTilesX == 0 || numTilesY == 0)
1026 {
1027 RDTSC_EVENT(BEEmptyTriangle, 1, 0);
1028 AR_END(BERasterizeTriangle, 1);
1029 return;
1030 }
1031
1032 AR_BEGIN(BEStepSetup, pDC->drawId);
1033
1034 // Step to pixel center of top-left pixel of the triangle bbox
1035 // Align intersect bbox (top/left) to raster tile's (top/left).
1036 int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
1037 int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));
1038
1039 // convenience typedef
1040 typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT;
1041
1042 // single sample rasterization evaluates edges at pixel center,
1043 // multisample evaluates edges UL pixel corner and steps to each sample position
1044 if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
1045 {
1046 // Add 0.5, in fixed point, to offset to pixel center
1047 x += (FIXED_POINT_SCALE / 2);
1048 y += (FIXED_POINT_SCALE / 2);
1049 }
1050
1051 __m128i vTopLeftX = _mm_set1_epi32(x);
1052 __m128i vTopLeftY = _mm_set1_epi32(y);
1053
1054 // evaluate edge equations at top-left pixel using 64bit math
1055 //
1056 // line = Ax + By + C
1057 // solving for C:
1058 // C = -Ax - By
1059 // we know x0 and y0 are on the line; plug them in:
1060 // C = -Ax0 - By0
1061 // plug C back into line equation:
1062 // line = Ax + By - Ax0 - By0
1063 // line = A(x - x0) + B(y - y0)
1064 // dX = (x-x0), dY = (y-y0)
1065 // so all this simplifies to
1066 // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
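// Equivalent scalar form (illustration only, not part of the original code):
//   int64_t edge = (int64_t)A * (x - x0) + (int64_t)B * (y - y0);
// which equals A*x + B*y + C with C = -A*x0 - B*y0, as derived above.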
1067
1068 __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
1069 __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
1070
1071 // evaluate A(dx) and B(dY) for all points
1072 __m256d vAipd = _mm256_cvtepi32_pd(vAi);
1073 __m256d vBipd = _mm256_cvtepi32_pd(vBi);
1074 __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
1075 __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
1076
1077 __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
1078 __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
1079 __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
1080
1081 // apply any edge adjustments (top-left, conservative rast, etc)
1082 adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);
1083
1084 // broadcast respective edge results to all lanes
1085 double* pEdge = (double*)&vEdge;
1086 __m256d vEdgeFix16[7];
1087 vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
1088 vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
1089 vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);
1090
1091 OSALIGNSIMD(int32_t) aAi[4], aBi[4];
1092 _mm_store_si128((__m128i*)aAi, vAi);
1093 _mm_store_si128((__m128i*)aBi, vBi);
1094 EDGE rastEdges[RT::NumEdgesT::value];
1095
1096 // Compute and store triangle edge data
1097 ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
1098 ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
1099 ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);
1100
1101 // Compute and store scissor edge data if the scissor edges need to be rasterized
1102 ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
1103 (bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
1104
1105 // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
1106 // used for testing if the entire raster tile is inside a triangle
1107 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1108 {
1109 vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
1110 }
1111
1112 // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
1113 // step sample positions to the raster tile bbox of multisample points
1114 // min(xSamples),min(ySamples) ------ max(xSamples),min(ySamples)
1115 // | |
1116 // | |
1117 // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples)
1118 __m256d vEdgeTileBbox[3];
1119 if (NumCoverageSamplesT::value > 1)
1120 {
1121 const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
1122 const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX();
1123 const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY();
1124
1125 __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
1126 __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
1127
1128 // step edge equation tests from the UL tile corner to the multisample bounding box
1129 // used for testing if the entire raster tile is inside a triangle
1130 for (uint32_t e = 0; e < 3; ++e)
1131 {
1132 __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
1133 __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
1134 vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
1135
1136 // adjust for msaa tile bbox edges outward for conservative rast, if enabled
1137 adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdgeTileBbox[e]);
1138 }
1139 }
1140
1141 AR_END(BEStepSetup, 0);
1142
1143 uint32_t tY = minTileY;
1144 uint32_t tX = minTileX;
1145 uint32_t maxY = maxTileY;
1146 uint32_t maxX = maxTileX;
1147
1148 RenderOutputBuffers renderBuffers, currentRenderBufferRow;
1149 GetRenderHotTiles<RT::MT::numSamples>(pDC, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
1150 currentRenderBufferRow = renderBuffers;
1151
1152 // rasterize and generate coverage masks per sample
1153 for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
1154 {
1155 __m256d vStartOfRowEdge[RT::NumEdgesT::value];
1156 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1157 {
1158 vStartOfRowEdge[e] = vEdgeFix16[e];
1159 }
1160
1161 for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
1162 {
1163 triDesc.anyCoveredSamples = 0;
1164
1165 // which corners of the raster tile are inside each edge? (vEdge < 0 == inside)
1166 int mask0, mask1, mask2;
1167 UpdateEdgeMasks<NumCoverageSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);
1168
1169 for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++)
1170 {
1171 // trivial reject, at least one edge has all 4 corners of raster tile outside
1172 bool trivialReject = TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);
1173
1174 if (!trivialReject)
1175 {
1176 // trivial accept mask
1177 triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
1178
1179 // Update the raster tile edge masks based on inner conservative edge offsets, if enabled
1180 UpdateEdgeMasksInnerConservative<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>
1181 (vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);
1182
1183 // @todo Make this a bit smarter to allow use of trivial accept when:
1184 // 1) scissor/vp intersection rect is raster tile aligned
1185 // 2) raster tile is entirely within scissor/vp intersection rect
1186 if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2))
1187 {
1188 // trivial accept, all 4 corners of all 3 edges are negative
1189 // i.e. raster tile completely inside triangle
1190 triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
1191 if(std::is_same<typename RT::InputCoverageT, InnerConservativeCoverageT>::value)
1192 {
1193 triDesc.innerCoverageMask = 0xffffffffffffffffULL;
1194 }
1195 RDTSC_EVENT(BETrivialAccept, 1, 0);
1196 }
1197 else
1198 {
1199 __m256d vEdgeAtSample[RT::NumEdgesT::value];
1200 if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
1201 {
1202 // should get optimized out for single sample case (global value numbering or copy propagation)
1203 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1204 {
1205 vEdgeAtSample[e] = vEdgeFix16[e];
1206 }
1207 }
1208 else
1209 {
1210 const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
1211 __m128i vSampleOffsetXh = samplePos.vXi(sampleNum);
1212 __m128i vSampleOffsetYh = samplePos.vYi(sampleNum);
1213 __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
1214 __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
1215
1216 // step edge equation tests from UL tile corner to pixel sample position
1217 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1218 {
1219 __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
1220 __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
1221 vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
1222 vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
1223 }
1224 }
1225
1226 double startQuadEdges[RT::NumEdgesT::value];
1227 const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
1228 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1229 {
1230 _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
1231 }
1232
1233 // not trivial accept or reject, must rasterize full tile
1234 AR_BEGIN(BERasterizePartial, pDC->drawId);
1235 triDesc.coverageMask[sampleNum] = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdges, rastEdges);
1236 AR_END(BERasterizePartial, 0);
1237
1238 triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];
1239
1240 // Output SV InnerCoverage, if needed
1241 GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
1242 }
1243 }
1244 else
1245 {
1246 // if we're calculating coverage per sample, store the empty coverage; otherwise there are no covered samples and nothing to do
1247 if(NumCoverageSamplesT::value > 1)
1248 {
1249 triDesc.coverageMask[sampleNum] = 0;
1250 }
1251 RDTSC_EVENT(BETrivialReject, 1, 0);
1252 }
1253 }
1254
1255 #if KNOB_ENABLE_TOSS_POINTS
1256 if(KNOB_TOSS_RS)
1257 {
1258 gToss = triDesc.coverageMask[0];
1259 }
1260 else
1261 #endif
1262 if(triDesc.anyCoveredSamples)
1263 {
1264 // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
1265 // copy conservative coverage result to all samples
1266 if(RT::IsConservativeT::value)
1267 {
1268 auto copyCoverage = [&](int sample){triDesc.coverageMask[sample] = triDesc.coverageMask[0]; };
1269 UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
1270 }
1271
1272 AR_BEGIN(BEPixelBackend, pDC->drawId);
1273 backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
1274 AR_END(BEPixelBackend, 0);
1275 }
1276
1277 // step to the next tile in X
1278 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1279 {
1280 vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
1281 }
1282 StepRasterTileX<RT>(state.psState.numRenderTargets, renderBuffers);
1283 }
1284
1285 // step to the next tile in Y
1286 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1287 {
1288 vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
1289 }
1290 StepRasterTileY<RT>(state.psState.numRenderTargets, renderBuffers, currentRenderBufferRow);
1291 }
1292
1293 AR_END(BERasterizeTriangle, 1);
1294 }
1295
1296 void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
1297 {
1298 const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
1299 const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
1300 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1301
1302 bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0;
1303
1304 // load point vertex
1305 float x = *workDesc.pTriBuffer;
1306 float y = *(workDesc.pTriBuffer + 1);
1307 float z = *(workDesc.pTriBuffer + 2);
1308
1309 // create a copy of the triangle buffer to write our adjusted vertices to
1310 OSALIGNSIMD(float) newTriBuffer[4 * 4];
1311 TRIANGLE_WORK_DESC newWorkDesc = workDesc;
1312 newWorkDesc.pTriBuffer = &newTriBuffer[0];
1313
1314 // create a copy of the attrib buffer to write our adjusted attribs to
1315 OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
1316 newWorkDesc.pAttribs = &newAttribBuffer[0];
1317
1318 newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
1319 newWorkDesc.numAttribs = workDesc.numAttribs;
1320 newWorkDesc.triFlags = workDesc.triFlags;
1321
1322 // construct two tris by bloating point by point size
1323 float halfPointSize = workDesc.triFlags.pointSize * 0.5f;
1324 float lowerX = x - halfPointSize;
1325 float upperX = x + halfPointSize;
1326 float lowerY = y - halfPointSize;
1327 float upperY = y + halfPointSize;
1328
1329 // tri 0
1330 float *pBuf = &newTriBuffer[0];
1331 *pBuf++ = lowerX;
1332 *pBuf++ = lowerX;
1333 *pBuf++ = upperX;
1334 pBuf++;
1335 *pBuf++ = lowerY;
1336 *pBuf++ = upperY;
1337 *pBuf++ = upperY;
1338 pBuf++;
1339 _mm_store_ps(pBuf, _mm_set1_ps(z));
1340 _mm_store_ps(pBuf+=4, _mm_set1_ps(1.0f));
1341
1342 // setup triangle rasterizer function
1343 PFN_WORK_FUNC pfnTriRast;
1344 // conservative rast not supported for points/lines
1345 pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false,
1346 SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false));
1347
1348 // overwrite texcoords for point sprites
1349 if (isPointSpriteTexCoordEnabled)
1350 {
1351 // copy original attribs
1352 memcpy(&newAttribBuffer[0], workDesc.pAttribs, 4 * 3 * workDesc.numAttribs * sizeof(float));
1353 newWorkDesc.pAttribs = &newAttribBuffer[0];
1354
1355 // overwrite texcoord for point sprites
1356 uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
1357 DWORD texCoordAttrib = 0;
1358
1359 while (_BitScanForward(&texCoordAttrib, texCoordMask))
1360 {
1361 texCoordMask &= ~(1 << texCoordAttrib);
1362 __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
1363 if (rastState.pointSpriteTopOrigin)
1364 {
1365 pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
1366 pTexAttrib[1] = _mm_set_ps(1, 0, 1, 0);
1367 pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
1368 }
1369 else
1370 {
1371 pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
1372 pTexAttrib[1] = _mm_set_ps(1, 0, 0, 0);
1373 pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
1374 }
1375 }
1376 }
1377 else
1378 {
1379 // no texcoord overwrite, can reuse the attrib buffer from frontend
1380 newWorkDesc.pAttribs = workDesc.pAttribs;
1381 }
1382
1383 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
1384
1385 // tri 1
1386 pBuf = &newTriBuffer[0];
1387 *pBuf++ = lowerX;
1388 *pBuf++ = upperX;
1389 *pBuf++ = upperX;
1390 pBuf++;
1391 *pBuf++ = lowerY;
1392 *pBuf++ = upperY;
1393 *pBuf++ = lowerY;
1394 // z, w unchanged
1395
1396 if (isPointSpriteTexCoordEnabled)
1397 {
1398 uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
1399 DWORD texCoordAttrib = 0;
1400
1401 while (_BitScanForward(&texCoordAttrib, texCoordMask))
1402 {
1403 texCoordMask &= ~(1 << texCoordAttrib);
1404 __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
1405 if (rastState.pointSpriteTopOrigin)
1406 {
1407 pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
1408 pTexAttrib[1] = _mm_set_ps(1, 0, 1, 1);
1409 pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
1410
1411 }
1412 else
1413 {
1414 pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
1415 pTexAttrib[1] = _mm_set_ps(1, 0, 0, 1);
1416 pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
1417 }
1418 }
1419 }
1420
1421 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
1422 }
1423
1424 void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
1425 {
1426 SWR_CONTEXT *pContext = pDC->pContext;
1427
1428 #if KNOB_ENABLE_TOSS_POINTS
1429 if (KNOB_TOSS_BIN_TRIS)
1430 {
1431 return;
1432 }
1433 #endif
1434
1435 const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
1436 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
1437
1438 // map x,y relative offsets from start of raster tile to bit position in
1439 // coverage mask for the point
1440 static const uint32_t coverageMap[8][8] = {
1441 { 0, 1, 4, 5, 8, 9, 12, 13 },
1442 { 2, 3, 6, 7, 10, 11, 14, 15 },
1443 { 16, 17, 20, 21, 24, 25, 28, 29 },
1444 { 18, 19, 22, 23, 26, 27, 30, 31 },
1445 { 32, 33, 36, 37, 40, 41, 44, 45 },
1446 { 34, 35, 38, 39, 42, 43, 46, 47 },
1447 { 48, 49, 52, 53, 56, 57, 60, 61 },
1448 { 50, 51, 54, 55, 58, 59, 62, 63 }
1449 };
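    // the bit layout is quad-based rather than row-major: each 2x2 block of pixels owns four
    // consecutive bits (0-3, 4-7, ...), walking left to right across two-row bands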
1450
1451 OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
1452
1453 // pull point information from triangle buffer
1454 // @todo use structs for readability
1455 uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer;
1456 uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1);
1457 float z = *(workDesc.pTriBuffer + 2);
1458
1459 // construct triangle descriptor for point
1460     // points are not interpolated; set up i,j so z and the attribs evaluate to constants
1461 // @todo implement an optimized backend that doesn't require triangle information
1462
1463     // the binner packs the point's tile-relative x,y position into triFlags.coverageMask
1464     // (x in the low bits, y starting at bit 4); mask each to coverageMap's valid 0-7 range
1465 uint32_t tX = workDesc.triFlags.coverageMask & 0x7;
1466 uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7;
1467 // todo: multisample points?
1468 triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX];
1469
1470 // no persp divide needed for points
1471 triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
1472 triDesc.triFlags = workDesc.triFlags;
1473 triDesc.recipDet = 1.0f;
1474 triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f;
1475 triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f;
1476 triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f;
1477 triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z;
1478
1479 RenderOutputBuffers renderBuffers;
1480 GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT,
1481 renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
1482
1483 AR_BEGIN(BEPixelBackend, pDC->drawId);
1484 backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
1485 AR_END(BEPixelBackend, 0);
1486 }
1487
1488 // Get pointers to hot tile memory for color RT, depth, stencil
1489 template <uint32_t numSamples>
1490 void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex)
1491 {
1492 const API_STATE& state = GetApiState(pDC);
1493 SWR_CONTEXT *pContext = pDC->pContext;
1494
1495 uint32_t mx, my;
1496 MacroTileMgr::getTileIndices(macroID, mx, my);
1497 tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
1498 tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;
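    // tileX/tileY are now relative to this macrotile's origin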
1499
1500 // compute tile offset for active hottile buffers
1501 const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
1502 uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1503     offset *= numSamples;
1504
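    // walk the enabled color hot tiles, mark each one dirty (the backend is about to write to it)
    // and hand its buffer, plus the raster-tile offset, to the backend via renderBuffers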
1505 unsigned long rtSlot = 0;
1506 uint32_t colorHottileEnableMask = state.colorHottileEnable;
1507 while(_BitScanForward(&rtSlot, colorHottileEnableMask))
1508 {
1509 HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true,
1510 numSamples, renderTargetArrayIndex);
1511 pColor->state = HOTTILE_DIRTY;
1512 renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
1513
1514 colorHottileEnableMask &= ~(1 << rtSlot);
1515 }
1516 if(state.depthHottileEnable)
1517 {
1518 const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
1519 uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1520         offset *= numSamples;
1521 HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true,
1522 numSamples, renderTargetArrayIndex);
1523 pDepth->state = HOTTILE_DIRTY;
1524 SWR_ASSERT(pDepth->pBuffer != nullptr);
1525 renderBuffers.pDepth = pDepth->pBuffer + offset;
1526 }
1527 if(state.stencilHottileEnable)
1528 {
1529 const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
1530 uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1531         offset *= numSamples;
1532 HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true,
1533 numSamples, renderTargetArrayIndex);
1534 pStencil->state = HOTTILE_DIRTY;
1535 SWR_ASSERT(pStencil->pBuffer != nullptr);
1536 renderBuffers.pStencil = pStencil->pBuffer + offset;
1537 }
1538 }
1539
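// step the color/depth/stencil hot-tile pointers to the next raster tile in X, using the
// per-raster-tile steps baked into the RasterizerTraits (RT) specialization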
1540 template <typename RT>
1541 INLINE void StepRasterTileX(uint32_t NumRT, RenderOutputBuffers &buffers)
1542 {
1543 for(uint32_t rt = 0; rt < NumRT; ++rt)
1544 {
1545 buffers.pColor[rt] += RT::colorRasterTileStep;
1546 }
1547
1548 buffers.pDepth += RT::depthRasterTileStep;
1549 buffers.pStencil += RT::stencilRasterTileStep;
1550 }
1551
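// after a full row of raster tiles, advance the saved row-start pointers by one tile row
// and reset the working pointers to that new row start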
1552 template <typename RT>
1553 INLINE void StepRasterTileY(uint32_t NumRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow)
1554 {
1555 for(uint32_t rt = 0; rt < NumRT; ++rt)
1556 {
1557 startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
1558 buffers.pColor[rt] = startBufferRow.pColor[rt];
1559 }
1560 startBufferRow.pDepth += RT::depthRasterTileRowStep;
1561 buffers.pDepth = startBufferRow.pDepth;
1562
1563 startBufferRow.pStencil += RT::stencilRasterTileRowStep;
1564 buffers.pStencil = startBufferRow.pStencil;
1565 }
1566
1567 void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
1568 {
1569 SWR_CONTEXT *pContext = pDC->pContext;
1570 const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pData);
1571 #if KNOB_ENABLE_TOSS_POINTS
1572 if (KNOB_TOSS_BIN_TRIS)
1573 {
1574 return;
1575 }
1576 #endif
1577
1578 // bloat line to two tris and call the triangle rasterizer twice
1579 AR_BEGIN(BERasterizeLine, pDC->drawId);
1580
1581 const API_STATE &state = GetApiState(pDC);
1582 const SWR_RASTSTATE &rastState = state.rastState;
1583
1584     // compute this macrotile's bounds in fixed point
1585 uint32_t macroX, macroY;
1586 MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
1587 int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
1588 int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
1589 int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
1590 int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
1591
1592 const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
1593
1594 // create a copy of the triangle buffer to write our adjusted vertices to
1595 OSALIGNSIMD(float) newTriBuffer[4 * 4];
1596 TRIANGLE_WORK_DESC newWorkDesc = workDesc;
1597 newWorkDesc.pTriBuffer = &newTriBuffer[0];
1598
1599 // create a copy of the attrib buffer to write our adjusted attribs to
1600 OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
1601 newWorkDesc.pAttribs = &newAttribBuffer[0];
1602
1603 const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f);
1604 const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f);
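    // the line is widened by +/- 0.5 * lineWidth along its minor axis (applied in X for y-major
    // lines, in Y otherwise); vBloat0/vBloat1 hold the per-vertex signs for the two triangles so
    // that together they cover the four corners of the widened line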
1605
1606 __m128 vX, vY, vZ, vRecipW;
1607
1608 vX = _mm_load_ps(workDesc.pTriBuffer);
1609 vY = _mm_load_ps(workDesc.pTriBuffer + 4);
1610 vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
1611 vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
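    // the binned vertex data is SoA: 4 X, then 4 Y, 4 Z, 4 1/W; only lanes 0 and 1 (the line
    // endpoints) are used by the shuffles below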
1612
1613 // triangle 0
1614 // v0,v1 -> v0,v0,v1
1615 __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0));
1616 __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0));
1617 __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0));
1618 __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0));
1619
1620 __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth);
1621 __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0);
1622 if (workDesc.triFlags.yMajor)
1623 {
1624 vXa = _mm_add_ps(vAdjust, vXa);
1625 }
1626 else
1627 {
1628 vYa = _mm_add_ps(vAdjust, vYa);
1629 }
1630
1631 // Store triangle description for rasterizer
1632 _mm_store_ps((float*)&newTriBuffer[0], vXa);
1633 _mm_store_ps((float*)&newTriBuffer[4], vYa);
1634 _mm_store_ps((float*)&newTriBuffer[8], vZa);
1635 _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
1636
1637 // binner bins 3 edges for lines as v0, v1, v1
1638 // tri0 needs v0, v0, v1
1639 for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
1640 {
1641 __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a*12 + 0]);
1642 __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a*12 + 4]);
1643
1644 _mm_store_ps((float*)&newAttribBuffer[a*12 + 0], vAttrib0);
1645 _mm_store_ps((float*)&newAttribBuffer[a*12 + 4], vAttrib0);
1646 _mm_store_ps((float*)&newAttribBuffer[a*12 + 8], vAttrib1);
1647 }
1648
1649 // Store user clip distances for triangle 0
1650 float newClipBuffer[3 * 8];
1651 uint32_t numClipDist = _mm_popcnt_u32(state.rastState.clipDistanceMask);
1652 if (numClipDist)
1653 {
1654 newWorkDesc.pUserClipBuffer = newClipBuffer;
1655
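        // the binner stores each line clip distance as two coefficients (a, b) with a = d0 - d1
        // and b = d1; recover the endpoint distances c0, c1 and re-emit them in the per-triangle
        // layout (dA - dC, dB - dC, dC) for tri0's vertices (v0, v0, v1)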
1656 float* pOldBuffer = workDesc.pUserClipBuffer;
1657 float* pNewBuffer = newClipBuffer;
1658 for (uint32_t i = 0; i < numClipDist; ++i)
1659 {
1660 // read barycentric coeffs from binner
1661 float a = *(pOldBuffer++);
1662 float b = *(pOldBuffer++);
1663
1664 // reconstruct original clip distance at vertices
1665 float c0 = a + b;
1666 float c1 = b;
1667
1668 // construct triangle barycentrics
1669 *(pNewBuffer++) = c0 - c1;
1670 *(pNewBuffer++) = c0 - c1;
1671 *(pNewBuffer++) = c1;
1672 }
1673 }
1674
1675 // setup triangle rasterizer function
1676 PFN_WORK_FUNC pfnTriRast;
1677 // conservative rast not supported for points/lines
1678 pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false,
1679 SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false));
1680
1681 // make sure this macrotile intersects the triangle
1682 __m128i vXai = fpToFixedPoint(vXa);
1683 __m128i vYai = fpToFixedPoint(vYa);
1684 OSALIGNSIMD(SWR_RECT) bboxA;
1685 calcBoundingBoxInt(vXai, vYai, bboxA);
1686
1687 if (!(bboxA.xmin > macroBoxRight ||
1688 bboxA.xmin > scissorInFixedPoint.xmax ||
1689 bboxA.xmax - 1 < macroBoxLeft ||
1690 bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
1691 bboxA.ymin > macroBoxBottom ||
1692 bboxA.ymin > scissorInFixedPoint.ymax ||
1693 bboxA.ymax - 1 < macroBoxTop ||
1694 bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
1695 // rasterize triangle
1696 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
1697 }
1698
1699 // triangle 1
1700 // v0,v1 -> v1,v1,v0
1701 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1));
1702 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1));
1703 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1));
1704 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1));
1705
1706 vAdjust = _mm_mul_ps(vLineWidth, vBloat1);
1707 if (workDesc.triFlags.yMajor)
1708 {
1709 vXa = _mm_add_ps(vAdjust, vXa);
1710 }
1711 else
1712 {
1713 vYa = _mm_add_ps(vAdjust, vYa);
1714 }
1715
1716 // Store triangle description for rasterizer
1717 _mm_store_ps((float*)&newTriBuffer[0], vXa);
1718 _mm_store_ps((float*)&newTriBuffer[4], vYa);
1719 _mm_store_ps((float*)&newTriBuffer[8], vZa);
1720 _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
1721
1722 // binner bins 3 edges for lines as v0, v1, v1
1723 // tri1 needs v1, v1, v0
1724 for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
1725 {
1726 __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
1727 __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
1728
1729 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1);
1730 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1);
1731 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0);
1732 }
1733
1734 // store user clip distance for triangle 1
1735 if (numClipDist)
1736 {
1737 float* pOldBuffer = workDesc.pUserClipBuffer;
1738 float* pNewBuffer = newClipBuffer;
1739 for (uint32_t i = 0; i < numClipDist; ++i)
1740 {
1741 // read barycentric coeffs from binner
1742 float a = *(pOldBuffer++);
1743 float b = *(pOldBuffer++);
1744
1745 // reconstruct original clip distance at vertices
1746 float c0 = a + b;
1747 float c1 = b;
1748
1749 // construct triangle barycentrics
1750 *(pNewBuffer++) = c1 - c0;
1751 *(pNewBuffer++) = c1 - c0;
1752 *(pNewBuffer++) = c0;
1753 }
1754 }
1755
1756 vXai = fpToFixedPoint(vXa);
1757 vYai = fpToFixedPoint(vYa);
1758 calcBoundingBoxInt(vXai, vYai, bboxA);
1759
1760 if (!(bboxA.xmin > macroBoxRight ||
1761 bboxA.xmin > scissorInFixedPoint.xmax ||
1762 bboxA.xmax - 1 < macroBoxLeft ||
1763 bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
1764 bboxA.ymin > macroBoxBottom ||
1765 bboxA.ymin > scissorInFixedPoint.ymax ||
1766 bboxA.ymax - 1 < macroBoxTop ||
1767 bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
1768 // rasterize triangle
1769 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
1770 }
1771
1772 AR_END(BERasterizeLine, 1);
1773 }
1774
1775 struct RasterizerChooser
1776 {
1777 typedef PFN_WORK_FUNC FuncType;
1778
1779 template <typename... ArgsB>
1780 static FuncType GetFunc()
1781 {
1782 return RasterizeTriangle<RasterizerTraits<ArgsB...>>;
1783 }
1784 };
1785
1786 // Selector for the correct templated RasterizeTriangle function: TemplateArgUnroller maps the
1787 // runtime arguments onto RasterizerTraits template parameters to pick a precompiled specialization
1787 PFN_WORK_FUNC GetRasterizerFunc(
1788 uint32_t numSamples,
1789 bool IsCenter,
1790 bool IsConservative,
1791 uint32_t InputCoverage,
1792 uint32_t EdgeEnable,
1793 bool RasterizeScissorEdges
1794 )
1795 {
1796 return TemplateArgUnroller<RasterizerChooser>::GetFunc(
1797 IntArg<SWR_MULTISAMPLE_1X,SWR_MULTISAMPLE_TYPE_COUNT-1>{numSamples},
1798 IsCenter,
1799 IsConservative,
1800 IntArg<SWR_INPUT_COVERAGE_NONE, SWR_INPUT_COVERAGE_COUNT-1>{InputCoverage},
1801 IntArg<0, STATE_VALID_TRI_EDGE_COUNT-1>{EdgeEnable},
1802 RasterizeScissorEdges);
1803 }