swr/rast: Support render target mask instead of render target count
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / rasterizer_impl.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file rasterizer_impl.h
24 *
25 * @brief Implementation for the rasterizer.
26 *
27 ******************************************************************************/
28
29 #include <vector>
30 #include <algorithm>
31
32 #include "rasterizer.h"
33 #include "rdtsc_core.h"
34 #include "backend.h"
35 #include "utils.h"
36 #include "frontend.h"
37 #include "tilemgr.h"
38 #include "memory/tilingtraits.h"
39
// Table of rasterizer work functions, defined elsewhere. It is indexed by six
// selectors: dimensions 1, 4 and 5 are sized by SWR_MULTISAMPLE_TYPE_COUNT,
// SWR_INPUT_COVERAGE_COUNT and STATE_VALID_TRI_EDGE_COUNT; the other three are two-valued.
// NOTE(review): the meaning of the three two-valued dimensions is not visible here -
// confirm against the table's definition.
40 extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT][STATE_VALID_TRI_EDGE_COUNT][2];
41
// Forward declarations of function templates whose definitions are not in this part of the file.
42 template <uint32_t numSamples = 1>
43 void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex);
44 template <typename RT>
45 void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers);
46 template <typename RT>
47 void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow);
48
// Lookup table that expands a 4-bit mask (the table index) into a 4-lane double
// vector; adjustTopLeftRuleIntFix16 uses an entry as the selector of _mm256_blendv_pd.
//
// MASKTOVEC takes the bits most-significant first (i3..i0) but emits them in
// reverse, so bit 0 of the index ends up in lane 0 of the vector.
//
// The negation is applied to integer literals on purpose: -1 converts to -1.0
// (sign bit set), while -0 is the integer 0 and converts to +0.0 (sign bit
// clear). Writing the entries as floating-point -0.0 would set the sign bit in
// every lane and break the table.
49 #define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
50 static const __m256d gMaskToVecpd[] =
51 {
52 MASKTOVEC(0, 0, 0, 0),
53 MASKTOVEC(0, 0, 0, 1),
54 MASKTOVEC(0, 0, 1, 0),
55 MASKTOVEC(0, 0, 1, 1),
56 MASKTOVEC(0, 1, 0, 0),
57 MASKTOVEC(0, 1, 0, 1),
58 MASKTOVEC(0, 1, 1, 0),
59 MASKTOVEC(0, 1, 1, 1),
60 MASKTOVEC(1, 0, 0, 0),
61 MASKTOVEC(1, 0, 0, 1),
62 MASKTOVEC(1, 0, 1, 0),
63 MASKTOVEC(1, 0, 1, 1),
64 MASKTOVEC(1, 1, 0, 0),
65 MASKTOVEC(1, 1, 0, 1),
66 MASKTOVEC(1, 1, 1, 0),
67 MASKTOVEC(1, 1, 1, 1),
68 };
69
/// Pair of 32-bit integer coordinates.
struct POS
{
    int32_t x;
    int32_t y;
};
74
75 struct EDGE
76 {
77 double a, b; // a, b edge coefficients in fix8
78 double stepQuadX; // step to adjacent horizontal quad in fix16
79 double stepQuadY; // step to adjacent vertical quad in fix16
80 double stepRasterTileX; // step to adjacent horizontal raster tile in fix16
81 double stepRasterTileY; // step to adjacent vertical raster tile in fix16
82
83 __m256d vQuadOffsets; // offsets for 4 samples of a quad
84 __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
85 };
86
87 //////////////////////////////////////////////////////////////////////////
88 /// @brief Rasterize a raster tile partially covered by the triangle.
89 /// @param pDC - draw context; not referenced by the code in this function
90 /// @param startEdges - starting value of each edge equation; the per-edge quad offsets are added to it to get the values at the 4 samples of the first 2x2 quad
91 /// @param pRastEdges - per-edge data; vQuadOffsets, stepQuadX and stepQuadY are read for each of the NumEdges entries
92 /// @return 64-bit coverage mask, 4 bits per 2x2 quad; a bit is set when the sign bit is set in that lane for every edge that was tested
/// @tparam NumEdges - number of edge equations held in startEdges / pRastEdges
/// @tparam EdgeMaskT - integral-constant type passed to UnrollerLMask in the unrolled 8x8 path;
///         presumably only edges whose bit is set in EdgeMaskT::value are visited - confirm against UnrollerLMask
93 template<uint32_t NumEdges, typename EdgeMaskT>
94 INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges)
95 {
96 uint64_t coverageMask = 0;
97
98 __m256d vEdges[NumEdges];
99 __m256d vStepX[NumEdges];
100 __m256d vStepY[NumEdges];
101
102 for (uint32_t e = 0; e < NumEdges; ++e)
103 {
104 // Step to the pixel sample locations of the 1st quad
105 vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);
106
107 // compute step to next quad (mul by 2 in x and y direction)
108 vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
109 vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
110 }
111
112 // fast unrolled version for 8x8 tile
113 #if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
114 int edgeMask[NumEdges];
115 uint64_t mask;
116
// Per-edge operations handed to UnrollerLMask; each captures the local arrays by reference.
117 auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);};
118 auto update_lambda = [&](int e){mask &= edgeMask[e];};
119 auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);};
120 auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);};
121 auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);};
122
// NOTE(review): EVAL, UPDATE_MASK, INCX, INCY and DECX are not #undef'd in this
// function, so they stay defined for whatever follows in the translation unit.
123 // evaluate which pixels in the quad are covered
124 #define EVAL \
125 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
126
127 // update coverage mask
128 // if edge 0 is degenerate and will be skipped, init the mask to all 4 lanes covered instead of edgeMask[0]
129 #define UPDATE_MASK(bit) \
130 if(std::is_same<EdgeMaskT, E1E2ValidT>::value || std::is_same<EdgeMaskT, NoEdgesValidT>::value){\
131 mask = 0xf;\
132 }\
133 else{\
134 mask = edgeMask[0]; \
135 }\
136 UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
137 coverageMask |= (mask << bit);
138
139 // step in the +x direction to the next quad
140 #define INCX \
141 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
142
143 // step in the +y direction to the next quad
144 #define INCY \
145 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
146
147 // step in the -x direction to the next quad
148 #define DECX \
149 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
150
151 // sweep 2x2 quad back and forth through the raster tile,
152 // computing coverage masks for the entire tile
153
154 // raster tile
155 // 0 1 2 3 4 5 6 7
156 // x x
157 // x x ------------------>
158 // x x |
159 // <-----------------x x V
160 // ..
161
// The bit offset passed to UPDATE_MASK is 16 * quadRow + 4 * quadColumn, so the
// right-to-left rows (1 and 3) count their offsets downwards.
162 // row 0
163 EVAL;
164 UPDATE_MASK(0);
165 INCX;
166 EVAL;
167 UPDATE_MASK(4);
168 INCX;
169 EVAL;
170 UPDATE_MASK(8);
171 INCX;
172 EVAL;
173 UPDATE_MASK(12);
174 INCY;
175
176 //row 1
177 EVAL;
178 UPDATE_MASK(28);
179 DECX;
180 EVAL;
181 UPDATE_MASK(24);
182 DECX;
183 EVAL;
184 UPDATE_MASK(20);
185 DECX;
186 EVAL;
187 UPDATE_MASK(16);
188 INCY;
189
190 // row 2
191 EVAL;
192 UPDATE_MASK(32);
193 INCX;
194 EVAL;
195 UPDATE_MASK(36);
196 INCX;
197 EVAL;
198 UPDATE_MASK(40);
199 INCX;
200 EVAL;
201 UPDATE_MASK(44);
202 INCY;
203
204 // row 3
205 EVAL;
206 UPDATE_MASK(60);
207 DECX;
208 EVAL;
209 UPDATE_MASK(56);
210 DECX;
211 EVAL;
212 UPDATE_MASK(52);
213 DECX;
214 EVAL;
215 UPDATE_MASK(48);
216 #else
// Generic path for other tile sizes: walks the quads row by row, left to right,
// restarting each row from the saved start-of-row edge values.
// NOTE(review): unlike the 8x8 path this ANDs all NumEdges edges and never consults
// EdgeMaskT - confirm that is intended for degenerate edges.
// NOTE(review): bit grows by 4 per quad, so it exceeds 63 once a tile holds more
// than 16 quads - confirm supported tile sizes.
217 uint32_t bit = 0;
218 for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y)
219 {
220 __m256d vStartOfRowEdge[NumEdges];
221 for (uint32_t e = 0; e < NumEdges; ++e)
222 {
223 vStartOfRowEdge[e] = vEdges[e];
224 }
225
226 for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x)
227 {
228 int edgeMask[NumEdges];
229 for (uint32_t e = 0; e < NumEdges; ++e)
230 {
231 edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
232 }
233
234 uint64_t mask = edgeMask[0];
235 for (uint32_t e = 1; e < NumEdges; ++e)
236 {
237 mask &= edgeMask[e];
238 }
239 coverageMask |= (mask << bit);
240
241 // step to the next quad in the x direction
242 for (uint32_t e = 0; e < NumEdges; ++e)
243 {
244 vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
245 }
246 bit+=4;
247 }
248
249 // step to the next row
250 for (uint32_t e = 0; e < NumEdges; ++e)
251 {
252 vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
253 }
254 }
255 #endif
256 return coverageMask;
257
258 }
259 // Top left rule:
260 // Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
261 // Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge
262 // Top left: a sample is in if it is a top or left edge.
263 // Out: !(horizontal && above) = !horizontal && below
264 // Out: !horizontal && left = !(!horizontal && left) = horizontal and right
265 INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge)
266 {
267 // if vA < 0, vC--
268 // if vA == 0 && vB < 0, vC--
269
270 __m256d vEdgeOut = vEdge;
271 __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));
272
273 // if vA < 0 (line is not horizontal and below)
274 int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));
275
276 // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
277 __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
278 int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
279 msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));
280
281 // if either of these are true and we're on the line (edge == 0), bump it outside the line
282 vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
283 }
284
//////////////////////////////////////////////////////////////////////////
/// @brief Compile-time difference, in fractional bits, between the result
///        of the manh calculation (rasterizer precision + conservative
///        precision) and the edge precision.
/// @tparam RT: rasterizer traits; must provide PrecisionT, ConservativePrecisionT
///         and EdgePrecisionT, each exposing BitsT::value
/// @return (precision bits + conservative precision bits) - edge precision bits
template <typename RT>
constexpr int64_t ManhToEdgePrecisionAdjust()
{
    using PrecisionBitsT    = typename RT::PrecisionT::BitsT;
    using ConservativeBitsT = typename RT::ConservativePrecisionT::BitsT;
    using EdgeBitsT         = typename RT::EdgePrecisionT::BitsT;

    // the manh product must carry at least as many fractional bits as the edge
    static_assert(PrecisionBitsT::value + ConservativeBitsT::value >= EdgeBitsT::value,
                  "Inadequate precision of result of manh calculation ");

    return ((PrecisionBitsT::value + ConservativeBitsT::value) - EdgeBitsT::value);
}
295
296 //////////////////////////////////////////////////////////////////////////
297 /// @struct adjustEdgeConservative
298 /// @brief Primary template definition used for partially specializing
299 /// the adjustEdgeConservative function. This struct should never
300 /// be instantiated.
301 /// @tparam RT: rasterizer traits
302 /// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
303 template <typename RT, typename ConservativeEdgeOffsetT>
304 struct adjustEdgeConservative
305 {
306 //////////////////////////////////////////////////////////////////////////
307 /// @brief Performs calculations to adjust each edge of a triangle away
308 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
309 /// direction.
310 ///
311 /// Uncertainty regions arise from fixed point rounding, which
312 /// can snap a vertex +/- by min fixed point value.
313 /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
314 /// This allows the rasterizer to test for coverage only at the pixel center,
315 /// instead of having to test individual pixel corners for conservative coverage
316 INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
317 {
318 // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away
319 // from the pixel center (in the direction of the edge normal A/B)
320
321 // edge = Ax + Bx + C - (manh/e)
322 // manh = manhattan distance = abs(A) + abs(B)
323 // e = absolute rounding error from snapping from float to fixed point precision
324
325 // 'fixed point' multiply (in double to be avx1 friendly)
326 // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
327 __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
328 __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
329 _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));
330
331 static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
332 "Inadequate precision of result of manh calculation ");
333
334 // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision
335 // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right
336 manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5));
337
338 // move the edge away from the pixel center by the required conservative precision + 1/2 pixel
339 // this allows the rasterizer to do a single conservative coverage test to see if the primitive
340 // intersects the pixel at all
341 vEdge = _mm256_sub_pd(vEdge, manh);
342 };
343 };
344
345 //////////////////////////////////////////////////////////////////////////
346 /// @brief adjustEdgeConservative specialization where no edge offset is needed
347 template <typename RT>
348 struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>>
349 {
350 INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {};
351 };
352
353 //////////////////////////////////////////////////////////////////////////
354 /// @brief calculates the distance a degenerate BBox needs to be adjusted
355 /// for conservative rast based on compile time trait values
356 template<typename RT>
357 constexpr int64_t ConservativeScissorOffset()
358 {
359 static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, "Rasterizer precision > conservative precision");
360 // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges
361 typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1> DegenerateEdgeOffsetT;
362 // 1/2 pixel edge offset + conservative offset - degenerateTriangle
363 return RT::ConservativeEdgeOffsetT::value - (DegenerateEdgeOffsetT::value << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
364 }
365
366 //////////////////////////////////////////////////////////////////////////
367 /// @brief Performs calculations to adjust each a vector of evaluated edges out
368 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
369 /// direction.
370 template <typename RT>
371 INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge)
372 {
373 int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
374 int64_t manh = ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> ManhToEdgePrecisionAdjust<RT>();
375 vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh));
376 };
377
378 //////////////////////////////////////////////////////////////////////////
379 /// @brief Performs calculations to adjust each a scalar evaluated edge out
380 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
381 /// direction.
382 template <typename RT, typename OffsetT>
383 INLINE double adjustScalarEdge(const double a, const double b, const double Edge)
384 {
385 int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
386 int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>();
387 return (Edge - manh);
388 };
389
390 //////////////////////////////////////////////////////////////////////////
391 /// @brief Perform any needed adjustments to evaluated triangle edges
392 template <typename RT, typename EdgeOffsetT>
393 struct adjustEdgesFix16
394 {
395 INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
396 {
397 static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
398 "Edge equation expected to be in x.16 fixed point");
399
400 static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled");
401
402 // need to apply any edge offsets before applying the top-left rule
403 adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge);
404
405 adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
406 }
407 };
408
409 //////////////////////////////////////////////////////////////////////////
410 /// @brief Perform top left adjustments to evaluated triangle edges
411 template <typename RT>
412 struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>>
413 {
414 INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
415 {
416 adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
417 }
418 };
419
420 // max(abs(dz/dx), abs(dz,dy)
421 INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
422 {
423 /*
424 // evaluate i,j at (0,0)
425 float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
426 float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
427
428 // evaluate i,j at (1,0)
429 float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
430 float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
431
432 // compute dz/dx
433 float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
434 float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
435 float dzdx = abs(d10 - d00);
436
437 // evaluate i,j at (0,1)
438 float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
439 float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
440
441 float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
442 float dzdy = abs(d01 - d00);
443 */
444
445 // optimized version of above
446 float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0]));
447 float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1]));
448
449 return std::max(dzdx, dzdy);
450 }
451
452 INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
453 {
454 if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
455 {
456 return (1.0f / (1 << 24));
457 }
458 else if (pState->depthFormat == R16_UNORM)
459 {
460 return (1.0f / (1 << 16));
461 }
462 else
463 {
464 SWR_ASSERT(pState->depthFormat == R32_FLOAT);
465
466 // for f32 depth, factor = 2^(exponent(max(abs(z) - 23)
467 float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
468 uint32_t zMaxInt = *(uint32_t*)&zMax;
469 zMaxInt &= 0x7f800000;
470 zMax = *(float*)&zMaxInt;
471
472 return zMax * (1.0f / (1 << 23));
473 }
474 }
475
476 INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
477 {
478 if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0)
479 {
480 return 0.0f;
481 }
482
483 float scale = pState->slopeScaledDepthBias;
484 if (scale != 0.0f)
485 {
486 scale *= ComputeMaxDepthSlope(pTri);
487 }
488
489 float bias = pState->depthBias;
490 if (!pState->depthBiasPreAdjusted)
491 {
492 bias *= ComputeBiasFactor(pState, pTri, z);
493 }
494 bias += scale;
495
496 if (pState->depthBiasClamp > 0.0f)
497 {
498 bias = std::min(bias, pState->depthBiasClamp);
499 }
500 else if (pState->depthBiasClamp < 0.0f)
501 {
502 bias = std::max(bias, pState->depthBiasClamp);
503 }
504
505 return bias;
506 }
507
508 // Prevent DCE by writing coverage mask from rasterizer to volatile
// Only compiled when KNOB_ENABLE_TOSS_POINTS is set.
// NOTE(review): this variable is defined without 'static' and spells the thread-local
// qualifier as __declspec(thread), while perspAttribsTLS below uses the THREAD macro -
// confirm both are intentional.
509 #if KNOB_ENABLE_TOSS_POINTS
510 __declspec(thread) volatile uint64_t gToss;
511 #endif
512
// Sizing constants for the per-thread attribute scratch buffer below.
513 static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
514 // try to avoid _chkstk insertions; make this thread local
// Holds vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib floats.
515 static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib];
516
517 INLINE
518 void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
519 {
520 edge.a = a;
521 edge.b = b;
522
523 // compute constant steps to adjacent quads
524 edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE));
525 edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE));
526
527 // compute constant steps to adjacent raster tiles
528 edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE));
529 edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE));
530
531 // compute quad offsets
532 const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
533 const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);
534
535 __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8);
536 __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8);
537 edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16);
538
539 // compute raster tile offsets
540 const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0);
541 const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0);
542
543 __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8);
544 __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8);
545 edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16);
546 }
547
548 INLINE
549 void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
550 {
551 ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge);
552 }
553
554 //////////////////////////////////////////////////////////////////////////
555 /// @brief Primary template definition used for partially specializing
556 /// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel
557 /// corner to sample position, and test for coverage
558 /// @tparam sampleCount: multisample count
559 template <typename NumSamplesT>
560 INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
561 int32_t &mask0, int32_t &mask1, int32_t &mask2)
562 {
563 __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
564 // evaluate edge equations at the tile multisample bounding box
565 vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
566 vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
567 vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
568 mask0 = _mm256_movemask_pd(vSampleBboxTest0);
569 mask1 = _mm256_movemask_pd(vSampleBboxTest1);
570 mask2 = _mm256_movemask_pd(vSampleBboxTest2);
571 }
572
573 //////////////////////////////////////////////////////////////////////////
574 /// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
575 /// when only rasterizing a single coverage test point
576 template <>
577 INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* vEdgeFix16,
578 int32_t &mask0, int32_t &mask1, int32_t &mask2)
579 {
580 mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
581 mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
582 mask2 = _mm256_movemask_pd(vEdgeFix16[2]);
583 }
584
585 //////////////////////////////////////////////////////////////////////////
586 /// @struct ComputeScissorEdges
587 /// @brief Primary template definition. Allows the function to be generically
588 /// called. When paired with below specializations, will result in an empty
589 /// inlined function if scissor is not enabled
590 /// @tparam RasterScissorEdgesT: is scissor enabled?
591 /// @tparam IsConservativeT: is conservative rast enabled?
592 /// @tparam RT: rasterizer traits
593 template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT>
594 struct ComputeScissorEdges
595 {
596 INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
597 EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){};
598 };
599
600 //////////////////////////////////////////////////////////////////////////
601 /// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
602 /// specialization. Instantiated when conservative rast and scissor are enabled
603 template <typename RT>
604 struct ComputeScissorEdges<std::true_type, std::true_type, RT>
605 {
606 //////////////////////////////////////////////////////////////////////////
607 /// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
608 /// evaluate edge equations and offset them away from pixel center.
609 INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
610 EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
611 {
612 // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
613 SWR_RECT scissor;
614 scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin);
615 scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax);
616 scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin);
617 scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax);
618
619 POS topLeft{scissor.xmin, scissor.ymin};
620 POS bottomLeft{scissor.xmin, scissor.ymax};
621 POS topRight{scissor.xmax, scissor.ymin};
622 POS bottomRight{scissor.xmax, scissor.ymax};
623
624 // construct 4 scissor edges in ccw direction
625 ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
626 ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
627 ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
628 ComputeEdgeData(topRight, topLeft, rastEdges[6]);
629
630 vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
631 vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
632 vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
633 vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
634
635 // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing
636 adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]);
637 adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]);
638 adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]);
639 adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]);
640
641 // Upper left rule for scissor
642 vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
643 vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
644 }
645 };
646
647 //////////////////////////////////////////////////////////////////////////
648 /// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
649 /// specialization. Instantiated when scissor is enabled and conservative rast
650 /// is disabled.
651 template <typename RT>
652 struct ComputeScissorEdges<std::true_type, std::false_type, RT>
653 {
654 //////////////////////////////////////////////////////////////////////////
655 /// @brief Compute scissor edge vectors and evaluate edge equations
656 INLINE ComputeScissorEdges(const SWR_RECT &, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
657 EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
658 {
659 const SWR_RECT &scissor = scissorBBox;
660 POS topLeft{scissor.xmin, scissor.ymin};
661 POS bottomLeft{scissor.xmin, scissor.ymax};
662 POS topRight{scissor.xmax, scissor.ymin};
663 POS bottomRight{scissor.xmax, scissor.ymax};
664
665 // construct 4 scissor edges in ccw direction
666 ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
667 ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
668 ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
669 ComputeEdgeData(topRight, topLeft, rastEdges[6]);
670
671 vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
672 vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
673 vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
674 vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
675
676 // Upper left rule for scissor
677 vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
678 vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
679 }
680 };
681
682 //////////////////////////////////////////////////////////////////////////
683 /// @brief Primary function template for TrivialRejectTest. Should
684 /// never be called, but TemplateUnroller instantiates a few unused values,
685 /// so it calls a runtime assert instead of a static_assert.
686 template <typename ValidEdgeMaskT>
687 INLINE bool TrivialRejectTest(const int, const int, const int)
688 {
689 SWR_INVALID("Primary templated function should never be called");
690 return false;
691 };
692
693 //////////////////////////////////////////////////////////////////////////
694 /// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
695 /// and edge 1 for trivial coverage reject
696 template <>
697 INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int)
698 {
699 return (!(mask0 && mask1)) ? true : false;
700 };
701
702 //////////////////////////////////////////////////////////////////////////
703 /// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
704 /// and edge 2 for trivial coverage reject
705 template <>
706 INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2)
707 {
708 return (!(mask0 && mask2)) ? true : false;
709 };
710
711 //////////////////////////////////////////////////////////////////////////
712 /// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
713 /// and edge 2 for trivial coverage reject
714 template <>
715 INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2)
716 {
717 return (!(mask1 && mask2)) ? true : false;
718 };
719
720 //////////////////////////////////////////////////////////////////////////
721 /// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
722 /// primitive edges for trivial coverage reject
723 template <>
724 INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
725 {
726 return (!(mask0 && mask1 && mask2)) ? true : false;;
727 };
728
729 //////////////////////////////////////////////////////////////////////////
730 /// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
731 /// point, so return false and rasterize against conservative BBox
732 template <>
733 INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int)
734 {
735 return false;
736 };
737
738 //////////////////////////////////////////////////////////////////////////
739 /// @brief Primary function template for TrivialAcceptTest. Always returns
740 /// false, since it will only be called for degenerate tris, and as such
741 /// will never cover the entire raster tile
742 template <typename ScissorEnableT>
743 INLINE bool TrivialAcceptTest(const int, const int, const int)
744 {
745 return false;
746 };
747
748 //////////////////////////////////////////////////////////////////////////
749 /// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all
750 /// edge masks for a fully covered raster tile
751 template <>
752 INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2)
753 {
754 return ((mask0 & mask1 & mask2) == 0xf);
755 };
756
757 //////////////////////////////////////////////////////////////////////////
758 /// @brief Primary function template for GenerateSVInnerCoverage. Results
759 /// in an empty function call if SVInnerCoverage isn't requested
760 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
761 struct GenerateSVInnerCoverage
762 {
763 INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*, uint64_t &){};
764 };
765
766 //////////////////////////////////////////////////////////////////////////
767 /// @brief Specialization of GenerateSVInnerCoverage where all edges
768 /// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated
769 /// edge values from OuterConservative to InnerConservative and rasterizes.
770 template <typename RT>
771 struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
772 {
773 INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, uint32_t workerId, EDGE* pRastEdges, double* pStartQuadEdges, uint64_t &innerCoverageMask)
774 {
775 SWR_CONTEXT *pContext = pDC->pContext;
776
777 double startQuadEdgesAdj[RT::NumEdgesT::value];
778 for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
779 {
780 startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
781 }
782
783 // not trivial accept or reject, must rasterize full tile
784 AR_BEGIN(BERasterizePartial, pDC->drawId);
785 innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges);
786 AR_END(BERasterizePartial, 0);
787 }
788 };
789
790 //////////////////////////////////////////////////////////////////////////
791 /// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
792 /// in an empty function call if SVInnerCoverage isn't requested
793 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
794 struct UpdateEdgeMasksInnerConservative
795 {
796 INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*,
797 const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){};
798 };
799
800 //////////////////////////////////////////////////////////////////////////
801 /// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
802 /// are non-degenerate and SVInnerCoverage is requested. Offsets the edges
803 /// evaluated at raster tile corners to inner conservative position and
804 /// updates edge masks
805 template <typename RT>
806 struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
807 {
808 INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
809 const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2)
810 {
811 __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]};
812
813 // instead of keeping 2 copies of evaluated edges around, just compensate for the outer
814 // conservative evaluated edge when adjusting the edge in for inner conservative tests
815 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[0]);
816 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[1]);
817 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[2]);
818
819 UpdateEdgeMasks<typename RT::NumCoverageSamplesT>(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
820 }
821 };
822
823 //////////////////////////////////////////////////////////////////////////
824 /// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage
825 /// is requested but at least one edge is degenerate. Since a degenerate triangle cannot
826 /// cover an entire raster tile, set mask0 to 0 to force it down the
827 /// rastierizePartialTile path
828 template <typename RT, typename ValidEdgeMaskT>
829 struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
830 {
831 INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*,
832 const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &)
833 {
834 // set one mask to zero to force the triangle down the rastierizePartialTile path
835 mask0 = 0;
836 }
837 };
838
//////////////////////////////////////////////////////////////////////////
/// @brief Rasterizes one binned triangle inside a single macrotile.
///        Performs triangle setup (edge equations, barycentric coefs,
///        perspective-corrected attributes, depth), clamps the triangle's
///        bounding box against the scissor rect and the macrotile, then
///        walks every raster tile in that region. For each tile it builds
///        per-sample coverage masks (trivial reject / trivial accept /
///        partial rasterization) and calls the pixel backend when any
///        sample is covered.
/// @tparam RT - rasterizer traits; this function reads PrecisionT,
///         ValidEdgeMaskT, NumEdgesT, NumCoverageSamplesT, IsConservativeT,
///         InputCoverageT, RasterizeScissorEdgesT, ConservativeEdgeOffsetT
///         and MT::numSamples from it.
/// @param pDC - draw context; source of the API state and backend functions
/// @param workerId - worker id, forwarded to the backend and to
///        GenerateSVInnerCoverage
/// @param macroTile - macrotile id, decoded with MacroTileMgr::getTileIndices
/// @param pDesc - work descriptor; reinterpreted as TRIANGLE_WORK_DESC
839 template <typename RT>
840 void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
841 {
// NOTE(review): pContext is never referenced by name in this function; presumably the
// AR_* / RDTSC_* macros expand to code that uses it - confirm before removing
842 SWR_CONTEXT *pContext = pDC->pContext;
843 const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
844 #if KNOB_ENABLE_TOSS_POINTS
845 if (KNOB_TOSS_BIN_TRIS)
846 {
847 return;
848 }
849 #endif
850 AR_BEGIN(BERasterizeTriangle, pDC->drawId);
851 AR_BEGIN(BETriangleSetup, pDC->drawId);
852
853 const API_STATE &state = GetApiState(pDC);
854 const SWR_RASTSTATE &rastState = state.rastState;
855 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
856
857 OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
858 triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
859
860 __m128 vX, vY, vZ, vRecipW;
861
862 // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
863 // eg: vX = [x0 x1 x2 dc]
864 vX = _mm_load_ps(workDesc.pTriBuffer);
865 vY = _mm_load_ps(workDesc.pTriBuffer + 4);
866 vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
867 vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
868
869 // convert to fixed point
870 static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, "Rasterizer expects 16.8 fixed point precision");
871 __m128i vXi = fpToFixedPoint(vX);
872 __m128i vYi = fpToFixedPoint(vY);
873
874 // quantize floating point position to fixed point precision
875 // to prevent attribute creep around the triangle vertices
876 vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
877 vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
878
879 // triangle setup - A and B edge equation coefs
880 __m128 vA, vB;
881 triangleSetupAB(vX, vY, vA, vB);
882
883 __m128i vAi, vBi;
884 triangleSetupABInt(vXi, vYi, vAi, vBi);
885
886 // determinant
887 float det = calcDeterminantInt(vAi, vBi);
888
889 // Verts in Pixel Coordinate Space at this point
890 // Det > 0 = CW winding order
891 // Convert CW triangles to CCW
892 if (det > 0.0)
893 {
894 vA = _mm_mul_ps(vA, _mm_set1_ps(-1));
895 vB = _mm_mul_ps(vB, _mm_set1_ps(-1));
896 vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
897 vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
898 det = -det;
899 }
900
901 __m128 vC;
902 // Finish triangle setup - C edge coef
903 triangleSetupC(vX, vY, vA, vB, vC);
904
905 if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
906 {
907 // If we have degenerate edge(s) to rasterize, set I and J coefs
908 // to 0 for constant interpolation of attributes
909 triDesc.I[0] = 0.0f;
910 triDesc.I[1] = 0.0f;
911 triDesc.I[2] = 0.0f;
912 triDesc.J[0] = 0.0f;
913 triDesc.J[1] = 0.0f;
914 triDesc.J[2] = 0.0f;
915
916 // Degenerate triangles have no area
917 triDesc.recipDet = 0.0f;
918 }
919 else
920 {
921 // only extract coefs for 2 of the barycentrics; the 3rd can be
922 // determined from the barycentric equation:
923 // i + j + k = 1 <=> k = 1 - j - i
924 _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
925 _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
926 _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
927 _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
928 _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
929 _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
930
931 // compute recipDet, used to calculate barycentric i and j in the backend
932 triDesc.recipDet = 1.0f/det;
933 }
934
// OneOverW is stored relative to vertex 2: [w0 - w2, w1 - w2, w2]
935 OSALIGNSIMD(float) oneOverW[4];
936 _mm_store_ps(oneOverW, vRecipW);
937 triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
938 triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
939 triDesc.OneOverW[2] = oneOverW[2];
940
941 // calculate perspective correct coefs per vertex attrib
942 float* pPerspAttribs = perspAttribsTLS;
943 float* pAttribs = workDesc.pAttribs;
944 triDesc.pPerspAttribs = pPerspAttribs;
945 triDesc.pAttribs = pAttribs;
946 float *pRecipW = workDesc.pTriBuffer + 12;
947 triDesc.pRecipW = pRecipW;
948 __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
949 __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1);
950 __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1);
// each attribute occupies 12 floats (3 vertices x 4 components) in both the
// input and the perspective-corrected output buffer
951 for(uint32_t i = 0; i < workDesc.numAttribs; i++)
952 {
953 __m128 attribA = _mm_load_ps(pAttribs);
954 __m128 attribB = _mm_load_ps(pAttribs+=4);
955 __m128 attribC = _mm_load_ps(pAttribs+=4);
956 pAttribs+=4;
957
958 attribA = _mm_mul_ps(attribA, vOneOverWV0);
959 attribB = _mm_mul_ps(attribB, vOneOverWV1);
960 attribC = _mm_mul_ps(attribC, vOneOverWV2);
961
962 _mm_store_ps(pPerspAttribs, attribA);
963 _mm_store_ps(pPerspAttribs+=4, attribB);
964 _mm_store_ps(pPerspAttribs+=4, attribC);
965 pPerspAttribs+=4;
966 }
967
968 // compute bary Z
969 // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
970 OSALIGNSIMD(float) a[4];
971 _mm_store_ps(a, vZ);
972 triDesc.Z[0] = a[0] - a[2];
973 triDesc.Z[1] = a[1] - a[2];
974 triDesc.Z[2] = a[2];
975
976 // add depth bias
977 triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
978
979 // Calc bounding box of triangle
980 OSALIGNSIMD(SWR_RECT) bbox;
981 calcBoundingBoxInt(vXi, vYi, bbox);
982
983 const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
984
985 if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
986 {
987 // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
988 bbox.xmin--; bbox.xmax++; bbox.ymin--; bbox.ymax++;
989 SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
990 "Conservative rast degenerate handling requires a valid scissor rect");
991 }
992
993 // Intersect with scissor/viewport
994 OSALIGNSIMD(SWR_RECT) intersect;
995 intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
996 intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
997 intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
998 intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);
999
1000 triDesc.triFlags = workDesc.triFlags;
1001
1002 // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
1003 uint32_t macroX, macroY;
1004 MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
1005 int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
1006 int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
1007 int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
1008 int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
1009
1010 intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
1011 intersect.ymin = std::max(intersect.ymin, macroBoxTop);
1012 intersect.xmax = std::min(intersect.xmax, macroBoxRight);
1013 intersect.ymax = std::min(intersect.ymax, macroBoxBottom);
1014
1015 SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && intersect.ymax >= 0);
1016
1017 AR_END(BETriangleSetup, 0);
1018
1019 // update triangle desc
1020 uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1021 uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1022 uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1023 uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1024 uint32_t numTilesX = maxTileX - minTileX + 1;
1025 uint32_t numTilesY = maxTileY - minTileY + 1;
1026
// nothing to rasterize: close the outer BERasterizeTriangle bucket before leaving
1027 if (numTilesX == 0 || numTilesY == 0)
1028 {
1029 RDTSC_EVENT(BEEmptyTriangle, 1, 0);
1030 AR_END(BERasterizeTriangle, 1);
1031 return;
1032 }
1033
1034 AR_BEGIN(BEStepSetup, pDC->drawId);
1035
1036 // Step to pixel center of top-left pixel of the triangle bbox
1037 // Align intersect bbox (top/left) to raster tile's (top/left).
1038 int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
1039 int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));
1040
1041 // convenience typedef
1042 typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT;
1043
1044 // single sample rasterization evaluates edges at pixel center,
1045 // multisample evaluates edges UL pixel corner and steps to each sample position
1046 if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
1047 {
1048 // Add 0.5, in fixed point, to offset to pixel center
1049 x += (FIXED_POINT_SCALE / 2);
1050 y += (FIXED_POINT_SCALE / 2);
1051 }
1052
1053 __m128i vTopLeftX = _mm_set1_epi32(x);
1054 __m128i vTopLeftY = _mm_set1_epi32(y);
1055
1056 // evaluate edge equations at top-left pixel using 64bit math
1057 //
1058 // line = Ax + By + C
1059 // solving for C:
1060 // C = -Ax - By
1061 // we know x0 and y0 are on the line; plug them in:
1062 // C = -Ax0 - By0
1063 // plug C back into line equation:
1064 // line = Ax + By - Ax0 - By0
1065 // line = A(x - x0) + B(y - y0)
1066 // dX = (x-x0), dY = (y-y0)
1067 // so all this simplifies to
1068 // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
1069
1070 __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
1071 __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
1072
1073 // evaluate A(dx) and B(dY) for all points
1074 __m256d vAipd = _mm256_cvtepi32_pd(vAi);
1075 __m256d vBipd = _mm256_cvtepi32_pd(vBi);
1076 __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
1077 __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
1078
1079 __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
1080 __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
1081 __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
1082
1083 // apply any edge adjustments(top-left, crast, etc)
1084 adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);
1085
1086 // broadcast respective edge results to all lanes
1087 double* pEdge = (double*)&vEdge;
// NOTE(review): sized 7 while only entries 0..2 are filled here; presumably the rest are
// reserved for the scissor edges written by ComputeScissorEdges - confirm against RT::NumEdgesT
1088 __m256d vEdgeFix16[7];
1089 vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
1090 vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
1091 vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);
1092
1093 OSALIGNSIMD(int32_t) aAi[4], aBi[4];
1094 _mm_store_si128((__m128i*)aAi, vAi);
1095 _mm_store_si128((__m128i*)aBi, vBi);
1096 EDGE rastEdges[RT::NumEdgesT::value];
1097
1098 // Compute and store triangle edge data
1099 ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
1100 ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
1101 ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);
1102
1103 // Compute and store triangle edge data if scissor needs to be rasterized
1104 ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
1105 (bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
1106
1107 // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
1108 // used for testing if entire raster tile is inside a triangle
1109 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1110 {
1111 vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
1112 }
1113
1114 // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
1115 // step sample positions to the raster tile bbox of multisample points
1116 // min(xSamples),min(ySamples) ------ max(xSamples),min(ySamples)
1117 // | |
1118 // | |
1119 // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples)
// NOTE(review): vEdgeTileBbox is only written when NumCoverageSamplesT::value > 1 but is
// passed to UpdateEdgeMasks unconditionally below - confirm the single-sample path never reads it
1120 __m256d vEdgeTileBbox[3];
1121 if (NumCoverageSamplesT::value > 1)
1122 {
1123 const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
1124 const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX();
1125 const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY();
1126
1127 __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
1128 __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
1129
1130 // step edge equation tests from Tile
1131 // used for testing if entire raster tile is inside a triangle
1132 for (uint32_t e = 0; e < 3; ++e)
1133 {
1134 __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
1135 __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
1136 vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
1137
1138 // adjust for msaa tile bbox edges outward for conservative rast, if enabled
1139 adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdgeTileBbox[e]);
1140 }
1141 }
1142
1143 AR_END(BEStepSetup, 0);
1144
1145 uint32_t tY = minTileY;
1146 uint32_t tX = minTileX;
1147 uint32_t maxY = maxTileY;
1148 uint32_t maxX = maxTileX;
1149
// renderBuffers follows the tile currently being rasterized; currentRenderBufferRow keeps
// the buffers of the first tile in the current row and is handed to StepRasterTileY
1150 RenderOutputBuffers renderBuffers, currentRenderBufferRow;
1151 GetRenderHotTiles<RT::MT::numSamples>(pDC, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
1152 currentRenderBufferRow = renderBuffers;
1153
1154 // rasterize and generate coverage masks per sample
1155 for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
1156 {
// remember the edge values at the start of the row so the Y step can restart from them
1157 __m256d vStartOfRowEdge[RT::NumEdgesT::value];
1158 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1159 {
1160 vStartOfRowEdge[e] = vEdgeFix16[e];
1161 }
1162
1163 for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
1164 {
1165 triDesc.anyCoveredSamples = 0;
1166
1167 // is the corner of the edge outside of the raster tile? (vEdge < 0)
1168 int mask0, mask1, mask2;
1169 UpdateEdgeMasks<NumCoverageSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);
1170
1171 for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++)
1172 {
1173 // trivial reject, at least one edge has all 4 corners of raster tile outside
1174 bool trivialReject = TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);
1175
1176 if (!trivialReject)
1177 {
1178 // trivial accept mask
1179 triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
1180
1181 // Update the raster tile edge masks based on inner conservative edge offsets, if enabled
// NOTE(review): the masks are passed by reference, so any update made here is also
// seen by the reject test of later sampleNum iterations
1182 UpdateEdgeMasksInnerConservative<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>
1183 (vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);
1184
1185 // @todo Make this a bit smarter to allow use of trivial accept when:
1186 // 1) scissor/vp intersection rect is raster tile aligned
1187 // 2) raster tile is entirely within scissor/vp intersection rect
1188 if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2))
1189 {
1190 // trivial accept, all 4 corners of all 3 edges are negative
1191 // i.e. raster tile completely inside triangle
1192 triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
1193 if(std::is_same<typename RT::InputCoverageT, InnerConservativeCoverageT>::value)
1194 {
1195 triDesc.innerCoverageMask = 0xffffffffffffffffULL;
1196 }
1197 RDTSC_EVENT(BETrivialAccept, 1, 0);
1198 }
1199 else
1200 {
1201 __m256d vEdgeAtSample[RT::NumEdgesT::value];
1202 if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
1203 {
1204 // should get optimized out for single sample case (global value numbering or copy propagation)
1205 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1206 {
1207 vEdgeAtSample[e] = vEdgeFix16[e];
1208 }
1209 }
1210 else
1211 {
1212 const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
1213 __m128i vSampleOffsetXh = samplePos.vXi(sampleNum);
1214 __m128i vSampleOffsetYh = samplePos.vYi(sampleNum);
1215 __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
1216 __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
1217
1218 // step edge equation tests from UL tile corner to pixel sample position
1219 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1220 {
1221 __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
1222 __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
1223 vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
1224 vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
1225 }
1226 }
1227
// extract lane 0 of each edge vector as the scalar start value for the partial rasterizer
1228 double startQuadEdges[RT::NumEdgesT::value];
1229 const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
1230 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1231 {
1232 _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
1233 }
1234
1235 // not trivial accept or reject, must rasterize full tile
1236 AR_BEGIN(BERasterizePartial, pDC->drawId);
1237 triDesc.coverageMask[sampleNum] = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdges, rastEdges);
1238 AR_END(BERasterizePartial, 0);
1239
1240 triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];
1241
1242 // Output SV InnerCoverage, if needed
1243 GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
1244 }
1245 }
1246 else
1247 {
1248 // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything
1249 if(NumCoverageSamplesT::value > 1)
1250 {
1251 triDesc.coverageMask[sampleNum] = 0;
1252 }
1253 RDTSC_EVENT(BETrivialReject, 1, 0);
1254 }
1255 }
1256
1257 #if KNOB_ENABLE_TOSS_POINTS
1258 if(KNOB_TOSS_RS)
1259 {
1260 gToss = triDesc.coverageMask[0];
1261 }
1262 else
1263 #endif
1264 if(triDesc.anyCoveredSamples)
1265 {
1266 // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
1267 // copy conservative coverage result to all samples
1268 if(RT::IsConservativeT::value)
1269 {
1270 auto copyCoverage = [&](int sample){triDesc.coverageMask[sample] = triDesc.coverageMask[0]; };
1271 UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
1272 }
1273
1274 AR_BEGIN(BEPixelBackend, pDC->drawId);
1275 backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
1276 AR_END(BEPixelBackend, 0);
1277 }
1278
1279 // step to the next tile in X
1280 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1281 {
1282 vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
1283 }
1284 StepRasterTileX<RT>(state.colorHottileEnable, renderBuffers);
1285 }
1286
1287 // step to the next tile in Y
1288 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1289 {
1290 vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
1291 }
1292 StepRasterTileY<RT>(state.colorHottileEnable, renderBuffers, currentRenderBufferRow);
1293 }
1294
1295 AR_END(BERasterizeTriangle, 1);
1296 }
1297
1298 // Get pointers to hot tile memory for color RT, depth, stencil
1299 template <uint32_t numSamples>
1300 void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex)
1301 {
1302 const API_STATE& state = GetApiState(pDC);
1303 SWR_CONTEXT *pContext = pDC->pContext;
1304
1305 uint32_t mx, my;
1306 MacroTileMgr::getTileIndices(macroID, mx, my);
1307 tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
1308 tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;
1309
1310 // compute tile offset for active hottile buffers
1311 const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
1312 uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1313 offset*=numSamples;
1314
1315 unsigned long rtSlot = 0;
1316 uint32_t colorHottileEnableMask = state.colorHottileEnable;
1317 while(_BitScanForward(&rtSlot, colorHottileEnableMask))
1318 {
1319 HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true,
1320 numSamples, renderTargetArrayIndex);
1321 pColor->state = HOTTILE_DIRTY;
1322 renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
1323
1324 colorHottileEnableMask &= ~(1 << rtSlot);
1325 }
1326 if(state.depthHottileEnable)
1327 {
1328 const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
1329 uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1330 offset*=numSamples;
1331 HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true,
1332 numSamples, renderTargetArrayIndex);
1333 pDepth->state = HOTTILE_DIRTY;
1334 SWR_ASSERT(pDepth->pBuffer != nullptr);
1335 renderBuffers.pDepth = pDepth->pBuffer + offset;
1336 }
1337 if(state.stencilHottileEnable)
1338 {
1339 const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
1340 uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1341 offset*=numSamples;
1342 HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true,
1343 numSamples, renderTargetArrayIndex);
1344 pStencil->state = HOTTILE_DIRTY;
1345 SWR_ASSERT(pStencil->pBuffer != nullptr);
1346 renderBuffers.pStencil = pStencil->pBuffer + offset;
1347 }
1348 }
1349
1350 template <typename RT>
1351 INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers)
1352 {
1353 DWORD rt = 0;
1354 while (_BitScanForward(&rt, colorHotTileMask))
1355 {
1356 colorHotTileMask &= ~(1 << rt);
1357 buffers.pColor[rt] += RT::colorRasterTileStep;
1358 }
1359
1360 buffers.pDepth += RT::depthRasterTileStep;
1361 buffers.pStencil += RT::stencilRasterTileStep;
1362 }
1363
1364 template <typename RT>
1365 INLINE void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow)
1366 {
1367 DWORD rt = 0;
1368 while (_BitScanForward(&rt, colorHotTileMask))
1369 {
1370 colorHotTileMask &= ~(1 << rt);
1371 startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
1372 buffers.pColor[rt] = startBufferRow.pColor[rt];
1373 }
1374 startBufferRow.pDepth += RT::depthRasterTileRowStep;
1375 buffers.pDepth = startBufferRow.pDepth;
1376
1377 startBufferRow.pStencil += RT::stencilRasterTileRowStep;
1378 buffers.pStencil = startBufferRow.pStencil;
1379 }
1380