swr/rast: Removed unused variable
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / rasterizer_impl.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file rasterizer_impl.h
24 *
25 * @brief Implementation for the rasterizer.
26 *
27 ******************************************************************************/
28
#include <algorithm>
#include <cstring>
#include <vector>

#include "rasterizer.h"
#include "rdtsc_core.h"
#include "backend.h"
#include "utils.h"
#include "frontend.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
39
// Table of rasterizer work functions. Only declared here; the definition is
// not visible in this part of the file.
// NOTE(review): the index order presumably follows the dimension names
// (multisample type, two 2-entry selectors, input coverage, valid tri edge
// state, one more 2-entry selector) -- confirm against the code that fills it.
40 extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT][STATE_VALID_TRI_EDGE_COUNT][2];
41
// Forward declarations; the definitions are not visible in this part of the file.
// GetRenderHotTiles takes its output through the renderBuffers reference.
42 template <uint32_t numSamples = 1>
43 void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex);
// NOTE(review): by their names these presumably advance the render output
// buffer pointers by one raster tile in x / y -- confirm at the definitions.
44 template <typename RT>
45 void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers);
46 template <typename RT>
47 void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow);
48
// Builds a 4-element initializer from a 4-bit mask written most significant
// bit first: element k is the negated bit k, i.e. 0 for a clear bit and -1
// for a set bit (so only set bits produce an element with its sign bit set).
#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}

// Lookup table indexed by a 4-bit mask (0..15); entry m is the vector
// expansion of m, suitable as the selector operand of _mm256_blendv_pd.
static const __m256d gMaskToVecpd[] =
{
    // 0x0 - 0x3
    MASKTOVEC(0, 0, 0, 0), MASKTOVEC(0, 0, 0, 1), MASKTOVEC(0, 0, 1, 0), MASKTOVEC(0, 0, 1, 1),
    // 0x4 - 0x7
    MASKTOVEC(0, 1, 0, 0), MASKTOVEC(0, 1, 0, 1), MASKTOVEC(0, 1, 1, 0), MASKTOVEC(0, 1, 1, 1),
    // 0x8 - 0xb
    MASKTOVEC(1, 0, 0, 0), MASKTOVEC(1, 0, 0, 1), MASKTOVEC(1, 0, 1, 0), MASKTOVEC(1, 0, 1, 1),
    // 0xc - 0xf
    MASKTOVEC(1, 1, 0, 0), MASKTOVEC(1, 1, 0, 1), MASKTOVEC(1, 1, 1, 0), MASKTOVEC(1, 1, 1, 1),
};
69
/// @brief Integer 2D position.
struct POS
{
    int32_t x;
    int32_t y;
};
74
/// @brief Per-edge data used while stepping an edge equation across a tile.
struct EDGE
{
    // edge coefficients, in fix8
    double a;
    double b;

    // steps to the adjacent quad, in fix16
    double stepQuadX;           // horizontal neighbour
    double stepQuadY;           // vertical neighbour

    // steps to the adjacent raster tile, in fix16
    double stepRasterTileX;     // horizontal neighbour
    double stepRasterTileY;     // vertical neighbour

    __m256d vQuadOffsets;       // offsets for the 4 samples of a quad
    __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
};
86
87 //////////////////////////////////////////////////////////////////////////
88 /// @brief rasterize a raster tile partially covered by the triangle
89 /// @param pDC - draw context; not referenced by the code visible in this function
90 /// @param startEdges - one starting edge value per edge; each is broadcast and offset by that edge's vQuadOffsets
91 /// @param pRastEdges - per-edge data supplying vQuadOffsets, stepQuadX and stepQuadY
92 /// @return 64-bit coverage mask; each quad visited contributes the 4-bit AND of the per-edge sign masks
/// @tparam NumEdges - number of entries in startEdges / pRastEdges that are stepped
/// @tparam EdgeMaskT - type whose ::value is forwarded to UnrollerLMask in the 8x8 path
///         (presumably selects which edges take part -- confirm against UnrollerLMask)
93 template<uint32_t NumEdges, typename EdgeMaskT>
94 INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges)
95 {
96 uint64_t coverageMask = 0;
97
98 __m256d vEdges[NumEdges];
99 __m256d vStepX[NumEdges];
100 __m256d vStepY[NumEdges];
101
102 for (uint32_t e = 0; e < NumEdges; ++e)
103 {
104 // Step to the pixel sample locations of the 1st quad
105 vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);
106
107 // compute step to next quad (mul by 2 in x and y direction)
108 vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
109 vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
110 }
111
112 // fast unrolled version for 8x8 tile
113 #if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
114 int edgeMask[NumEdges];
115 uint64_t mask;
116
// Per-edge operations handed to UnrollerLMask: read the sign mask of an edge,
// AND it into the running quad mask, and step the edge by +x / +y / -x.
117 auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);};
118 auto update_lambda = [&](int e){mask &= edgeMask[e];};
119 auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);};
120 auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);};
121 auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);};
122
// NOTE(review): EVAL, UPDATE_MASK, INCX, INCY and DECX are not #undef'd in the
// code visible here, so they remain defined after this function.
123 // evaluate which pixels in the quad are covered
124 #define EVAL \
125 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
126
127 // update coverage mask
128 // if edge 0 is degenerate and will be skipped; init the mask
// (for E1E2ValidT and NoEdgesValidT the mask starts as 0xf instead of edgeMask[0])
129 #define UPDATE_MASK(bit) \
130 if(std::is_same<EdgeMaskT, E1E2ValidT>::value || std::is_same<EdgeMaskT, NoEdgesValidT>::value){\
131 mask = 0xf;\
132 }\
133 else{\
134 mask = edgeMask[0]; \
135 }\
136 UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
137 coverageMask |= (mask << bit);
138
139 // step in the +x direction to the next quad
140 #define INCX \
141 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
142
143 // step in the +y direction to the next quad
144 #define INCY \
145 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
146
147 // step in the -x direction to the next quad
148 #define DECX \
149 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
150
151 // sweep 2x2 quad back and forth through the raster tile,
152 // computing coverage masks for the entire tile
153
154 // raster tile
155 // 0 1 2 3 4 5 6 7
156 // x x
157 // x x ------------------>
158 // x x |
159 // <-----------------x x V
160 // ..
161
// Each quad owns 4 bits of the result. Rows 0 and 2 are swept in +x with
// ascending bit offsets; rows 1 and 3 are swept in -x, so their bit offsets
// are visited in descending order.
162 // row 0
163 EVAL;
164 UPDATE_MASK(0);
165 INCX;
166 EVAL;
167 UPDATE_MASK(4);
168 INCX;
169 EVAL;
170 UPDATE_MASK(8);
171 INCX;
172 EVAL;
173 UPDATE_MASK(12);
174 INCY;
175
176 //row 1
177 EVAL;
178 UPDATE_MASK(28);
179 DECX;
180 EVAL;
181 UPDATE_MASK(24);
182 DECX;
183 EVAL;
184 UPDATE_MASK(20);
185 DECX;
186 EVAL;
187 UPDATE_MASK(16);
188 INCY;
189
190 // row 2
191 EVAL;
192 UPDATE_MASK(32);
193 INCX;
194 EVAL;
195 UPDATE_MASK(36);
196 INCX;
197 EVAL;
198 UPDATE_MASK(40);
199 INCX;
200 EVAL;
201 UPDATE_MASK(44);
202 INCY;
203
204 // row 3
205 EVAL;
206 UPDATE_MASK(60);
207 DECX;
208 EVAL;
209 UPDATE_MASK(56);
210 DECX;
211 EVAL;
212 UPDATE_MASK(52);
213 DECX;
214 EVAL;
215 UPDATE_MASK(48);
216 #else
// Generic path for other tile sizes: every row is swept in +x starting from
// the saved start-of-row edge values, and the bit offset grows by 4 per quad.
// NOTE(review): unlike the 8x8 path, this path always starts from edgeMask[0]
// and ANDs all NumEdges edges without consulting EdgeMaskT -- confirm intended.
217 uint32_t bit = 0;
218 for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y)
219 {
220 __m256d vStartOfRowEdge[NumEdges];
221 for (uint32_t e = 0; e < NumEdges; ++e)
222 {
223 vStartOfRowEdge[e] = vEdges[e];
224 }
225
226 for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x)
227 {
228 int edgeMask[NumEdges];
229 for (uint32_t e = 0; e < NumEdges; ++e)
230 {
231 edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
232 }
233
234 uint64_t mask = edgeMask[0];
235 for (uint32_t e = 1; e < NumEdges; ++e)
236 {
237 mask &= edgeMask[e];
238 }
239 coverageMask |= (mask << bit);
240
241 // step to the next pixel in the x
242 for (uint32_t e = 0; e < NumEdges; ++e)
243 {
244 vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
245 }
246 bit+=4;
247 }
248
249 // step to the next row
250 for (uint32_t e = 0; e < NumEdges; ++e)
251 {
252 vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
253 }
254 }
255 #endif
256 return coverageMask;
257
258 }
259 // Top left rule:
260 // Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
261 // Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge
262 // Top left: a sample is in if it is a top or left edge.
263 // Out: !(horizontal && above) = !horizontal && below
264 // Out: !horizontal && left = !(!horizontal && left) = horizontal and right
265 INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge)
266 {
267 // if vA < 0, vC--
268 // if vA == 0 && vB < 0, vC--
269
270 __m256d vEdgeOut = vEdge;
271 __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));
272
273 // if vA < 0 (line is not horizontal and below)
274 int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));
275
276 // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
277 __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
278 int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
279 msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));
280
281 // if either of these are true and we're on the line (edge == 0), bump it outside the line
282 vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
283 }
284
//////////////////////////////////////////////////////////////////////////
/// @brief Number of fractional bits by which the result of the manh
/// calculation exceeds the edge precision, from compile time trait values.
/// @tparam RT: rasterizer traits providing PrecisionT, ConservativePrecisionT
///             and EdgePrecisionT, each with a BitsT::value
/// @return (PrecisionT bits + ConservativePrecisionT bits) - EdgePrecisionT bits
template<typename RT>
constexpr int64_t ManhToEdgePrecisionAdjust()
{
    typedef typename RT::PrecisionT::BitsT             VertexBitsT;
    typedef typename RT::ConservativePrecisionT::BitsT ConservativeBitsT;
    typedef typename RT::EdgePrecisionT::BitsT         EdgeBitsT;

    static_assert(VertexBitsT::value + ConservativeBitsT::value >= EdgeBitsT::value,
                  "Inadequate precision of result of manh calculation ");
    return ((VertexBitsT::value + ConservativeBitsT::value) - EdgeBitsT::value);
}
295
296 //////////////////////////////////////////////////////////////////////////
297 /// @struct adjustEdgeConservative
298 /// @brief Primary template definition used for partially specializing
299 /// the adjustEdgeConservative function. This struct should never
300 /// be instantiated.
301 /// @tparam RT: rasterizer traits
302 /// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
303 template <typename RT, typename ConservativeEdgeOffsetT>
304 struct adjustEdgeConservative
305 {
306 //////////////////////////////////////////////////////////////////////////
307 /// @brief Performs calculations to adjust each edge of a triangle away
308 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
309 /// direction.
310 ///
311 /// Uncertainty regions arise from fixed point rounding, which
312 /// can snap a vertex +/- by min fixed point value.
313 /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
314 /// This allows the rasterizer to test for coverage only at the pixel center,
315 /// instead of having to test individual pixel corners for conservative coverage
316 INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
317 {
318 // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away
319 // from the pixel center (in the direction of the edge normal A/B)
320
321 // edge = Ax + Bx + C - (manh/e)
322 // manh = manhattan distance = abs(A) + abs(B)
323 // e = absolute rounding error from snapping from float to fixed point precision
324
325 // 'fixed point' multiply (in double to be avx1 friendly)
326 // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
327 __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
328 __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
329 _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));
330
331 static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
332 "Inadequate precision of result of manh calculation ");
333
334 // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision
335 // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right
336 manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5));
337
338 // move the edge away from the pixel center by the required conservative precision + 1/2 pixel
339 // this allows the rasterizer to do a single conservative coverage test to see if the primitive
340 // intersects the pixel at all
341 vEdge = _mm256_sub_pd(vEdge, manh);
342 };
343 };
344
345 //////////////////////////////////////////////////////////////////////////
346 /// @brief adjustEdgeConservative specialization where no edge offset is needed
347 template <typename RT>
348 struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>>
349 {
350 INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {};
351 };
352
353 //////////////////////////////////////////////////////////////////////////
354 /// @brief calculates the distance a degenerate BBox needs to be adjusted
355 /// for conservative rast based on compile time trait values
356 template<typename RT>
357 constexpr int64_t ConservativeScissorOffset()
358 {
359 static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, "Rasterizer precision > conservative precision");
360 // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges
361 typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1> DegenerateEdgeOffsetT;
362 // 1/2 pixel edge offset + conservative offset - degenerateTriangle
363 return RT::ConservativeEdgeOffsetT::value - (DegenerateEdgeOffsetT::value << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
364 }
365
366 //////////////////////////////////////////////////////////////////////////
367 /// @brief Performs calculations to adjust each a vector of evaluated edges out
368 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
369 /// direction.
370 template <typename RT>
371 INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge)
372 {
373 int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
374 int64_t manh = ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> ManhToEdgePrecisionAdjust<RT>();
375 vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh));
376 };
377
378 //////////////////////////////////////////////////////////////////////////
379 /// @brief Performs calculations to adjust each a scalar evaluated edge out
380 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
381 /// direction.
382 template <typename RT, typename OffsetT>
383 INLINE double adjustScalarEdge(const double a, const double b, const double Edge)
384 {
385 int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
386 int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>();
387 return (Edge - manh);
388 };
389
390 //////////////////////////////////////////////////////////////////////////
391 /// @brief Perform any needed adjustments to evaluated triangle edges
392 template <typename RT, typename EdgeOffsetT>
393 struct adjustEdgesFix16
394 {
395 INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
396 {
397 static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
398 "Edge equation expected to be in x.16 fixed point");
399
400 static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled");
401
402 // need to apply any edge offsets before applying the top-left rule
403 adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge);
404
405 adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
406 }
407 };
408
409 //////////////////////////////////////////////////////////////////////////
410 /// @brief Perform top left adjustments to evaluated triangle edges
411 template <typename RT>
412 struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>>
413 {
414 INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
415 {
416 adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
417 }
418 };
419
420 // max(abs(dz/dx), abs(dz,dy)
421 INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
422 {
423 /*
424 // evaluate i,j at (0,0)
425 float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
426 float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
427
428 // evaluate i,j at (1,0)
429 float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
430 float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
431
432 // compute dz/dx
433 float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
434 float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
435 float dzdx = abs(d10 - d00);
436
437 // evaluate i,j at (0,1)
438 float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
439 float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
440
441 float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
442 float dzdy = abs(d01 - d00);
443 */
444
445 // optimized version of above
446 float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0]));
447 float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1]));
448
449 return std::max(dzdx, dzdy);
450 }
451
452 INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
453 {
454 if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
455 {
456 return (1.0f / (1 << 24));
457 }
458 else if (pState->depthFormat == R16_UNORM)
459 {
460 return (1.0f / (1 << 16));
461 }
462 else
463 {
464 SWR_ASSERT(pState->depthFormat == R32_FLOAT);
465
466 // for f32 depth, factor = 2^(exponent(max(abs(z) - 23)
467 float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
468 uint32_t zMaxInt = *(uint32_t*)&zMax;
469 zMaxInt &= 0x7f800000;
470 zMax = *(float*)&zMaxInt;
471
472 return zMax * (1.0f / (1 << 23));
473 }
474 }
475
476 INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
477 {
478 if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0)
479 {
480 return 0.0f;
481 }
482
483 float scale = pState->slopeScaledDepthBias;
484 if (scale != 0.0f)
485 {
486 scale *= ComputeMaxDepthSlope(pTri);
487 }
488
489 float bias = pState->depthBias;
490 if (!pState->depthBiasPreAdjusted)
491 {
492 bias *= ComputeBiasFactor(pState, pTri, z);
493 }
494 bias += scale;
495
496 if (pState->depthBiasClamp > 0.0f)
497 {
498 bias = std::min(bias, pState->depthBiasClamp);
499 }
500 else if (pState->depthBiasClamp < 0.0f)
501 {
502 bias = std::max(bias, pState->depthBiasClamp);
503 }
504
505 return bias;
506 }
507
508 // Prevent DCE by writing coverage mask from rasterizer to volatile
509 #if KNOB_ENABLE_TOSS_POINTS
510 __declspec(thread) volatile uint64_t gToss;
511 #endif
512
513 static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
514 // try to avoid _chkstk insertions; make this thread local
515 static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib];
516
517 INLINE
518 void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
519 {
520 edge.a = a;
521 edge.b = b;
522
523 // compute constant steps to adjacent quads
524 edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE));
525 edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE));
526
527 // compute constant steps to adjacent raster tiles
528 edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE));
529 edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE));
530
531 // compute quad offsets
532 const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
533 const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);
534
535 __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8);
536 __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8);
537 edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16);
538
539 // compute raster tile offsets
540 const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0);
541 const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0);
542
543 __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8);
544 __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8);
545 edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16);
546 }
547
548 INLINE
549 void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
550 {
551 ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge);
552 }
553
554 //////////////////////////////////////////////////////////////////////////
555 /// @brief Primary template definition used for partially specializing
556 /// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel
557 /// corner to sample position, and test for coverage
558 /// @tparam sampleCount: multisample count
559 template <typename NumSamplesT>
560 INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
561 int32_t &mask0, int32_t &mask1, int32_t &mask2)
562 {
563 __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
564 // evaluate edge equations at the tile multisample bounding box
565 vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
566 vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
567 vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
568 mask0 = _mm256_movemask_pd(vSampleBboxTest0);
569 mask1 = _mm256_movemask_pd(vSampleBboxTest1);
570 mask2 = _mm256_movemask_pd(vSampleBboxTest2);
571 }
572
573 //////////////////////////////////////////////////////////////////////////
574 /// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
575 /// when only rasterizing a single coverage test point
576 template <>
577 INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* vEdgeFix16,
578 int32_t &mask0, int32_t &mask1, int32_t &mask2)
579 {
580 mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
581 mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
582 mask2 = _mm256_movemask_pd(vEdgeFix16[2]);
583 }
584
585 //////////////////////////////////////////////////////////////////////////
586 /// @struct ComputeScissorEdges
587 /// @brief Primary template definition. Allows the function to be generically
588 /// called. When paired with below specializations, will result in an empty
589 /// inlined function if scissor is not enabled
590 /// @tparam RasterScissorEdgesT: is scissor enabled?
591 /// @tparam IsConservativeT: is conservative rast enabled?
592 /// @tparam RT: rasterizer traits
593 template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT>
594 struct ComputeScissorEdges
595 {
596 INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
597 EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){};
598 };
599
600 //////////////////////////////////////////////////////////////////////////
601 /// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
602 /// specialization. Instantiated when conservative rast and scissor are enabled
603 template <typename RT>
604 struct ComputeScissorEdges<std::true_type, std::true_type, RT>
605 {
606 //////////////////////////////////////////////////////////////////////////
607 /// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
608 /// evaluate edge equations and offset them away from pixel center.
609 INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
610 EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
611 {
612 // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
613 SWR_RECT scissor;
614 scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin);
615 scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax);
616 scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin);
617 scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax);
618
619 POS topLeft{scissor.xmin, scissor.ymin};
620 POS bottomLeft{scissor.xmin, scissor.ymax};
621 POS topRight{scissor.xmax, scissor.ymin};
622 POS bottomRight{scissor.xmax, scissor.ymax};
623
624 // construct 4 scissor edges in ccw direction
625 ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
626 ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
627 ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
628 ComputeEdgeData(topRight, topLeft, rastEdges[6]);
629
630 vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
631 vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
632 vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
633 vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
634
635 // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing
636 adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]);
637 adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]);
638 adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]);
639 adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]);
640
641 // Upper left rule for scissor
642 vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
643 vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
644 }
645 };
646
647 //////////////////////////////////////////////////////////////////////////
648 /// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
649 /// specialization. Instantiated when scissor is enabled and conservative rast
650 /// is disabled.
651 template <typename RT>
652 struct ComputeScissorEdges<std::true_type, std::false_type, RT>
653 {
654 //////////////////////////////////////////////////////////////////////////
655 /// @brief Compute scissor edge vectors and evaluate edge equations
656 INLINE ComputeScissorEdges(const SWR_RECT &, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
657 EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
658 {
659 const SWR_RECT &scissor = scissorBBox;
660 POS topLeft{scissor.xmin, scissor.ymin};
661 POS bottomLeft{scissor.xmin, scissor.ymax};
662 POS topRight{scissor.xmax, scissor.ymin};
663 POS bottomRight{scissor.xmax, scissor.ymax};
664
665 // construct 4 scissor edges in ccw direction
666 ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
667 ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
668 ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
669 ComputeEdgeData(topRight, topLeft, rastEdges[6]);
670
671 vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
672 vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
673 vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
674 vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
675
676 // Upper left rule for scissor
677 vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
678 vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
679 }
680 };
681
682 //////////////////////////////////////////////////////////////////////////
683 /// @brief Primary function template for TrivialRejectTest. Should
684 /// never be called, but TemplateUnroller instantiates a few unused values,
685 /// so it calls a runtime assert instead of a static_assert.
686 template <typename ValidEdgeMaskT>
687 INLINE bool TrivialRejectTest(const int, const int, const int)
688 {
689 SWR_INVALID("Primary templated function should never be called");
690 return false;
691 };
692
693 //////////////////////////////////////////////////////////////////////////
694 /// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
695 /// and edge 1 for trivial coverage reject
696 template <>
697 INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int)
698 {
699 return (!(mask0 && mask1)) ? true : false;
700 };
701
702 //////////////////////////////////////////////////////////////////////////
703 /// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
704 /// and edge 2 for trivial coverage reject
705 template <>
706 INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2)
707 {
708 return (!(mask0 && mask2)) ? true : false;
709 };
710
711 //////////////////////////////////////////////////////////////////////////
712 /// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
713 /// and edge 2 for trivial coverage reject
714 template <>
715 INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2)
716 {
717 return (!(mask1 && mask2)) ? true : false;
718 };
719
720 //////////////////////////////////////////////////////////////////////////
721 /// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
722 /// primitive edges for trivial coverage reject
723 template <>
724 INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
725 {
726 return (!(mask0 && mask1 && mask2)) ? true : false;;
727 };
728
729 //////////////////////////////////////////////////////////////////////////
730 /// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
731 /// point, so return false and rasterize against conservative BBox
732 template <>
733 INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int)
734 {
735 return false;
736 };
737
738 //////////////////////////////////////////////////////////////////////////
739 /// @brief Primary function template for TrivialAcceptTest. Always returns
740 /// false, since it will only be called for degenerate tris, and as such
741 /// will never cover the entire raster tile
742 template <typename ScissorEnableT>
743 INLINE bool TrivialAcceptTest(const int, const int, const int)
744 {
745 return false;
746 };
747
748 //////////////////////////////////////////////////////////////////////////
749 /// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all
750 /// edge masks for a fully covered raster tile
751 template <>
752 INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2)
753 {
754 return ((mask0 & mask1 & mask2) == 0xf);
755 };
756
757 //////////////////////////////////////////////////////////////////////////
758 /// @brief Primary function template for GenerateSVInnerCoverage. Results
759 /// in an empty function call if SVInnerCoverage isn't requested
760 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
761 struct GenerateSVInnerCoverage
762 {
763 INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*, uint64_t &){};
764 };
765
766 //////////////////////////////////////////////////////////////////////////
767 /// @brief Specialization of GenerateSVInnerCoverage where all edges
768 /// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated
769 /// edge values from OuterConservative to InnerConservative and rasterizes.
770 template <typename RT>
771 struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
772 {
773 INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, uint32_t workerId, EDGE* pRastEdges, double* pStartQuadEdges, uint64_t &innerCoverageMask)
774 {
775 double startQuadEdgesAdj[RT::NumEdgesT::value];
776 for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
777 {
778 startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
779 }
780
781 // not trivial accept or reject, must rasterize full tile
782 RDTSC_BEGIN(BERasterizePartial, pDC->drawId);
783 innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges);
784 RDTSC_END(BERasterizePartial, 0);
785 }
786 };
787
788 //////////////////////////////////////////////////////////////////////////
789 /// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
790 /// in an empty function call if SVInnerCoverage isn't requested
791 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
792 struct UpdateEdgeMasksInnerConservative
793 {
794 INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*,
795 const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){};
796 };
797
798 //////////////////////////////////////////////////////////////////////////
799 /// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
800 /// are non-degenerate and SVInnerCoverage is requested. Offsets the edges
801 /// evaluated at raster tile corners to inner conservative position and
802 /// updates edge masks
803 template <typename RT>
804 struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
805 {
806 INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
807 const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2)
808 {
809 __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]};
810
811 // instead of keeping 2 copies of evaluated edges around, just compensate for the outer
812 // conservative evaluated edge when adjusting the edge in for inner conservative tests
813 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[0]);
814 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[1]);
815 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[2]);
816
817 UpdateEdgeMasks<typename RT::NumCoverageSamplesT>(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
818 }
819 };
820
821 //////////////////////////////////////////////////////////////////////////
822 /// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage
823 /// is requested but at least one edge is degenerate. Since a degenerate triangle cannot
824 /// cover an entire raster tile, set mask0 to 0 to force it down the
825 /// rastierizePartialTile path
826 template <typename RT, typename ValidEdgeMaskT>
827 struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
828 {
829 INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*,
830 const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &)
831 {
832 // set one mask to zero to force the triangle down the rastierizePartialTile path
833 mask0 = 0;
834 }
835 };
836
//////////////////////////////////////////////////////////////////////////
/// @brief Rasterizes one triangle work item against the raster tiles of a
///        single macrotile. The function
///          1) performs triangle setup (edge coefficients, determinant,
///             barycentric coefficients, perspective attributes, Z),
///          2) intersects the triangle bbox with the scissor rect and the
///             macrotile bounds,
///          3) evaluates the edge equations at the top-left raster tile,
///          4) walks the covered raster tiles, classifying each one as
///             trivially rejected, trivially accepted or partially covered,
///          5) calls the pixel backend for every tile with covered samples.
/// @tparam RT - rasterizer traits; supplies the fixed point precision,
///        coverage sample count, valid edge mask, conservative / inner
///        coverage modes and scissor edge handling used below.
/// @param pDC - draw context; source of API state and backend functions.
/// @param workerId - worker id, forwarded to the backend and to
///        GenerateSVInnerCoverage.
/// @param macroTile - macrotile id, decoded into macrotile x/y indices.
/// @param pDesc - work descriptor, reinterpreted as TRIANGLE_WORK_DESC.
837 template <typename RT>
838 void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
839 {
840 const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
// toss point: when enabled, return before doing any rasterization work
841 #if KNOB_ENABLE_TOSS_POINTS
842 if (KNOB_TOSS_BIN_TRIS)
843 {
844 return;
845 }
846 #endif
847 RDTSC_BEGIN(BERasterizeTriangle, pDC->drawId);
848 RDTSC_BEGIN(BETriangleSetup, pDC->drawId);
849
850 const API_STATE &state = GetApiState(pDC);
851 const SWR_RASTSTATE &rastState = state.rastState;
852 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
853
854 OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
855 triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
856
857 __m128 vX, vY, vZ, vRecipW;
858
859 // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
860 // eg: vX = [x0 x1 x2 dc]
861 vX = _mm_load_ps(workDesc.pTriBuffer);
862 vY = _mm_load_ps(workDesc.pTriBuffer + 4);
863 vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
864 vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
865
866 // convert to fixed point
867 static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, "Rasterizer expects 16.8 fixed point precision");
868 __m128i vXi = fpToFixedPoint(vX);
869 __m128i vYi = fpToFixedPoint(vY);
870
871 // quantize floating point position to fixed point precision
872 // to prevent attribute creep around the triangle vertices
873 vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
874 vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
875
876 // triangle setup - A and B edge equation coefs
877 __m128 vA, vB;
878 triangleSetupAB(vX, vY, vA, vB);
879
880 __m128i vAi, vBi;
881 triangleSetupABInt(vXi, vYi, vAi, vBi);
882
883 // determinant
884 float det = calcDeterminantInt(vAi, vBi);
885
886 // Verts in Pixel Coordinate Space at this point
887 // Det > 0 = CW winding order
888 // Convert CW triangles to CCW
889 if (det > 0.0)
890 {
// negate both the float and the integer coefficients so they stay in sync
891 vA = _mm_mul_ps(vA, _mm_set1_ps(-1));
892 vB = _mm_mul_ps(vB, _mm_set1_ps(-1));
893 vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
894 vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
895 det = -det;
896 }
897
898 __m128 vC;
899 // Finish triangle setup - C edge coef
900 triangleSetupC(vX, vY, vA, vB, vC);
901
902 if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
903 {
904 // If we have degenerate edge(s) to rasterize, set I and J coefs
905 // to 0 for constant interpolation of attributes
906 triDesc.I[0] = 0.0f;
907 triDesc.I[1] = 0.0f;
908 triDesc.I[2] = 0.0f;
909 triDesc.J[0] = 0.0f;
910 triDesc.J[1] = 0.0f;
911 triDesc.J[2] = 0.0f;
912
913 // Degenerate triangles have no area
914 triDesc.recipDet = 0.0f;
915 }
916 else
917 {
918 // only extract coefs for 2 of the barycentrics; the 3rd can be
919 // determined from the barycentric equation:
920 // i + j + k = 1 <=> k = 1 - j - i
// I takes element 1 and J takes element 2 of the A, B and C coefficient vectors
921 _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
922 _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
923 _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
924 _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
925 _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
926 _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
927
928 // compute recipDet, used to calculate barycentric i and j in the backend
929 triDesc.recipDet = 1.0f/det;
930 }
931
// 1/w is stored relative to vertex 2: (v0 - v2, v1 - v2, v2); Z below uses the same layout
932 OSALIGNSIMD(float) oneOverW[4];
933 _mm_store_ps(oneOverW, vRecipW);
934 triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
935 triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
936 triDesc.OneOverW[2] = oneOverW[2];
937
938 // calculate perspective correct coefs per vertex attrib
939 float* pPerspAttribs = perspAttribsTLS;
940 float* pAttribs = workDesc.pAttribs;
941 triDesc.pPerspAttribs = pPerspAttribs;
942 triDesc.pAttribs = pAttribs;
943 float *pRecipW = workDesc.pTriBuffer + 12;
944 triDesc.pRecipW = pRecipW;
// pRecipW is advanced in place while broadcasting 1/w of vertex 0, 1 and 2;
// triDesc.pRecipW was captured above and still points at vertex 0
945 __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
946 __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1);
947 __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1);
// each attribute occupies 3 vertices x 4 floats (12 floats) in both the
// source and the perspective-multiplied destination buffer
948 for(uint32_t i = 0; i < workDesc.numAttribs; i++)
949 {
950 __m128 attribA = _mm_load_ps(pAttribs);
951 __m128 attribB = _mm_load_ps(pAttribs+=4);
952 __m128 attribC = _mm_load_ps(pAttribs+=4);
953 pAttribs+=4;
954
955 attribA = _mm_mul_ps(attribA, vOneOverWV0);
956 attribB = _mm_mul_ps(attribB, vOneOverWV1);
957 attribC = _mm_mul_ps(attribC, vOneOverWV2);
958
959 _mm_store_ps(pPerspAttribs, attribA);
960 _mm_store_ps(pPerspAttribs+=4, attribB);
961 _mm_store_ps(pPerspAttribs+=4, attribC);
962 pPerspAttribs+=4;
963 }
964
965 // compute bary Z
966 // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
967 OSALIGNSIMD(float) a[4];
968 _mm_store_ps(a, vZ);
969 triDesc.Z[0] = a[0] - a[2];
970 triDesc.Z[1] = a[1] - a[2];
971 triDesc.Z[2] = a[2];
972
973 // add depth bias
// pTriBuffer + 8 is the same Z data that was loaded into vZ above
974 triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
975
976 // Calc bounding box of triangle
977 OSALIGNSIMD(SWR_RECT) bbox;
978 calcBoundingBoxInt(vXi, vYi, bbox);
979
980 const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
981
982 if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
983 {
984 // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
985 bbox.xmin--; bbox.xmax++; bbox.ymin--; bbox.ymax++;
986 SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
987 "Conservative rast degenerate handling requires a valid scissor rect");
988 }
989
990 // Intersect with scissor/viewport
// NOTE(review): the "- 1" suggests bbox max is exclusive while the scissor max is inclusive - confirm
991 OSALIGNSIMD(SWR_RECT) intersect;
992 intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
993 intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
994 intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
995 intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);
996
997 triDesc.triFlags = workDesc.triFlags;
998
999 // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
1000 uint32_t macroX, macroY;
1001 MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
1002 int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
1003 int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
1004 int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
1005 int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
1006
1007 intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
1008 intersect.ymin = std::max(intersect.ymin, macroBoxTop);
1009 intersect.xmax = std::min(intersect.xmax, macroBoxRight);
1010 intersect.ymax = std::min(intersect.ymax, macroBoxBottom);
1011
1012 SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && intersect.ymax >= 0);
1013
1014 RDTSC_END(BETriangleSetup, 0);
1015
1016 // update triangle desc
// convert the fixed point intersection rect to raster tile indices
1017 uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1018 uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1019 uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1020 uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1021 uint32_t numTilesX = maxTileX - minTileX + 1;
1022 uint32_t numTilesY = maxTileY - minTileY + 1;
1023
// with unsigned arithmetic a count of 0 only occurs when max == min - 1
1024 if (numTilesX == 0 || numTilesY == 0)
1025 {
1026 RDTSC_EVENT(BEEmptyTriangle, 1, 0);
1027 RDTSC_END(BERasterizeTriangle, 1);
1028 return;
1029 }
1030
1031 RDTSC_BEGIN(BEStepSetup, pDC->drawId);
1032
1033 // Step to pixel center of top-left pixel of the triangle bbox
1034 // Align intersect bbox (top/left) to raster tile's (top/left).
1035 int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
1036 int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));
1037
1038 // convenience typedef
1039 typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT;
1040
1041 // single sample rasterization evaluates edges at pixel center,
1042 // multisample evaluates edges UL pixel corner and steps to each sample position
1043 if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
1044 {
1045 // Add 0.5, in fixed point, to offset to pixel center
1046 x += (FIXED_POINT_SCALE / 2);
1047 y += (FIXED_POINT_SCALE / 2);
1048 }
1049
1050 __m128i vTopLeftX = _mm_set1_epi32(x);
1051 __m128i vTopLeftY = _mm_set1_epi32(y);
1052
1053 // evaluate edge equations at top-left pixel using 64bit math
1054 //
1055 // line = Ax + By + C
1056 // solving for C:
1057 // C = -Ax - By
1058 // we know x0 and y0 are on the line; plug them in:
1059 // C = -Ax0 - By0
1060 // plug C back into line equation:
1061 // line = Ax - By - Ax0 - By0
1062 // line = A(x - x0) + B(y - y0)
1063 // dX = (x-x0), dY = (y-y0)
1064 // so all this simplifies to
1065 // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
1066
1067 __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
1068 __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
1069
1070 // evaluate A(dx) and B(dY) for all points
// the 32-bit integers are widened to doubles before multiplying
1071 __m256d vAipd = _mm256_cvtepi32_pd(vAi);
1072 __m256d vBipd = _mm256_cvtepi32_pd(vBi);
1073 __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
1074 __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
1075
1076 __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
1077 __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
1078 __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
1079
1080 // apply any edge adjustments(top-left, crast, etc)
1081 adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);
1082
1083 // broadcast respective edge results to all lanes
// entries 0-2 are filled here from the triangle edges
// NOTE(review): the remaining entries are presumably written by ComputeScissorEdges below - confirm
1084 double* pEdge = (double*)&vEdge;
1085 __m256d vEdgeFix16[7];
1086 vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
1087 vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
1088 vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);
1089
1090 OSALIGNSIMD(int32_t) aAi[4], aBi[4];
1091 _mm_store_si128((__m128i*)aAi, vAi);
1092 _mm_store_si128((__m128i*)aBi, vBi);
1093 EDGE rastEdges[RT::NumEdgesT::value];
1094
1095 // Compute and store triangle edge data
1096 ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
1097 ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
1098 ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);
1099
1100 // Compute and store triangle edge data if scissor needs to rasterized
1101 ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
1102 (bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
1103
1104 // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
1105 // used to for testing if entire raster tile is inside a triangle
1106 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1107 {
1108 vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
1109 }
1110
1111 // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
1112 // step sample positions to the raster tile bbox of multisample points
1113 // min(xSamples),min(ySamples) ------ max(xSamples),max(ySamples)
1114 // | |
1115 // | |
1116 // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples)
// vEdgeTileBbox is only written when more than one coverage sample is used
1117 __m256d vEdgeTileBbox[3];
1118 if (NumCoverageSamplesT::value > 1)
1119 {
1120 const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
1121 const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX();
1122 const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY();
1123
1124 __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
1125 __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
1126
1127 // step edge equation tests from Tile
1128 // used to for testing if entire raster tile is inside a triangle
1129 for (uint32_t e = 0; e < 3; ++e)
1130 {
1131 __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
1132 __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
1133 vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
1134
1135 // adjust for msaa tile bbox edges outward for conservative rast, if enabled
1136 adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdgeTileBbox[e]);
1137 }
1138 }
1139
1140 RDTSC_END(BEStepSetup, 0);
1141
1142 uint32_t tY = minTileY;
1143 uint32_t tX = minTileX;
1144 uint32_t maxY = maxTileY;
1145 uint32_t maxX = maxTileX;
1146
// renderBuffers follows the current tile; currentRenderBufferRow keeps the
// start of the current tile row and is handed to StepRasterTileY
1147 RenderOutputBuffers renderBuffers, currentRenderBufferRow;
1148 GetRenderHotTiles<RT::MT::numSamples>(pDC, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
1149 currentRenderBufferRow = renderBuffers;
1150
1151 // rasterize and generate coverage masks per sample
1152 for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
1153 {
// snapshot the edge values at the first tile of this row so that the
// Y step at the bottom of the loop can restart from the row start
1154 __m256d vStartOfRowEdge[RT::NumEdgesT::value];
1155 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1156 {
1157 vStartOfRowEdge[e] = vEdgeFix16[e];
1158 }
1159
1160 for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
1161 {
1162 triDesc.anyCoveredSamples = 0;
1163
1164 // is the corner of the edge outside of the raster tile? (vEdge < 0)
1165 int mask0, mask1, mask2;
1166 UpdateEdgeMasks<NumCoverageSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);
1167
1168 for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++)
1169 {
1170 // trivial reject, at least one edge has all 4 corners of raster tile outside
1171 bool trivialReject = TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);
1172
1173 if (!trivialReject)
1174 {
1175 // trivial accept mask
1176 triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
1177
1178 // Update the raster tile edge masks based on inner conservative edge offsets, if enabled
// the masks are passed by reference, so this call may modify mask0-2 in place
1179 UpdateEdgeMasksInnerConservative<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>
1180 (vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);
1181
1182 // @todo Make this a bit smarter to allow use of trivial accept when:
1183 // 1) scissor/vp intersection rect is raster tile aligned
1184 // 2) raster tile is entirely within scissor/vp intersection rect
1185 if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2))
1186 {
1187 // trivial accept, all 4 corners of all 3 edges are negative
1188 // i.e. raster tile completely inside triangle
1189 triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
1190 if(std::is_same<typename RT::InputCoverageT, InnerConservativeCoverageT>::value)
1191 {
1192 triDesc.innerCoverageMask = 0xffffffffffffffffULL;
1193 }
1194 RDTSC_EVENT(BETrivialAccept, 1, 0);
1195 }
1196 else
1197 {
1198 __m256d vEdgeAtSample[RT::NumEdgesT::value];
1199 if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
1200 {
1201 // should get optimized out for single sample case (global value numbering or copy propagation)
1202 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1203 {
1204 vEdgeAtSample[e] = vEdgeFix16[e];
1205 }
1206 }
1207 else
1208 {
1209 const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
1210 __m128i vSampleOffsetXh = samplePos.vXi(sampleNum);
1211 __m128i vSampleOffsetYh = samplePos.vYi(sampleNum);
1212 __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
1213 __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
1214
1215 // step edge equation tests from UL tile corner to pixel sample position
1216 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1217 {
1218 __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
1219 __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
1220 vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
1221 vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
1222 }
1223 }
1224
// _mm256_set_epi32 lists elements from high to low, so only the two lowest
// 32-bit elements are set: the masked store writes just the lowest double,
// i.e. one scalar edge value per edge
1225 double startQuadEdges[RT::NumEdgesT::value];
1226 const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
1227 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1228 {
1229 _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
1230 }
1231
1232 // not trivial accept or reject, must rasterize full tile
1233 RDTSC_BEGIN(BERasterizePartial, pDC->drawId);
1234 triDesc.coverageMask[sampleNum] = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdges, rastEdges);
1235 RDTSC_END(BERasterizePartial, 0);
1236
1237 triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];
1238
1239 // Output SV InnerCoverage, if needed
1240 GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
1241 }
1242 }
1243 else
1244 {
1245 // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything
1246 if(NumCoverageSamplesT::value > 1)
1247 {
1248 triDesc.coverageMask[sampleNum] = 0;
1249 }
1250 RDTSC_EVENT(BETrivialReject, 1, 0);
1251 }
1252 }
1253
// toss point: when enabled, record sample 0 coverage in gToss and skip the backend
1254 #if KNOB_ENABLE_TOSS_POINTS
1255 if(KNOB_TOSS_RS)
1256 {
1257 gToss = triDesc.coverageMask[0];
1258 }
1259 else
1260 #endif
1261 if(triDesc.anyCoveredSamples)
1262 {
1263 // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
1264 // copy conservative coverage result to all samples
1265 if(RT::IsConservativeT::value)
1266 {
// NOTE(review): UnrollerL<1, numSamples, 1> presumably invokes the lambda for samples 1..numSamples-1 - confirm
1267 auto copyCoverage = [&](int sample){triDesc.coverageMask[sample] = triDesc.coverageMask[0]; };
1268 UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
1269 }
1270
1271 RDTSC_BEGIN(BEPixelBackend, pDC->drawId);
1272 backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
1273 RDTSC_END(BEPixelBackend, 0);
1274 }
1275
1276 // step to the next tile in X
1277 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1278 {
1279 vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
1280 }
1281 StepRasterTileX<RT>(state.colorHottileEnable, renderBuffers);
1282 }
1283
1284 // step to the next tile in Y
1285 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1286 {
1287 vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
1288 }
1289 StepRasterTileY<RT>(state.colorHottileEnable, renderBuffers, currentRenderBufferRow);
1290 }
1291
1292 RDTSC_END(BERasterizeTriangle, 1);
1293 }
1294
1295 // Get pointers to hot tile memory for color RT, depth, stencil
1296 template <uint32_t numSamples>
1297 void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex)
1298 {
1299 const API_STATE& state = GetApiState(pDC);
1300 SWR_CONTEXT *pContext = pDC->pContext;
1301
1302 uint32_t mx, my;
1303 MacroTileMgr::getTileIndices(macroID, mx, my);
1304 tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
1305 tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;
1306
1307 // compute tile offset for active hottile buffers
1308 const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
1309 uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1310 offset*=numSamples;
1311
1312 unsigned long rtSlot = 0;
1313 uint32_t colorHottileEnableMask = state.colorHottileEnable;
1314 while(_BitScanForward(&rtSlot, colorHottileEnableMask))
1315 {
1316 HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true,
1317 numSamples, renderTargetArrayIndex);
1318 pColor->state = HOTTILE_DIRTY;
1319 renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
1320
1321 colorHottileEnableMask &= ~(1 << rtSlot);
1322 }
1323 if(state.depthHottileEnable)
1324 {
1325 const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
1326 uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1327 offset*=numSamples;
1328 HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true,
1329 numSamples, renderTargetArrayIndex);
1330 pDepth->state = HOTTILE_DIRTY;
1331 SWR_ASSERT(pDepth->pBuffer != nullptr);
1332 renderBuffers.pDepth = pDepth->pBuffer + offset;
1333 }
1334 if(state.stencilHottileEnable)
1335 {
1336 const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
1337 uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1338 offset*=numSamples;
1339 HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true,
1340 numSamples, renderTargetArrayIndex);
1341 pStencil->state = HOTTILE_DIRTY;
1342 SWR_ASSERT(pStencil->pBuffer != nullptr);
1343 renderBuffers.pStencil = pStencil->pBuffer + offset;
1344 }
1345 }
1346
1347 template <typename RT>
1348 INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers)
1349 {
1350 DWORD rt = 0;
1351 while (_BitScanForward(&rt, colorHotTileMask))
1352 {
1353 colorHotTileMask &= ~(1 << rt);
1354 buffers.pColor[rt] += RT::colorRasterTileStep;
1355 }
1356
1357 buffers.pDepth += RT::depthRasterTileStep;
1358 buffers.pStencil += RT::stencilRasterTileStep;
1359 }
1360
1361 template <typename RT>
1362 INLINE void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow)
1363 {
1364 DWORD rt = 0;
1365 while (_BitScanForward(&rt, colorHotTileMask))
1366 {
1367 colorHotTileMask &= ~(1 << rt);
1368 startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
1369 buffers.pColor[rt] = startBufferRow.pColor[rt];
1370 }
1371 startBufferRow.pDepth += RT::depthRasterTileRowStep;
1372 buffers.pDepth = startBufferRow.pDepth;
1373
1374 startBufferRow.pStencil += RT::stencilRasterTileRowStep;
1375 buffers.pStencil = startBufferRow.pStencil;
1376 }
1377