src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
* @file rasterizer_impl.h
  24 *
  25 * @brief Implementation for the rasterizer.
  26 *
  27 ******************************************************************************/
  28
#include <algorithm>
#include <cstring>
#include <vector>

#include "rasterizer.h"
#include "rdtsc_core.h"
#include "backend.h"
#include "utils.h"
#include "frontend.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
  39
// Table of rasterizer work functions, defined outside this section of the file.
// The axes are sized by SWR_MULTISAMPLE_TYPE_COUNT, 2, 2, SWR_INPUT_COVERAGE_COUNT,
// STATE_VALID_TRI_EDGE_COUNT and 2.
// NOTE(review): the meaning of the three 2-entry axes is not visible here - confirm
// against the table's definition.
extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT][STATE_VALID_TRI_EDGE_COUNT][2];

// Forward declarations; the definitions are not part of this section of the file.
template <uint32_t numSamples = 1>
void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex);
template <typename RT>
void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers);
template <typename RT>
void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow);
  48
// MASKTOVEC(i3,i2,i1,i0) expands to a 4-lane initializer whose lane k holds -ik,
// i.e. -1 when bit k is 1 and 0 when bit k is 0 (lane 0 is listed first).
#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
// Lookup table from a 4-bit lane mask to a 4 x double vector: entry m has a negative
// value (sign bit set) in lane k exactly when bit k of m is set. It is used in this file
// as the selector operand of _mm256_blendv_pd, which chooses per lane on the sign bit.
static const __m256d gMaskToVecpd[] =
{
    MASKTOVEC(0, 0, 0, 0),
    MASKTOVEC(0, 0, 0, 1),
    MASKTOVEC(0, 0, 1, 0),
    MASKTOVEC(0, 0, 1, 1),
    MASKTOVEC(0, 1, 0, 0),
    MASKTOVEC(0, 1, 0, 1),
    MASKTOVEC(0, 1, 1, 0),
    MASKTOVEC(0, 1, 1, 1),
    MASKTOVEC(1, 0, 0, 0),
    MASKTOVEC(1, 0, 0, 1),
    MASKTOVEC(1, 0, 1, 0),
    MASKTOVEC(1, 0, 1, 1),
    MASKTOVEC(1, 1, 0, 0),
    MASKTOVEC(1, 1, 0, 1),
    MASKTOVEC(1, 1, 1, 0),
    MASKTOVEC(1, 1, 1, 1),
};
  69
  70 struct POS
  71 {
  72     int32_t x, y;
  73 };
  74
  75 struct EDGE
  76 {
  77     double a, b;                // a, b edge coefficients in fix8
  78     double stepQuadX;           // step to adjacent horizontal quad in fix16
  79     double stepQuadY;           // step to adjacent vertical quad in fix16
  80     double stepRasterTileX;     // step to adjacent horizontal raster tile in fix16
  81     double stepRasterTileY;     // step to adjacent vertical raster tile in fix16
  82
  83     __m256d vQuadOffsets;       // offsets for 4 samples of a quad
  84     __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
  85 };
  86
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the coverage mask of one raster tile by sweeping a 2x2 pixel quad
///        over the tile and combining the sign bits of the edge equations at the
///        4 samples of each quad.
/// @tparam NumEdges - number of entries in startEdges and pRastEdges
/// @tparam EdgeMaskT - compile time edge mask; EdgeMaskT::value is handed to UnrollerLMask
///         in the unrolled 8x8 path (presumably to select which edge indices the lambdas
///         run for - confirm against UnrollerLMask)
/// @param pDC - draw context; not referenced in this function
/// @param startEdges - one evaluated edge value per edge; the edge's vQuadOffsets are added
///        to it to obtain the values at the 4 samples of the first quad
/// @param pRastEdges - per-edge quad offsets and quad steps (see EDGE)
/// @return 64-bit mask with 4 bits per quad; a bit is set when the sign bit of every
///         tested edge value is set for that sample
template<uint32_t NumEdges, typename EdgeMaskT>
INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges)
{
    uint64_t coverageMask = 0;

    __m256d vEdges[NumEdges];
    __m256d vStepX[NumEdges];
    __m256d vStepY[NumEdges];

    for (uint32_t e = 0; e < NumEdges; ++e)
    {
        // Step to the pixel sample locations of the 1st quad
        vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);

        // broadcast the precomputed per-edge steps to the next quad in x and in y
        vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
        vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
    }

    // fast unrolled version for 8x8 tile
#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
    int edgeMask[NumEdges];
    uint64_t mask;

    // per-edge building blocks: sign-bit extraction, mask accumulation and quad stepping
    auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);};
    auto update_lambda = [&](int e){mask &= edgeMask[e];};
    auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);};
    auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);};
    auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);};

    // NOTE: the helper macros below are not #undef'd inside this function.

// evaluate which pixels in the quad are covered
#define EVAL \
            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);

    // update coverage mask
    // if edge 0 is degenerate and will be skipped; init the mask
    // (all 4 samples set for E1E2ValidT / NoEdgesValidT, otherwise seeded from edge 0)
#define UPDATE_MASK(bit) \
            if(std::is_same<EdgeMaskT, E1E2ValidT>::value || std::is_same<EdgeMaskT, NoEdgesValidT>::value){\
                mask = 0xf;\
            }\
            else{\
                mask = edgeMask[0]; \
            }\
            UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
            coverageMask |= (mask << bit);

    // step in the +x direction to the next quad
#define INCX \
            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);

    // step in the +y direction to the next quad
#define INCY \
            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);

    // step in the -x direction to the next quad
#define DECX \
            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);

    // sweep 2x2 quad back and forth through the raster tile,
    // computing coverage masks for the entire tile

    // raster tile
    // 0  1  2  3  4  5  6  7
    // x  x
    // x  x ------------------>
    //                   x  x  |
    // <-----------------x  x  V
    // ..

    // Bit positions stay row-major (quad index * 4) even though odd quad rows are
    // walked right to left, which is why rows 1 and 3 count their bit offsets down.

    // row 0
    EVAL;
    UPDATE_MASK(0);
    INCX;
    EVAL;
    UPDATE_MASK(4);
    INCX;
    EVAL;
    UPDATE_MASK(8);
    INCX;
    EVAL;
    UPDATE_MASK(12);
    INCY;

    //row 1
    EVAL;
    UPDATE_MASK(28);
    DECX;
    EVAL;
    UPDATE_MASK(24);
    DECX;
    EVAL;
    UPDATE_MASK(20);
    DECX;
    EVAL;
    UPDATE_MASK(16);
    INCY;

    // row 2
    EVAL;
    UPDATE_MASK(32);
    INCX;
    EVAL;
    UPDATE_MASK(36);
    INCX;
    EVAL;
    UPDATE_MASK(40);
    INCX;
    EVAL;
    UPDATE_MASK(44);
    INCY;

    // row 3
    EVAL;
    UPDATE_MASK(60);
    DECX;
    EVAL;
    UPDATE_MASK(56);
    DECX;
    EVAL;
    UPDATE_MASK(52);
    DECX;
    EVAL;
    UPDATE_MASK(48);
#else
    // Generic path for other tile dimensions: every quad row is walked left to right.
    // NOTE(review): this path always seeds the mask from edgeMask[0] and ANDs all NumEdges
    // edges; unlike the unrolled path it does not consult EdgeMaskT - confirm this is intended.
    // NOTE(review): 'bit' grows by 4 per quad, so a tile with more than 16 quads would shift
    // by 64 or more - confirm such tile sizes are never configured.
    uint32_t bit = 0;
    for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y)
    {
        // remember the edge values at the start of the row so the y step can be applied to them
        __m256d vStartOfRowEdge[NumEdges];
        for (uint32_t e = 0; e < NumEdges; ++e)
        {
            vStartOfRowEdge[e] = vEdges[e];
        }

        for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x)
        {
            // sign bits of each edge at the 4 samples of the current quad
            int edgeMask[NumEdges];
            for (uint32_t e = 0; e < NumEdges; ++e)
            {
                edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
            }

            // a sample is covered only if its bit is set for every edge
            uint64_t mask = edgeMask[0];
            for (uint32_t e = 1; e < NumEdges; ++e)
            {
                mask &= edgeMask[e];
            }
            coverageMask |= (mask << bit);

            // step to the next quad in the x
            for (uint32_t e = 0; e < NumEdges; ++e)
            {
                vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
            }
            bit+=4;
        }

        // step to the next row
        for (uint32_t e = 0; e < NumEdges; ++e)
        {
            vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
        }
    }
#endif
    return coverageMask;

}
 259 // Top left rule:
 260 // Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
 261 // Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge
 262 // Top left: a sample is in if it is a top or left edge.
 263 // Out: !(horizontal && above) = !horizontal && below
 264 // Out: !horizontal && left = !(!horizontal && left) = horizontal and right
 265 INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge)
 266 {
 267     // if vA < 0, vC--
 268     // if vA == 0 && vB < 0, vC--
 269
 270     __m256d vEdgeOut = vEdge;
 271     __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));
 272
 273     // if vA < 0 (line is not horizontal and below)
 274     int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));
 275
 276     // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
 277     __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
 278     int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
 279     msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));
 280
 281     // if either of these are true and we're on the line (edge == 0), bump it outside the line
 282     vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
 283 }
 284
 285 //////////////////////////////////////////////////////////////////////////
 286 /// @brief calculates difference in precision between the result of manh
 287 /// calculation and the edge precision, based on compile time trait values
 288 template<typename RT>
 289 constexpr int64_t ManhToEdgePrecisionAdjust()
 290 {
 291     static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
 292                   "Inadequate precision of result of manh calculation ");
 293     return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) - RT::EdgePrecisionT::BitsT::value);
 294 }
 295
 296 //////////////////////////////////////////////////////////////////////////
 297 /// @struct adjustEdgeConservative
 298 /// @brief Primary template definition used for partially specializing
 299 /// the adjustEdgeConservative function. This struct should never
 300 /// be instantiated.
 301 /// @tparam RT: rasterizer traits
 302 /// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
 303 template <typename RT, typename ConservativeEdgeOffsetT>
 304 struct adjustEdgeConservative
 305 {
 306     //////////////////////////////////////////////////////////////////////////
 307     /// @brief Performs calculations to adjust each edge of a triangle away
 308     /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
 309     /// direction.
 310     ///
 311     /// Uncertainty regions arise from fixed point rounding, which
 312     /// can snap a vertex +/- by min fixed point value.
 313     /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
 314     /// This allows the rasterizer to test for coverage only at the pixel center,
 315     /// instead of having to test individual pixel corners for conservative coverage
 316     INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
 317     {
 318         // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away
 319         // from the pixel center (in the direction of the edge normal A/B)
 320
 321         // edge = Ax + Bx + C - (manh/e)
 322         // manh = manhattan distance = abs(A) + abs(B)
 323         // e = absolute rounding error from snapping from float to fixed point precision
 324
 325         // 'fixed point' multiply (in double to be avx1 friendly)
 326         // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
 327         __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
 328         __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
 329                                      _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));
 330
 331         static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
 332                       "Inadequate precision of result of manh calculation ");
 333
 334         // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision
 335         // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right
 336         manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5));
 337
 338         // move the edge away from the pixel center by the required conservative precision + 1/2 pixel
 339         // this allows the rasterizer to do a single conservative coverage test to see if the primitive
 340         // intersects the pixel at all
 341         vEdge = _mm256_sub_pd(vEdge, manh);
 342     };
 343 };
 344
 345 //////////////////////////////////////////////////////////////////////////
 346 /// @brief adjustEdgeConservative specialization where no edge offset is needed
 347 template <typename RT>
 348 struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>>
 349 {
 350     INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {};
 351 };
 352
 353 //////////////////////////////////////////////////////////////////////////
 354 /// @brief calculates the distance a degenerate BBox needs to be adjusted
 355 /// for conservative rast based on compile time trait values
 356 template<typename RT>
 357 constexpr int64_t ConservativeScissorOffset()
 358 {
 359     static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, "Rasterizer precision > conservative precision");
 360     // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges
 361     typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1> DegenerateEdgeOffsetT;
 362     // 1/2 pixel edge offset + conservative offset - degenerateTriangle
 363     return RT::ConservativeEdgeOffsetT::value - (DegenerateEdgeOffsetT::value << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
 364 }
 365
 366 //////////////////////////////////////////////////////////////////////////
 367 /// @brief Performs calculations to adjust each a vector of evaluated edges out
 368 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
 369 /// direction.
 370 template <typename RT>
 371 INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge)
 372 {
 373     int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
 374     int64_t manh = ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> ManhToEdgePrecisionAdjust<RT>();
 375     vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh));
 376 };
 377
 378 //////////////////////////////////////////////////////////////////////////
 379 /// @brief Performs calculations to adjust each a scalar evaluated edge out
 380 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
 381 /// direction.
 382 template <typename RT, typename OffsetT>
 383 INLINE double adjustScalarEdge(const double a, const double b, const double Edge)
 384 {
 385     int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
 386     int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>();
 387     return (Edge - manh);
 388 };
 389
 390 //////////////////////////////////////////////////////////////////////////
 391 /// @brief Perform any needed adjustments to evaluated triangle edges
 392 template <typename RT, typename EdgeOffsetT>
 393 struct adjustEdgesFix16
 394 {
 395     INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
 396     {
 397         static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
 398                       "Edge equation expected to be in x.16 fixed point");
 399
 400         static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled");
 401
 402         // need to apply any edge offsets before applying the top-left rule
 403         adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge);
 404
 405         adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
 406     }
 407 };
 408
 409 //////////////////////////////////////////////////////////////////////////
 410 /// @brief Perform top left adjustments to evaluated triangle edges
 411 template <typename RT>
 412 struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>>
 413 {
 414     INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
 415     {
 416         adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
 417     }
 418 };
 419
 420 // max(abs(dz/dx), abs(dz,dy)
 421 INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
 422 {
 423     /*
 424     // evaluate i,j at (0,0)
 425     float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
 426     float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
 427
 428     // evaluate i,j at (1,0)
 429     float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
 430     float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
 431
 432     // compute dz/dx
 433     float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
 434     float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
 435     float dzdx = abs(d10 - d00);
 436
 437     // evaluate i,j at (0,1)
 438     float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
 439     float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
 440
 441     float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
 442     float dzdy = abs(d01 - d00);
 443     */
 444
 445     // optimized version of above
 446     float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0]));
 447     float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1]));
 448
 449     return std::max(dzdx, dzdy);
 450 }
 451
 452 INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
 453 {
 454     if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
 455     {
 456         return (1.0f / (1 << 24));
 457     }
 458     else if (pState->depthFormat == R16_UNORM)
 459     {
 460         return (1.0f / (1 << 16));
 461     }
 462     else
 463     {
 464         SWR_ASSERT(pState->depthFormat == R32_FLOAT);
 465
 466         // for f32 depth, factor = 2^(exponent(max(abs(z) - 23)
 467         float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
 468         uint32_t zMaxInt = *(uint32_t*)&zMax;
 469         zMaxInt &= 0x7f800000;
 470         zMax = *(float*)&zMaxInt;
 471
 472         return zMax * (1.0f / (1 << 23));
 473     }
 474 }
 475
 476 INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
 477 {
 478     if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0)
 479     {
 480         return 0.0f;
 481     }
 482
 483     float scale = pState->slopeScaledDepthBias;
 484     if (scale != 0.0f)
 485     {
 486         scale *= ComputeMaxDepthSlope(pTri);
 487     }
 488
 489     float bias = pState->depthBias;
 490     if (!pState->depthBiasPreAdjusted)
 491     {
 492         bias *= ComputeBiasFactor(pState, pTri, z);
 493     }
 494     bias += scale;
 495
 496     if (pState->depthBiasClamp > 0.0f)
 497     {
 498         bias = std::min(bias, pState->depthBiasClamp);
 499     }
 500     else if (pState->depthBiasClamp < 0.0f)
 501     {
 502         bias = std::max(bias, pState->depthBiasClamp);
 503     }
 504
 505     return bias;
 506 }
 507
// Prevent DCE (dead code elimination) by writing coverage mask from rasterizer to volatile.
// Thread-local sink, only compiled when KNOB_ENABLE_TOSS_POINTS is non-zero.
// NOTE(review): this uses __declspec(thread) directly while the TLS buffer declared just
// after it uses the THREAD macro, and it is a non-static definition - confirm both are
// intended for every compiler / include site this file is built with.
#if KNOB_ENABLE_TOSS_POINTS
__declspec(thread) volatile uint64_t gToss;
#endif
 512
 513 static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
 514 // try to avoid _chkstk insertions; make this thread local
 515 static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib];
 516
 517 INLINE
 518 void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
 519 {
 520     edge.a = a;
 521     edge.b = b;
 522
 523     // compute constant steps to adjacent quads
 524     edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE));
 525     edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE));
 526
 527     // compute constant steps to adjacent raster tiles
 528     edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE));
 529     edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE));
 530
 531     // compute quad offsets
 532     const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
 533     const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);
 534
 535     __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8);
 536     __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8);
 537     edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16);
 538
 539     // compute raster tile offsets
 540     const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0);
 541     const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0);
 542
 543     __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8);
 544     __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8);
 545     edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16);
 546 }
 547
 548 INLINE
 549 void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
 550 {
 551     ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge);
 552 }
 553
 554 //////////////////////////////////////////////////////////////////////////
 555 /// @brief Primary template definition used for partially specializing
 556 /// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel
 557 /// corner to sample position, and test for coverage
 558 /// @tparam sampleCount: multisample count
 559 template <typename NumSamplesT>
 560 INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
 561                             int32_t &mask0, int32_t &mask1, int32_t &mask2)
 562 {
 563     __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
 564     // evaluate edge equations at the tile multisample bounding box
 565     vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
 566     vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
 567     vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
 568     mask0 = _mm256_movemask_pd(vSampleBboxTest0);
 569     mask1 = _mm256_movemask_pd(vSampleBboxTest1);
 570     mask2 = _mm256_movemask_pd(vSampleBboxTest2);
 571 }
 572
 573 //////////////////////////////////////////////////////////////////////////
 574 /// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
 575 /// when only rasterizing a single coverage test point
 576 template <>
 577 INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* vEdgeFix16,
 578                                            int32_t &mask0, int32_t &mask1, int32_t &mask2)
 579 {
 580     mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
 581     mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
 582     mask2 = _mm256_movemask_pd(vEdgeFix16[2]);
 583 }
 584
 585 //////////////////////////////////////////////////////////////////////////
 586 /// @struct ComputeScissorEdges
 587 /// @brief Primary template definition. Allows the function to be generically
 588 /// called. When paired with below specializations, will result in an empty
 589 /// inlined function if scissor is not enabled
 590 /// @tparam RasterScissorEdgesT: is scissor enabled?
 591 /// @tparam IsConservativeT: is conservative rast enabled?
 592 /// @tparam RT: rasterizer traits
 593 template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT>
 594 struct ComputeScissorEdges
 595 {
 596     INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
 597                               EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){};
 598 };
 599
 600 //////////////////////////////////////////////////////////////////////////
 601 /// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
 602 /// specialization. Instantiated when conservative rast and scissor are enabled
 603 template <typename RT>
 604 struct ComputeScissorEdges<std::true_type, std::true_type, RT>
 605 {
 606     //////////////////////////////////////////////////////////////////////////
 607     /// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
 608     /// evaluate edge equations and offset them away from pixel center.
 609     INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
 610                               EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
 611     {
 612         // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
 613         SWR_RECT scissor;
 614         scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin);
 615         scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax);
 616         scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin);
 617         scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax);
 618
 619         POS topLeft{scissor.xmin, scissor.ymin};
 620         POS bottomLeft{scissor.xmin, scissor.ymax};
 621         POS topRight{scissor.xmax, scissor.ymin};
 622         POS bottomRight{scissor.xmax, scissor.ymax};
 623
 624         // construct 4 scissor edges in ccw direction
 625         ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
 626         ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
 627         ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
 628         ComputeEdgeData(topRight, topLeft, rastEdges[6]);
 629
 630         vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
 631         vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
 632         vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
 633         vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
 634
 635         // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing
 636         adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]);
 637         adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]);
 638         adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]);
 639         adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]);
 640
 641         // Upper left rule for scissor
 642         vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
 643         vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
 644     }
 645 };
 646
 647 //////////////////////////////////////////////////////////////////////////
 648 /// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
 649 /// specialization. Instantiated when scissor is enabled and conservative rast
 650 /// is disabled.
 651 template <typename RT>
 652 struct ComputeScissorEdges<std::true_type, std::false_type, RT>
 653 {
 654     //////////////////////////////////////////////////////////////////////////
 655     /// @brief Compute scissor edge vectors and evaluate edge equations
 656     INLINE ComputeScissorEdges(const SWR_RECT &, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
 657                               EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
 658     {
 659         const SWR_RECT &scissor = scissorBBox;
 660         POS topLeft{scissor.xmin, scissor.ymin};
 661         POS bottomLeft{scissor.xmin, scissor.ymax};
 662         POS topRight{scissor.xmax, scissor.ymin};
 663         POS bottomRight{scissor.xmax, scissor.ymax};
 664
 665         // construct 4 scissor edges in ccw direction
 666         ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
 667         ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
 668         ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
 669         ComputeEdgeData(topRight, topLeft, rastEdges[6]);
 670
 671         vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
 672         vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
 673         vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
 674         vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
 675
 676         // Upper left rule for scissor
 677         vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
 678         vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
 679     }
 680 };
 681
 682 //////////////////////////////////////////////////////////////////////////
 683 /// @brief Primary function template for TrivialRejectTest. Should
 684 /// never be called, but TemplateUnroller instantiates a few unused values,
 685 /// so it calls a runtime assert instead of a static_assert.
 686 template <typename ValidEdgeMaskT>
 687 INLINE bool TrivialRejectTest(const int, const int, const int)
 688 {
 689     SWR_INVALID("Primary templated function should never be called");
 690     return false;
 691 };
 692
 693 //////////////////////////////////////////////////////////////////////////
 694 /// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
 695 /// and edge 1 for trivial coverage reject
 696 template <>
 697 INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int)
 698 {
 699     return (!(mask0 && mask1)) ? true : false;
 700 };
 701
 702 //////////////////////////////////////////////////////////////////////////
 703 /// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
 704 /// and edge 2 for trivial coverage reject
 705 template <>
 706 INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2)
 707 {
 708     return (!(mask0 && mask2)) ? true : false;
 709 };
 710
 711 //////////////////////////////////////////////////////////////////////////
 712 /// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
 713 /// and edge 2 for trivial coverage reject
 714 template <>
 715 INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2)
 716 {
 717     return (!(mask1 && mask2)) ? true : false;
 718 };
 719
 720 //////////////////////////////////////////////////////////////////////////
 721 /// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
 722 /// primitive edges for trivial coverage reject
 723 template <>
 724 INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
 725 {
 726     return (!(mask0 && mask1 && mask2)) ? true : false;;
 727 };
 728
 729 //////////////////////////////////////////////////////////////////////////
 730 /// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
 731 /// point, so return false and rasterize against conservative BBox
 732 template <>
 733 INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int)
 734 {
 735     return false;
 736 };
 737
 738 //////////////////////////////////////////////////////////////////////////
 739 /// @brief Primary function template for TrivialAcceptTest. Always returns
 740 /// false, since it will only be called for degenerate tris, and as such
 741 /// will never cover the entire raster tile
 742 template <typename ScissorEnableT>
 743 INLINE bool TrivialAcceptTest(const int, const int, const int)
 744 {
 745     return false;
 746 };
 747
 748 //////////////////////////////////////////////////////////////////////////
 749 /// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all
 750 /// edge masks for a fully covered raster tile
 751 template <>
 752 INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2)
 753 {
 754     return ((mask0 & mask1 & mask2) == 0xf);
 755 };
 756
 757 //////////////////////////////////////////////////////////////////////////
 758 /// @brief Primary function template for GenerateSVInnerCoverage. Results
 759 /// in an empty function call if SVInnerCoverage isn't requested
 760 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
 761 struct GenerateSVInnerCoverage
 762 {
 763     INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*,  uint64_t &){};
 764 };
 765
 766 //////////////////////////////////////////////////////////////////////////
 767 /// @brief Specialization of GenerateSVInnerCoverage where all edges
 768 /// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated
 769 /// edge values from OuterConservative to InnerConservative and rasterizes.
 770 template <typename RT>
 771 struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
 772 {
 773     INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, uint32_t workerId, EDGE* pRastEdges, double* pStartQuadEdges,  uint64_t &innerCoverageMask)
 774     {
 775         SWR_CONTEXT *pContext = pDC->pContext;
 776
 777         double startQuadEdgesAdj[RT::NumEdgesT::value];
 778         for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
 779         {
 780             startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
 781         }
 782
 783         // not trivial accept or reject, must rasterize full tile
 784         AR_BEGIN(BERasterizePartial, pDC->drawId);
 785         innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges);
 786         AR_END(BERasterizePartial, 0);
 787     }
 788 };
 789
 790 //////////////////////////////////////////////////////////////////////////
 791 /// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
 792 /// in an empty function call if SVInnerCoverage isn't requested
 793 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
 794 struct UpdateEdgeMasksInnerConservative
 795 {
 796     INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*,
 797                                            const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){};
 798 };
 799
 800 //////////////////////////////////////////////////////////////////////////
 801 /// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
 802 /// are non-degenerate and SVInnerCoverage is requested. Offsets the edges
 803 /// evaluated at raster tile corners to inner conservative position and
 804 /// updates edge masks
 805 template <typename RT>
 806 struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
 807 {
 808     INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
 809                                            const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2)
 810     {
 811         __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]};
 812
 813         // instead of keeping 2 copies of evaluated edges around, just compensate for the outer
 814         // conservative evaluated edge when adjusting the edge in for inner conservative tests
 815         adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[0]);
 816         adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[1]);
 817         adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[2]);
 818
 819         UpdateEdgeMasks<typename RT::NumCoverageSamplesT>(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
 820     }
 821 };
 822
 823 //////////////////////////////////////////////////////////////////////////
 824 /// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage
 825 /// is requested but at least one edge is degenerate. Since a degenerate triangle cannot
 826 /// cover an entire raster tile, set mask0 to 0 to force it down the
 827 /// rastierizePartialTile path
 828 template <typename RT, typename ValidEdgeMaskT>
 829 struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
 830 {
 831     INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*,
 832                                    const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &)
 833     {
 834         // set one mask to zero to force the triangle down the rastierizePartialTile path
 835         mask0 = 0;
 836     }
 837 };
 838
     //////////////////////////////////////////////////////////////////////////
     /// @brief Rasterizes one binned triangle against one macrotile. Performs
     /// triangle setup, clips the triangle's bounding box to the scissor rect
     /// and to the macrotile, then walks the covered raster tiles, generating
     /// a coverage mask per coverage sample and calling the pixel backend for
     /// every raster tile that ends up with covered samples.
     /// @param pDC - draw context supplying the API state and backend functions
     /// @param workerId - worker id; forwarded to the backend and to GenerateSVInnerCoverage
     /// @param macroTile - macrotile id, decoded with MacroTileMgr::getTileIndices
     /// @param pDesc - work descriptor; reinterpreted as a TRIANGLE_WORK_DESC
 839 template <typename RT>
 840 void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
 841 {
         // NOTE(review): pContext is not referenced by name in this function;
         // presumably the AR_*/RDTSC_* macros consume it -- confirm before removing
 842     SWR_CONTEXT *pContext = pDC->pContext;
 843     const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
         // toss point: bail out before any setup or rasterization work is done
 844 #if KNOB_ENABLE_TOSS_POINTS
 845     if (KNOB_TOSS_BIN_TRIS)
 846     {
 847         return;
 848     }
 849 #endif
 850     AR_BEGIN(BERasterizeTriangle, pDC->drawId);
 851     AR_BEGIN(BETriangleSetup, pDC->drawId);
 852
 853     const API_STATE &state = GetApiState(pDC);
 854     const SWR_RASTSTATE &rastState = state.rastState;
 855     const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
 856
 857     OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
 858     triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
 859
 860     __m128 vX, vY, vZ, vRecipW;
 861
 862     // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
 863     // eg: vX = [x0 x1 x2 dc]
 864     vX = _mm_load_ps(workDesc.pTriBuffer);
 865     vY = _mm_load_ps(workDesc.pTriBuffer + 4);
 866     vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
 867     vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
 868
 869     // convert to fixed point
 870     static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, "Rasterizer expects 16.8 fixed point precision");
 871     __m128i vXi = fpToFixedPoint(vX);
 872     __m128i vYi = fpToFixedPoint(vY);
 873
 874     // quantize floating point position to fixed point precision
 875     // to prevent attribute creep around the triangle vertices
 876     vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
 877     vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
 878
 879     // triangle setup - A and B edge equation coefs
 880     __m128 vA, vB;
 881     triangleSetupAB(vX, vY, vA, vB);
 882
 883     __m128i vAi, vBi;
 884     triangleSetupABInt(vXi, vYi, vAi, vBi);
 885
 886     // determinant
 887     float det = calcDeterminantInt(vAi, vBi);
 888
 889     // Verts in Pixel Coordinate Space at this point
 890     // Det > 0 = CW winding order
 891     // Convert CW triangles to CCW
         // (done by negating both the float and the integer A/B coefs, and det itself)
 892     if (det > 0.0)
 893     {
 894         vA  = _mm_mul_ps(vA, _mm_set1_ps(-1));
 895         vB  = _mm_mul_ps(vB, _mm_set1_ps(-1));
 896         vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
 897         vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
 898         det = -det;
 899     }
 900
 901     __m128 vC;
 902     // Finish triangle setup - C edge coef
 903     triangleSetupC(vX, vY, vA, vB, vC);
 904
 905     if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
 906     {
 907         // If we have degenerate edge(s) to rasterize, set I and J coefs
 908         // to 0 for constant interpolation of attributes
 909         triDesc.I[0] = 0.0f;
 910         triDesc.I[1] = 0.0f;
 911         triDesc.I[2] = 0.0f;
 912         triDesc.J[0] = 0.0f;
 913         triDesc.J[1] = 0.0f;
 914         triDesc.J[2] = 0.0f;
 915
 916         // Degenerate triangles have no area
 917         triDesc.recipDet = 0.0f;
 918     }
 919     else
 920     {
 921         // only extract coefs for 2 of the barycentrics; the 3rd can be
 922         // determined from the barycentric equation:
 923         // i + j + k = 1 <=> k = 1 - j - i
             // I takes lane 1 and J takes lane 2 of each of the A, B and C vectors
 924         _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
 925         _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
 926         _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
 927         _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
 928         _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
 929         _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
 930
 931         // compute recipDet, used to calculate barycentric i and j in the backend
 932         triDesc.recipDet = 1.0f/det;
 933     }
 934
         // 1/w is stored relative to vertex 2: [0] = v0 - v2, [1] = v1 - v2, [2] = v2
 935     OSALIGNSIMD(float) oneOverW[4];
 936     _mm_store_ps(oneOverW, vRecipW);
 937     triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
 938     triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
 939     triDesc.OneOverW[2] = oneOverW[2];
 940
 941     // calculate perspective correct coefs per vertex attrib
 942     float* pPerspAttribs = perspAttribsTLS;
 943     float* pAttribs = workDesc.pAttribs;
 944     triDesc.pPerspAttribs = pPerspAttribs;
 945     triDesc.pAttribs = pAttribs;
 946     float *pRecipW = workDesc.pTriBuffer + 12;
 947     triDesc.pRecipW = pRecipW;
         // broadcast the 1/w of vertex 0, 1 and 2 respectively
 948     __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
 949     __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1);
 950     __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1);
         // each attribute is read as 12 floats: three groups of 4 components,
         // group n being scaled by the 1/w of vertex n; the output uses the same layout
 951     for(uint32_t i = 0; i < workDesc.numAttribs; i++)
 952     {
 953         __m128 attribA = _mm_load_ps(pAttribs);
 954         __m128 attribB = _mm_load_ps(pAttribs+=4);
 955         __m128 attribC = _mm_load_ps(pAttribs+=4);
 956         pAttribs+=4;
 957
 958         attribA = _mm_mul_ps(attribA, vOneOverWV0);
 959         attribB = _mm_mul_ps(attribB, vOneOverWV1);
 960         attribC = _mm_mul_ps(attribC, vOneOverWV2);
 961
 962         _mm_store_ps(pPerspAttribs, attribA);
 963         _mm_store_ps(pPerspAttribs+=4, attribB);
 964         _mm_store_ps(pPerspAttribs+=4, attribC);
 965         pPerspAttribs+=4;
 966     }
 967
 968     // compute bary Z
 969     // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
         // NOTE(review): the coefficients stored below are relative to vertex 2, not vertex 0:
         // Z[0] = z0 - z2, Z[1] = z1 - z2, Z[2] = z2 -- confirm against the backend's formula
 970     OSALIGNSIMD(float) a[4];
 971     _mm_store_ps(a, vZ);
 972     triDesc.Z[0] = a[0] - a[2];
 973     triDesc.Z[1] = a[1] - a[2];
 974     triDesc.Z[2] = a[2];
 975
 976     // add depth bias
 977     triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
 978
 979     // Calc bounding box of triangle
 980     OSALIGNSIMD(SWR_RECT) bbox;
 981     calcBoundingBoxInt(vXi, vYi, bbox);
 982
 983     const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
 984
 985     if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
 986     {
 987         // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
 988         bbox.xmin--;    bbox.xmax++;    bbox.ymin--;    bbox.ymax++;
 989         SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
 990                    "Conservative rast degenerate handling requires a valid scissor rect");
 991     }
 992
 993     // Intersect with scissor/viewport
 994     OSALIGNSIMD(SWR_RECT) intersect;
 995     intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
 996     intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
 997     intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
 998     intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);
 999
1000     triDesc.triFlags = workDesc.triFlags;
1001
1002     // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
1003     uint32_t macroX, macroY;
1004     MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
1005     int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
1006     int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
1007     int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
1008     int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
1009
1010     intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
1011     intersect.ymin = std::max(intersect.ymin, macroBoxTop);
1012     intersect.xmax = std::min(intersect.xmax, macroBoxRight);
1013     intersect.ymax = std::min(intersect.ymax, macroBoxBottom);
1014
1015     SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && intersect.ymax >= 0);
1016
1017     AR_END(BETriangleSetup, 0);
1018
1019     // update triangle desc
         // convert the fixed point intersect rect to raster tile indices
1020     uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1021     uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1022     uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1023     uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1024     uint32_t numTilesX = maxTileX - minTileX + 1;
1025     uint32_t numTilesY = maxTileY - minTileY + 1;
1026
         // the counts are unsigned, so this only triggers when max == min - 1; for any
         // larger gap the counts wrap around and the tile loops below simply never execute
1027     if (numTilesX == 0 || numTilesY == 0)
1028     {
1029         RDTSC_EVENT(BEEmptyTriangle, 1, 0);
1030         AR_END(BERasterizeTriangle, 1);
1031         return;
1032     }
1033
1034     AR_BEGIN(BEStepSetup, pDC->drawId);
1035
1036     // Step to pixel center of top-left pixel of the triangle bbox
1037     // Align intersect bbox (top/left) to raster tile's (top/left).
1038     int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
1039     int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));
1040
1041     // convenience typedef
1042     typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT;
1043
1044     // single sample rasterization evaluates edges at pixel center,
1045     // multisample evaluates edges UL pixel corner and steps to each sample position
1046     if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
1047     {
1048         // Add 0.5, in fixed point, to offset to pixel center
1049         x += (FIXED_POINT_SCALE / 2);
1050         y += (FIXED_POINT_SCALE / 2);
1051     }
1052
1053     __m128i vTopLeftX = _mm_set1_epi32(x);
1054     __m128i vTopLeftY = _mm_set1_epi32(y);
1055
1056     // evaluate edge equations at top-left pixel using 64bit math
1057     //
1058     // line = Ax + By + C
1059     // solving for C:
1060     // C = -Ax - By
1061     // we know x0 and y0 are on the line; plug them in:
1062     // C = -Ax0 - By0
1063     // plug C back into line equation:
1064     // line = Ax + By - Ax0 - By0
1065     // line = A(x - x0) + B(y - y0)
1066     // dX = (x-x0), dY = (y-y0)
1067     // so all this simplifies to
1068     // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
1069
1070     __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
1071     __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
1072
1073     // evaluate A(dx) and B(dY) for all points
         // (the integer operands are widened to double before multiplying)
1074     __m256d vAipd = _mm256_cvtepi32_pd(vAi);
1075     __m256d vBipd = _mm256_cvtepi32_pd(vBi);
1076     __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
1077     __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
1078
1079     __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
1080     __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
1081     __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
1082
1083     // apply any edge adjustments(top-left, crast, etc)
1084     adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);
1085
1086     // broadcast respective edge results to all lanes
         // slots 0..2 hold the triangle edges; ComputeScissorEdges below receives the
         // same array and may fill further slots
1087     double* pEdge = (double*)&vEdge;
1088     __m256d vEdgeFix16[7];
1089     vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
1090     vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
1091     vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);
1092
1093     OSALIGNSIMD(int32_t) aAi[4], aBi[4];
1094     _mm_store_si128((__m128i*)aAi, vAi);
1095     _mm_store_si128((__m128i*)aBi, vBi);
1096     EDGE rastEdges[RT::NumEdgesT::value];
1097
1098     // Compute and store triangle edge data
1099     ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
1100     ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
1101     ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);
1102
1103     // Compute and store scissor edge data if the scissor needs to be rasterized
1104     ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
1105                        (bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
1106
1107     // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
1108     // used for testing if entire raster tile is inside a triangle
1109     for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1110     {
1111         vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
1112     }
1113
1114     // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
1115     // step sample positions to the raster tile bbox of multisample points
1116     // min(xSamples),min(ySamples)  ------  max(xSamples),min(ySamples)
1117     //                             |      |
1118     //                             |      |
1119     // min(xSamples),max(ySamples)  ------  max(xSamples),max(ySamples)
         // NOTE(review): vEdgeTileBbox is only written when more than one coverage sample
         // is used; presumably the single sample UpdateEdgeMasks ignores it -- confirm
1120     __m256d vEdgeTileBbox[3];
1121     if (NumCoverageSamplesT::value > 1)
1122     {
1123         const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
1124         const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX();
1125         const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY();
1126
1127         __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
1128         __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
1129
1130         // step edge equation tests from Tile
1131         // used for testing if entire raster tile is inside a triangle
1132         for (uint32_t e = 0; e < 3; ++e)
1133         {
1134             __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
1135             __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
1136             vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
1137
1138             // adjust for msaa tile bbox edges outward for conservative rast, if enabled
1139             adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdgeTileBbox[e]);
1140         }
1141     }
1142
1143     AR_END(BEStepSetup, 0);
1144
1145     uint32_t tY = minTileY;
1146     uint32_t tX = minTileX;
1147     uint32_t maxY = maxTileY;
1148     uint32_t maxX = maxTileX;
1149
         // renderBuffers follows the current raster tile; currentRenderBufferRow is the
         // row-start copy that StepRasterTileY advances and uses to reset renderBuffers
1150     RenderOutputBuffers renderBuffers, currentRenderBufferRow;
1151     GetRenderHotTiles<RT::MT::numSamples>(pDC, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
1152     currentRenderBufferRow = renderBuffers;
1153
1154     // rasterize and generate coverage masks per sample
1155     for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
1156     {
             // remember the edge values at the first tile of this row; the Y step at the
             // bottom of the loop restarts from them
1157         __m256d vStartOfRowEdge[RT::NumEdgesT::value];
1158         for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1159         {
1160             vStartOfRowEdge[e] = vEdgeFix16[e];
1161         }
1162
1163         for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
1164         {
1165             triDesc.anyCoveredSamples = 0;
1166
1167             // is the corner of the edge outside of the raster tile? (vEdge < 0)
1168             int mask0, mask1, mask2;
1169             UpdateEdgeMasks<NumCoverageSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);
1170
1171             for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++)
1172             {
1173                 // trivial reject, at least one edge has all 4 corners of raster tile outside
1174                 bool trivialReject = TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);
1175
1176                 if (!trivialReject)
1177                 {
1178                     // trivial accept mask
1179                     triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
1180
1181                     // Update the raster tile edge masks based on inner conservative edge offsets, if enabled
1182                     UpdateEdgeMasksInnerConservative<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>
1183                         (vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);
1184
1185                     // @todo Make this a bit smarter to allow use of trivial accept when:
1186                     //   1) scissor/vp intersection rect is raster tile aligned
1187                     //   2) raster tile is entirely within scissor/vp intersection rect
1188                     if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2))
1189                     {
1190                         // trivial accept, all 4 corners of all 3 edges are negative
1191                         // i.e. raster tile completely inside triangle
1192                         triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
1193                         if(std::is_same<typename RT::InputCoverageT, InnerConservativeCoverageT>::value)
1194                         {
1195                             triDesc.innerCoverageMask = 0xffffffffffffffffULL;
1196                         }
1197                         RDTSC_EVENT(BETrivialAccept, 1, 0);
1198                     }
1199                     else
1200                     {
1201                         __m256d vEdgeAtSample[RT::NumEdgesT::value];
1202                         if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
1203                         {
1204                             // should get optimized out for single sample case (global value numbering or copy propagation)
1205                             for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1206                             {
1207                                 vEdgeAtSample[e] = vEdgeFix16[e];
1208                             }
1209                         }
1210                         else
1211                         {
1212                             const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
1213                             __m128i vSampleOffsetXh = samplePos.vXi(sampleNum);
1214                             __m128i vSampleOffsetYh = samplePos.vYi(sampleNum);
1215                             __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
1216                             __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
1217
1218                             // step edge equation tests from UL tile corner to pixel sample position
1219                             for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1220                             {
1221                                 __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
1222                                 __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
1223                                 vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
1224                                 vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
1225                             }
1226                         }
1227
                             // store only the lowest double of each edge vector as the scalar
                             // start value; the mask enables just the two lowest 32-bit elements
1228                         double startQuadEdges[RT::NumEdgesT::value];
1229                         const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
1230                         for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1231                         {
1232                             _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
1233                         }
1234
1235                         // not trivial accept or reject, must rasterize full tile
1236                         AR_BEGIN(BERasterizePartial, pDC->drawId);
1237                         triDesc.coverageMask[sampleNum] = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdges, rastEdges);
1238                         AR_END(BERasterizePartial, 0);
1239
1240                         triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];
1241
1242                         // Output SV InnerCoverage, if needed
1243                         GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
1244                     }
1245                 }
1246                 else
1247                 {
1248                     // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything
1249                     if(NumCoverageSamplesT::value > 1)
1250                     {
1251                         triDesc.coverageMask[sampleNum] = 0;
1252                     }
1253                     RDTSC_EVENT(BETrivialReject, 1, 0);
1254                 }
1255             }
1256
1257 #if KNOB_ENABLE_TOSS_POINTS
1258             if(KNOB_TOSS_RS)
1259             {
1260                 gToss = triDesc.coverageMask[0];
1261             }
1262             else
1263 #endif
                 // when toss points are compiled in, the 'else' above binds to this 'if'
1264             if(triDesc.anyCoveredSamples)
1265             {
1266                 // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
1267                 // copy conservative coverage result to all samples
1268                 if(RT::IsConservativeT::value)
1269                 {
1270                     auto copyCoverage = [&](int sample){triDesc.coverageMask[sample] = triDesc.coverageMask[0]; };
1271                     UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
1272                 }
1273
                     // the backend is handed the raster tile origin in pixels
1274                 AR_BEGIN(BEPixelBackend, pDC->drawId);
1275                 backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
1276                 AR_END(BEPixelBackend, 0);
1277             }
1278
1279             // step to the next tile in X
1280             for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1281             {
1282                 vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
1283             }
1284             StepRasterTileX<RT>(state.colorHottileEnable, renderBuffers);
1285         }
1286
1287         // step to the next tile in Y
1288         for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1289         {
1290             vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
1291         }
1292         StepRasterTileY<RT>(state.colorHottileEnable, renderBuffers, currentRenderBufferRow);
1293     }
1294
1295     AR_END(BERasterizeTriangle, 1);
1296 }
1297
1298 // Get pointers to hot tile memory for color RT, depth, stencil
1299 template <uint32_t numSamples>
1300 void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex)
1301 {
1302     const API_STATE& state = GetApiState(pDC);
1303     SWR_CONTEXT *pContext = pDC->pContext;
1304
1305     uint32_t mx, my;
1306     MacroTileMgr::getTileIndices(macroID, mx, my);
1307     tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
1308     tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;
1309
1310     // compute tile offset for active hottile buffers
1311     const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
1312     uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1313     offset*=numSamples;
1314
1315     unsigned long rtSlot = 0;
1316     uint32_t colorHottileEnableMask = state.colorHottileEnable;
1317     while(_BitScanForward(&rtSlot, colorHottileEnableMask))
1318     {
1319         HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true,
1320             numSamples, renderTargetArrayIndex);
1321         pColor->state = HOTTILE_DIRTY;
1322         renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
1323
1324         colorHottileEnableMask &= ~(1 << rtSlot);
1325     }
1326     if(state.depthHottileEnable)
1327     {
1328         const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
1329         uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1330         offset*=numSamples;
1331         HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true,
1332             numSamples, renderTargetArrayIndex);
1333         pDepth->state = HOTTILE_DIRTY;
1334         SWR_ASSERT(pDepth->pBuffer != nullptr);
1335         renderBuffers.pDepth = pDepth->pBuffer + offset;
1336     }
1337     if(state.stencilHottileEnable)
1338     {
1339         const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
1340         uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1341         offset*=numSamples;
1342         HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true,
1343             numSamples, renderTargetArrayIndex);
1344         pStencil->state = HOTTILE_DIRTY;
1345         SWR_ASSERT(pStencil->pBuffer != nullptr);
1346         renderBuffers.pStencil = pStencil->pBuffer + offset;
1347     }
1348 }
1349
1350 template <typename RT>
1351 INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers)
1352 {
1353     DWORD rt = 0;
1354     while (_BitScanForward(&rt, colorHotTileMask))
1355     {
1356         colorHotTileMask &= ~(1 << rt);
1357         buffers.pColor[rt] += RT::colorRasterTileStep;
1358     }
1359
1360     buffers.pDepth += RT::depthRasterTileStep;
1361     buffers.pStencil += RT::stencilRasterTileStep;
1362 }
1363
1364 template <typename RT>
1365 INLINE void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow)
1366 {
1367     DWORD rt = 0;
1368     while (_BitScanForward(&rt, colorHotTileMask))
1369     {
1370         colorHotTileMask &= ~(1 << rt);
1371         startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
1372         buffers.pColor[rt] = startBufferRow.pColor[rt];
1373     }
1374     startBufferRow.pDepth += RT::depthRasterTileRowStep;
1375     buffers.pDepth = startBufferRow.pDepth;
1376
1377     startBufferRow.pStencil += RT::stencilRasterTileRowStep;
1378     buffers.pStencil = startBufferRow.pStencil;
1379 }
1380