// Added few more stubs so that control reaches to DestroyDevice().
// [mesa.git] / src / gallium / drivers / swr / rasterizer / core / rasterizer_impl.h
1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file rasterizer.cpp
24 *
25 * @brief Implementation for the rasterizer.
26 *
27 ******************************************************************************/
28
#include <algorithm>
#include <cstring>
#include <vector>
31
32 #include "rasterizer.h"
33 #include "rdtsc_core.h"
34 #include "backend.h"
35 #include "utils.h"
36 #include "frontend.h"
37 #include "tilemgr.h"
38 #include "memory/tilingtraits.h"
39
// Table of rasterizer entry points, defined elsewhere; indexed by multisample
// count, input coverage type, number of valid triangle edges, and scissor
// enable.
// NOTE(review): the meaning of the two bare [2] axes is not visible in this
// file - confirm against the table initializer before reordering indices.
extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT]
                                     [STATE_VALID_TRI_EDGE_COUNT][2];

// Forward declaration: fetches the render-target hot tiles for a macrotile.
template <uint32_t numSamples = 1>
void GetRenderHotTiles(DRAW_CONTEXT* pDC,
                       uint32_t workerId,
                       uint32_t macroID,
                       uint32_t x,
                       uint32_t y,
                       RenderOutputBuffers& renderBuffers,
                       uint32_t renderTargetArrayIndex);
// Forward declaration: steps hot-tile buffer pointers one raster tile in +x.
template <typename RT>
void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers);
// Forward declaration: steps hot-tile buffer pointers one raster tile in +y,
// restarting from the saved start-of-row buffer state.
template <typename RT>
void StepRasterTileY(uint32_t colorHotTileMask,
                     RenderOutputBuffers& buffers,
                     RenderOutputBuffers& startBufferRow);
57
// Expands 4 scalar 0/1 flags into a __m256d initializer whose lane i holds
// -(flag): 0.0 when the flag is clear, -1.0 (sign bit set) when it is set.
// Arguments are listed high-lane first, so MASKTOVEC(b3, b2, b1, b0) puts b0
// into lane 0 - matching the bit order produced by _mm256_movemask_pd.
#define MASKTOVEC(i3, i2, i1, i0) \
    { \
        -i0, -i1, -i2, -i3 \
    }
// Lookup table: entry m has the sign bit set in double lane i iff bit i of m
// is set. Used to convert a 4-bit movemask result back into a per-lane mask
// suitable for _mm256_blendv_pd (which selects on each lane's sign bit).
static const __m256d gMaskToVecpd[] = {
    MASKTOVEC(0, 0, 0, 0),
    MASKTOVEC(0, 0, 0, 1),
    MASKTOVEC(0, 0, 1, 0),
    MASKTOVEC(0, 0, 1, 1),
    MASKTOVEC(0, 1, 0, 0),
    MASKTOVEC(0, 1, 0, 1),
    MASKTOVEC(0, 1, 1, 0),
    MASKTOVEC(0, 1, 1, 1),
    MASKTOVEC(1, 0, 0, 0),
    MASKTOVEC(1, 0, 0, 1),
    MASKTOVEC(1, 0, 1, 0),
    MASKTOVEC(1, 0, 1, 1),
    MASKTOVEC(1, 1, 0, 0),
    MASKTOVEC(1, 1, 0, 1),
    MASKTOVEC(1, 1, 1, 0),
    MASKTOVEC(1, 1, 1, 1),
};
80
//////////////////////////////////////////////////////////////////////////
/// @brief Simple 2D integer position used for edge construction.
struct POS
{
    int32_t x, y;
};
85
//////////////////////////////////////////////////////////////////////////
/// @brief Precomputed per-edge data: equation coefficients plus constant
/// stepping deltas and SIMD offsets used while sweeping quads and tiles.
struct EDGE
{
    double a, b;            // a, b edge coefficients in fix8
    double stepQuadX;       // step to adjacent horizontal quad in fix16
    double stepQuadY;       // step to adjacent vertical quad in fix16
    double stepRasterTileX; // step to adjacent horizontal raster tile in fix16
    double stepRasterTileY; // step to adjacent vertical raster tile in fix16

    __m256d vQuadOffsets;      // offsets for 4 samples of a quad
    __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
};
97
//////////////////////////////////////////////////////////////////////////
/// @brief rasterize a raster tile partially covered by the triangle
/// @param pDC - draw context (unused in this routine other than plumbing)
/// @param startEdges - edge equations evaluated at the tile's starting quad
/// @param pRastEdges - per-edge coefficients, quad offsets and quad steps
/// @return 64-bit coverage mask: 4 bits per 2x2 quad, 16 quads per 8x8 tile
/// @tparam NumEdges - number of edge equations (3 tri edges + up to 4 scissor)
/// @tparam EdgeMaskT - which triangle edges are valid (degenerate edges are
///         skipped by the UnrollerLMask machinery)
template <uint32_t NumEdges, typename EdgeMaskT>
INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT* pDC,
                                     double startEdges[NumEdges],
                                     EDGE* pRastEdges)
{
    // accumulated coverage; 4 bits are ORed in per quad visited
    uint64_t coverageMask = 0;

    __m256d vEdges[NumEdges]; // edge equations at the 4 pixels of the current quad
    __m256d vStepX[NumEdges]; // per-edge delta to the next quad in +x
    __m256d vStepY[NumEdges]; // per-edge delta to the next quad in +y

    for (uint32_t e = 0; e < NumEdges; ++e)
    {
        // Step to the pixel sample locations of the 1st quad
        vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);

        // compute step to next quad (mul by 2 in x and y direction)
        vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
        vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
    }

    // fast unrolled version for 8x8 tile
#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
    int edgeMask[NumEdges]; // per-edge 4-bit sign mask for the current quad
    uint64_t mask;          // AND of all edge masks for the current quad

    // _mm256_movemask_pd gathers the sign bit of each double lane
    auto eval_lambda = [&](int e) { edgeMask[e] = _mm256_movemask_pd(vEdges[e]); };
    auto update_lambda = [&](int e) { mask &= edgeMask[e]; };
    auto incx_lambda = [&](int e) { vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]); };
    auto incy_lambda = [&](int e) { vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]); };
    auto decx_lambda = [&](int e) { vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]); };

    // evaluate which pixels in the quad are covered
#define EVAL UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);

    // update coverage mask
    // if edge 0 is degenerate and will be skipped; init the mask
#define UPDATE_MASK(bit) \
    if (std::is_same<EdgeMaskT, E1E2ValidT>::value || \
        std::is_same<EdgeMaskT, NoEdgesValidT>::value) \
    { \
        mask = 0xf; \
    } \
    else \
    { \
        mask = edgeMask[0]; \
    } \
    UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
    coverageMask |= (mask << bit);

    // step in the +x direction to the next quad
#define INCX UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);

    // step in the +y direction to the next quad
#define INCY UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);

    // step in the -x direction to the next quad
#define DECX UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);

    // sweep 2x2 quad back and forth through the raster tile,
    // computing coverage masks for the entire tile
    // (serpentine order; the UPDATE_MASK bit argument places each quad's 4
    // coverage bits at its fixed position within the 64-bit tile mask)

    // raster tile
    //   0  1  2  3  4  5  6  7
    //  x   x
    //  x   x ------------------>
    //  x   x                   |
    //  <-----------------x   x V
    //  ..

    // row 0
    EVAL;
    UPDATE_MASK(0);
    INCX;
    EVAL;
    UPDATE_MASK(4);
    INCX;
    EVAL;
    UPDATE_MASK(8);
    INCX;
    EVAL;
    UPDATE_MASK(12);
    INCY;

    // row 1 (swept right-to-left, hence the descending bit offsets)
    EVAL;
    UPDATE_MASK(28);
    DECX;
    EVAL;
    UPDATE_MASK(24);
    DECX;
    EVAL;
    UPDATE_MASK(20);
    DECX;
    EVAL;
    UPDATE_MASK(16);
    INCY;

    // row 2
    EVAL;
    UPDATE_MASK(32);
    INCX;
    EVAL;
    UPDATE_MASK(36);
    INCX;
    EVAL;
    UPDATE_MASK(40);
    INCX;
    EVAL;
    UPDATE_MASK(44);
    INCY;

    // row 3 (right-to-left again)
    EVAL;
    UPDATE_MASK(60);
    DECX;
    EVAL;
    UPDATE_MASK(56);
    DECX;
    EVAL;
    UPDATE_MASK(52);
    DECX;
    EVAL;
    UPDATE_MASK(48);
#else
    // generic path for other tile dimensions: row-major quad sweep
    uint32_t bit = 0;
    for (uint32_t y = 0; y < KNOB_TILE_Y_DIM / 2; ++y)
    {
        // remember the row start so we can step down from it after the row
        __m256d vStartOfRowEdge[NumEdges];
        for (uint32_t e = 0; e < NumEdges; ++e)
        {
            vStartOfRowEdge[e] = vEdges[e];
        }

        for (uint32_t x = 0; x < KNOB_TILE_X_DIM / 2; ++x)
        {
            int edgeMask[NumEdges];
            for (uint32_t e = 0; e < NumEdges; ++e)
            {
                edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
            }

            // a quad sample is covered only if every edge mask covers it
            uint64_t mask = edgeMask[0];
            for (uint32_t e = 1; e < NumEdges; ++e)
            {
                mask &= edgeMask[e];
            }
            coverageMask |= (mask << bit);

            // step to the next pixel in the x
            for (uint32_t e = 0; e < NumEdges; ++e)
            {
                vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
            }
            bit += 4;
        }

        // step to the next row
        for (uint32_t e = 0; e < NumEdges; ++e)
        {
            vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
        }
    }
#endif
    return coverageMask;
}
// Top-left fill rule:
//   Top:  a horizontal edge lying above the other edges (in pixel space) is a 'top' edge.
//   Left: a non-horizontal edge on the left side of the triangle is a 'left' edge.
// A sample exactly on a top or left edge counts as inside; a sample exactly on
// any other edge counts as outside. "Other" edges are therefore:
//   !(horizontal && top)  -> non-horizontal && below, and
//   horizontal && right.
INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d& vEdge)
{
    // if vA < 0, vC--
    // if vA == 0 && vB < 0, vC--

    __m256d vEdgeOut = vEdge;
    // subtracting 1 (one fixed-point LSB) biases samples lying exactly on the
    // edge (edge == 0) off of it
    __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));

    // if vA < 0 (line is not horizontal and below)
    // movemask of the reinterpreted ints picks up each lane's sign bit
    int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));

    // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
    __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
    int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
    msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));

    // if either of these are true and we're on the line (edge == 0), bump it outside the line;
    // gMaskToVecpd turns the 4-bit mask into a per-lane sign mask for blendv
    vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
}
296
//////////////////////////////////////////////////////////////////////////
/// @brief calculates difference in precision between the result of manh
/// calculation and the edge precision, based on compile time trait values
/// @return number of fractional bits by which the manhattan-distance result
///         exceeds edge precision (i.e. the right-shift needed to convert)
template <typename RT>
constexpr int64_t ManhToEdgePrecisionAdjust()
{
    static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >=
                      RT::EdgePrecisionT::BitsT::value,
                  "Inadequate precision of result of manh calculation ");
    return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) -
            RT::EdgePrecisionT::BitsT::value);
}
309
//////////////////////////////////////////////////////////////////////////
/// @struct adjustEdgeConservative
/// @brief Primary template definition used for partially specializing
/// the adjustEdgeConservative function. This struct should never
/// be instantiated.
/// @tparam RT: rasterizer traits
/// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
template <typename RT, typename ConservativeEdgeOffsetT>
struct adjustEdgeConservative
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs calculations to adjust each edge of a triangle away
    /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
    /// direction.
    ///
    /// Uncertainty regions arise from fixed point rounding, which
    /// can snap a vertex +/- by min fixed point value.
    /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
    /// This allows the rasterizer to test for coverage only at the pixel center,
    /// instead of having to test individual pixel corners for conservative coverage
    INLINE adjustEdgeConservative(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge)
    {
        // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge
        // away from the pixel center (in the direction of the edge normal A/B)

        // edge = Ax + Bx + C - (manh/e)
        // manh = manhattan distance = abs(A) + abs(B)
        // e = absolute rounding error from snapping from float to fixed point precision

        // 'fixed point' multiply (in double to be avx1 friendly)
        // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
        __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)),
                vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
        __m256d manh =
            _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
                          _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));

        static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >=
                          RT::EdgePrecisionT::BitsT::value,
                      "Inadequate precision of result of manh calculation ");

        // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the
        // same precision since we're doing fixed math in double format, multiply by multiples of
        // 1/2 instead of a bit shift right
        // NOTE(review): this multiplies by (adjust * 0.5) rather than 2^-adjust;
        // the two only agree when the precision adjustment is a single bit -
        // confirm against the trait values before changing any precisions.
        manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5));

        // move the edge away from the pixel center by the required conservative precision + 1/2
        // pixel this allows the rasterizer to do a single conservative coverage test to see if the
        // primitive intersects the pixel at all
        vEdge = _mm256_sub_pd(vEdge, manh);
    };
};
362
//////////////////////////////////////////////////////////////////////////
/// @brief adjustEdgeConservative specialization where no edge offset is needed.
/// Intentionally a no-op: with a zero conservative offset the evaluated edges
/// are already correct, so this inlines away to nothing.
template <typename RT>
struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>>
{
    INLINE adjustEdgeConservative(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge){};
};
370
//////////////////////////////////////////////////////////////////////////
/// @brief calculates the distance a degenerate BBox needs to be adjusted
/// for conservative rast based on compile time trait values
/// @return scissor-edge offset in conservative precision
template <typename RT>
constexpr int64_t ConservativeScissorOffset()
{
    static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0,
                  "Rasterizer precision > conservative precision");
    // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox
    // when calculating scissor edges
    typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1>
        DegenerateEdgeOffsetT;
    // 1/2 pixel edge offset + conservative offset - degenerateTriangle
    // (the degenerate adjustment is scaled up from rasterizer to conservative precision)
    return RT::ConservativeEdgeOffsetT::value -
           (DegenerateEdgeOffsetT::value
            << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
}
388
389 //////////////////////////////////////////////////////////////////////////
390 /// @brief Performs calculations to adjust each a vector of evaluated edges out
391 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
392 /// direction.
393 template <typename RT>
394 INLINE void adjustScissorEdge(const double a, const double b, __m256d& vEdge)
395 {
396 int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
397 int64_t manh =
398 ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >>
399 ManhToEdgePrecisionAdjust<RT>();
400 vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh));
401 };
402
403 //////////////////////////////////////////////////////////////////////////
404 /// @brief Performs calculations to adjust each a scalar evaluated edge out
405 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
406 /// direction.
407 template <typename RT, typename OffsetT>
408 INLINE double adjustScalarEdge(const double a, const double b, const double Edge)
409 {
410 int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
411 int64_t manh =
412 ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>();
413 return (Edge - manh);
414 };
415
//////////////////////////////////////////////////////////////////////////
/// @brief Perform any needed adjustments to evaluated triangle edges:
/// conservative offset first, then the top-left fill rule.
template <typename RT, typename EdgeOffsetT>
struct adjustEdgesFix16
{
    INLINE adjustEdgesFix16(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge)
    {
        static_assert(
            std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
            "Edge equation expected to be in x.16 fixed point");

        static_assert(RT::IsConservativeT::value,
                      "Edge offset assumes conservative rasterization is enabled");

        // need to apply any edge offsets before applying the top-left rule
        adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge);

        adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
    }
};
436
//////////////////////////////////////////////////////////////////////////
/// @brief Perform top left adjustments to evaluated triangle edges.
/// Zero-offset specialization: no conservative offset is applied.
template <typename RT>
struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>>
{
    INLINE adjustEdgesFix16(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge)
    {
        adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
    }
};
447
// max(abs(dz/dx), abs(dz/dy))
INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
{
    /*
    // Reference implementation kept for clarity:
    // evaluate i,j at (0,0)
    float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
    float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];

    // evaluate i,j at (1,0)
    float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
    float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];

    // compute dz/dx
    float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
    float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
    float dzdx = abs(d10 - d00);

    // evaluate i,j at (0,1)
    float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
    float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];

    float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
    float dzdy = abs(d01 - d00);
    */

    // optimized version of above: the (1,0)/(0,1) differences reduce to the
    // I/J coefficients, scaled by the reciprocal determinant
    float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0]));
    float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1]));

    return std::max(dzdx, dzdy);
}
479
480 INLINE float
481 ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
482 {
483 if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
484 {
485 return (1.0f / (1 << 24));
486 }
487 else if (pState->depthFormat == R16_UNORM)
488 {
489 return (1.0f / (1 << 16));
490 }
491 else
492 {
493 SWR_ASSERT(pState->depthFormat == R32_FLOAT);
494
495 // for f32 depth, factor = 2^(exponent(max(abs(z) - 23)
496 float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
497 uint32_t zMaxInt = *(uint32_t*)&zMax;
498 zMaxInt &= 0x7f800000;
499 zMax = *(float*)&zMaxInt;
500
501 return zMax * (1.0f / (1 << 23));
502 }
503 }
504
505 INLINE float
506 ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
507 {
508 if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0)
509 {
510 return 0.0f;
511 }
512
513 float scale = pState->slopeScaledDepthBias;
514 if (scale != 0.0f)
515 {
516 scale *= ComputeMaxDepthSlope(pTri);
517 }
518
519 float bias = pState->depthBias;
520 if (!pState->depthBiasPreAdjusted)
521 {
522 bias *= ComputeBiasFactor(pState, pTri, z);
523 }
524 bias += scale;
525
526 if (pState->depthBiasClamp > 0.0f)
527 {
528 bias = std::min(bias, pState->depthBiasClamp);
529 }
530 else if (pState->depthBiasClamp < 0.0f)
531 {
532 bias = std::max(bias, pState->depthBiasClamp);
533 }
534
535 return bias;
536 }
537
// Prevent DCE by writing coverage mask from rasterizer to volatile
#if KNOB_ENABLE_TOSS_POINTS
// NOTE(review): __declspec(thread) is MSVC-specific syntax - confirm this
// path is only compiled on MSVC builds (elsewhere the file uses THREAD).
__declspec(thread) volatile uint64_t gToss;
#endif

// 3 vertices per triangle, 4 float components per attribute slot
static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
// try to avoid _chkstk insertions; make this thread local
// (scratch buffer for perspective-divided attributes of the current triangle)
static THREAD
    OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib];
547
//////////////////////////////////////////////////////////////////////////
/// @brief Fills in an EDGE from fix8 coefficients a, b: constant quad/tile
/// stepping deltas plus SIMD offsets for quad samples and tile corners.
INLINE
void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
{
    edge.a = a;
    edge.b = b;

    // compute constant steps to adjacent quads
    edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE));
    edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE));

    // compute constant steps to adjacent raster tiles
    edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE));
    edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE));

    // compute quad offsets; _mm256_set_pd lists lanes high-to-low, so the
    // four lanes hold pixel offsets (0,0), (1,0), (0,1), (1,1)
    const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
    const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);

    __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8);
    __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8);
    edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16);

    // compute raster tile offsets: edge deltas to the four corner pixels of a tile
    const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd(
        (KNOB_TILE_X_DIM - 1) * FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1) * FIXED_POINT_SCALE, 0);
    const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd(
        (KNOB_TILE_Y_DIM - 1) * FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1) * FIXED_POINT_SCALE, 0, 0);

    __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8);
    __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8);
    edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16);
}
580
//////////////////////////////////////////////////////////////////////////
/// @brief Builds an EDGE for the directed edge p0 -> p1:
/// a = (p0.y - p1.y), b = (p1.x - p0.x), i.e. the edge normal coefficients.
INLINE
void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
{
    ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge);
}
586
587 //////////////////////////////////////////////////////////////////////////
588 /// @brief Primary template definition used for partially specializing
589 /// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel
590 /// corner to sample position, and test for coverage
591 /// @tparam sampleCount: multisample count
592 template <typename NumSamplesT>
593 INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3],
594 const __m256d* vEdgeFix16,
595 int32_t& mask0,
596 int32_t& mask1,
597 int32_t& mask2)
598 {
599 __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
600 // evaluate edge equations at the tile multisample bounding box
601 vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
602 vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
603 vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
604 mask0 = _mm256_movemask_pd(vSampleBboxTest0);
605 mask1 = _mm256_movemask_pd(vSampleBboxTest1);
606 mask2 = _mm256_movemask_pd(vSampleBboxTest2);
607 }
608
//////////////////////////////////////////////////////////////////////////
/// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
/// when only rasterizing a single coverage test point
template <>
INLINE void UpdateEdgeMasks<SingleSampleT>(
    const __m256d (&)[3], const __m256d* vEdgeFix16, int32_t& mask0, int32_t& mask1, int32_t& mask2)
{
    // no sample offset needed; capture each edge's lane sign bits directly
    mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
    mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
    mask2 = _mm256_movemask_pd(vEdgeFix16[2]);
}
620
//////////////////////////////////////////////////////////////////////////
/// @struct ComputeScissorEdges
/// @brief Primary template definition. Allows the function to be generically
/// called. When paired with below specializations, will result in an empty
/// inlined function if scissor is not enabled
/// @tparam RasterScissorEdgesT: is scissor enabled?
/// @tparam IsConservativeT: is conservative rast enabled?
/// @tparam RT: rasterizer traits
template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT>
struct ComputeScissorEdges
{
    // Intentionally empty: scissor edges are only produced by the
    // scissor-enabled specializations below.
    INLINE ComputeScissorEdges(const SWR_RECT& triBBox,
                               const SWR_RECT& scissorBBox,
                               const int32_t x,
                               const int32_t y,
                               EDGE (&rastEdges)[RT::NumEdgesT::value],
                               __m256d (&vEdgeFix16)[7]){};
};
639
//////////////////////////////////////////////////////////////////////////
/// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
/// specialization. Instantiated when conservative rast and scissor are enabled
template <typename RT>
struct ComputeScissorEdges<std::true_type, std::true_type, RT>
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
    /// evaluate edge equations and offset them away from pixel center.
    INLINE ComputeScissorEdges(const SWR_RECT& triBBox,
                               const SWR_RECT& scissorBBox,
                               const int32_t x,
                               const int32_t y,
                               EDGE (&rastEdges)[RT::NumEdgesT::value],
                               __m256d (&vEdgeFix16)[7])
    {
        // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
        SWR_RECT scissor;
        scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin);
        scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax);
        scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin);
        scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax);

        POS topLeft{scissor.xmin, scissor.ymin};
        POS bottomLeft{scissor.xmin, scissor.ymax};
        POS topRight{scissor.xmax, scissor.ymin};
        POS bottomRight{scissor.xmax, scissor.ymax};

        // construct 4 scissor edges in ccw direction; slots 3..6 follow the
        // 3 triangle edges in the edge arrays
        ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
        ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
        ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
        ComputeEdgeData(topRight, topLeft, rastEdges[6]);

        // evaluate each scissor edge equation at (x, y) relative to its anchor corner
        vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) +
                                       (rastEdges[3].b * (y - scissor.ymin)));
        vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) +
                                       (rastEdges[4].b * (y - scissor.ymax)));
        vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) +
                                       (rastEdges[5].b * (y - scissor.ymax)));
        vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) +
                                       (rastEdges[6].b * (y - scissor.ymin)));

        // if conservative rasterizing, need to bump the scissor edges out by the conservative
        // uncertainty distance, else do nothing
        adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]);
        adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]);
        adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]);
        adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]);

        // Upper left rule for scissor
        vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
        vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
    }
};
695
//////////////////////////////////////////////////////////////////////////
/// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
/// specialization. Instantiated when scissor is enabled and conservative rast
/// is disabled.
template <typename RT>
struct ComputeScissorEdges<std::true_type, std::false_type, RT>
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Compute scissor edge vectors and evaluate edge equations
    /// (no triangle-bbox intersection and no conservative offsetting here).
    INLINE ComputeScissorEdges(const SWR_RECT&,
                               const SWR_RECT& scissorBBox,
                               const int32_t x,
                               const int32_t y,
                               EDGE (&rastEdges)[RT::NumEdgesT::value],
                               __m256d (&vEdgeFix16)[7])
    {
        const SWR_RECT& scissor = scissorBBox;
        POS topLeft{scissor.xmin, scissor.ymin};
        POS bottomLeft{scissor.xmin, scissor.ymax};
        POS topRight{scissor.xmax, scissor.ymin};
        POS bottomRight{scissor.xmax, scissor.ymax};

        // construct 4 scissor edges in ccw direction; slots 3..6 follow the
        // 3 triangle edges in the edge arrays
        ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
        ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
        ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
        ComputeEdgeData(topRight, topLeft, rastEdges[6]);

        // evaluate each scissor edge equation at (x, y) relative to its anchor corner
        vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) +
                                       (rastEdges[3].b * (y - scissor.ymin)));
        vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) +
                                       (rastEdges[4].b * (y - scissor.ymax)));
        vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) +
                                       (rastEdges[5].b * (y - scissor.ymax)));
        vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) +
                                       (rastEdges[6].b * (y - scissor.ymin)));

        // Upper left rule for scissor
        vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
        vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
    }
};
738
739 //////////////////////////////////////////////////////////////////////////
740 /// @brief Primary function template for TrivialRejectTest. Should
741 /// never be called, but TemplateUnroller instantiates a few unused values,
742 /// so it calls a runtime assert instead of a static_assert.
743 template <typename ValidEdgeMaskT>
744 INLINE bool TrivialRejectTest(const int, const int, const int)
745 {
746 SWR_INVALID("Primary templated function should never be called");
747 return false;
748 };
749
750 //////////////////////////////////////////////////////////////////////////
751 /// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
752 /// and edge 1 for trivial coverage reject
753 template <>
754 INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int)
755 {
756 return (!(mask0 && mask1)) ? true : false;
757 };
758
759 //////////////////////////////////////////////////////////////////////////
760 /// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
761 /// and edge 2 for trivial coverage reject
762 template <>
763 INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2)
764 {
765 return (!(mask0 && mask2)) ? true : false;
766 };
767
768 //////////////////////////////////////////////////////////////////////////
769 /// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
770 /// and edge 2 for trivial coverage reject
771 template <>
772 INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2)
773 {
774 return (!(mask1 && mask2)) ? true : false;
775 };
776
777 //////////////////////////////////////////////////////////////////////////
778 /// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
779 /// primitive edges for trivial coverage reject
780 template <>
781 INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
782 {
783 return (!(mask0 && mask1 && mask2)) ? true : false;
784 ;
785 };
786
787 //////////////////////////////////////////////////////////////////////////
788 /// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
789 /// point, so return false and rasterize against conservative BBox
790 template <>
791 INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int)
792 {
793 return false;
794 };
795
796 //////////////////////////////////////////////////////////////////////////
797 /// @brief Primary function template for TrivialAcceptTest. Always returns
798 /// false, since it will only be called for degenerate tris, and as such
799 /// will never cover the entire raster tile
800 template <typename ScissorEnableT>
801 INLINE bool TrivialAcceptTest(const int, const int, const int)
802 {
803 return false;
804 };
805
806 //////////////////////////////////////////////////////////////////////////
807 /// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all
808 /// edge masks for a fully covered raster tile
809 template <>
810 INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2)
811 {
812 return ((mask0 & mask1 & mask2) == 0xf);
813 };
814
815 //////////////////////////////////////////////////////////////////////////
816 /// @brief Primary function template for GenerateSVInnerCoverage. Results
817 /// in an empty function call if SVInnerCoverage isn't requested
818 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
819 struct GenerateSVInnerCoverage
820 {
821 INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*, uint64_t&){};
822 };
823
824 //////////////////////////////////////////////////////////////////////////
825 /// @brief Specialization of GenerateSVInnerCoverage where all edges
826 /// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated
827 /// edge values from OuterConservative to InnerConservative and rasterizes.
828 template <typename RT>
829 struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
830 {
831 INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC,
832 uint32_t workerId,
833 EDGE* pRastEdges,
834 double* pStartQuadEdges,
835 uint64_t& innerCoverageMask)
836 {
837 double startQuadEdgesAdj[RT::NumEdgesT::value];
838 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
839 {
840 startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(
841 pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
842 }
843
844 // not trivial accept or reject, must rasterize full tile
845 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizePartial, pDC->drawId);
846 innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(
847 pDC, startQuadEdgesAdj, pRastEdges);
848 RDTSC_END(pDC->pContext->pBucketMgr, BERasterizePartial, 0);
849 }
850 };
851
852 //////////////////////////////////////////////////////////////////////////
853 /// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
854 /// in an empty function call if SVInnerCoverage isn't requested
855 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
856 struct UpdateEdgeMasksInnerConservative
857 {
858 INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3],
859 const __m256d*,
860 const __m128i,
861 const __m128i,
862 int32_t&,
863 int32_t&,
864 int32_t&){};
865 };
866
867 //////////////////////////////////////////////////////////////////////////
868 /// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
869 /// are non-degenerate and SVInnerCoverage is requested. Offsets the edges
870 /// evaluated at raster tile corners to inner conservative position and
871 /// updates edge masks
872 template <typename RT>
873 struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
874 {
875 INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3],
876 const __m256d* vEdgeFix16,
877 const __m128i vAi,
878 const __m128i vBi,
879 int32_t& mask0,
880 int32_t& mask1,
881 int32_t& mask2)
882 {
883 __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]};
884
885 // instead of keeping 2 copies of evaluated edges around, just compensate for the outer
886 // conservative evaluated edge when adjusting the edge in for inner conservative tests
887 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(
888 vAi, vBi, vTempEdge[0]);
889 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(
890 vAi, vBi, vTempEdge[1]);
891 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(
892 vAi, vBi, vTempEdge[2]);
893
894 UpdateEdgeMasks<typename RT::NumCoverageSamplesT>(
895 vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
896 }
897 };
898
899 //////////////////////////////////////////////////////////////////////////
900 /// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage
901 /// is requested but at least one edge is degenerate. Since a degenerate triangle cannot
902 /// cover an entire raster tile, set mask0 to 0 to force it down the
903 /// rastierizePartialTile path
904 template <typename RT, typename ValidEdgeMaskT>
905 struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
906 {
907 INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3],
908 const __m256d*,
909 const __m128i,
910 const __m128i,
911 int32_t& mask0,
912 int32_t&,
913 int32_t&)
914 {
915 // set one mask to zero to force the triangle down the rastierizePartialTile path
916 mask0 = 0;
917 }
918 };
919
//////////////////////////////////////////////////////////////////////////
/// @brief Rasterize one triangle within a single macrotile. Performs
///        triangle setup (fixed-point conversion, edge equations,
///        barycentric coefficients, perspective-correct attributes,
///        depth bias), intersects the triangle bbox with the
///        scissor/viewport rect and the macrotile, then walks the covered
///        raster tiles testing per-sample trivial reject / trivial accept /
///        partial coverage and hands covered tiles to the backend.
/// @tparam RT - rasterizer traits (edge count, sample count, coverage type,
///              conservative-rast edge offsets, hot-tile step constants)
/// @param pDC - draw context
/// @param workerId - id of the worker thread executing this work item
/// @param macroTile - id of the macrotile being rasterized
/// @param pDesc - pointer to the TRIANGLE_WORK_DESC for this triangle
template <typename RT>
void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
{
    const TRIANGLE_WORK_DESC& workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
#if KNOB_ENABLE_TOSS_POINTS
    // Debug knob: discard binned triangles without rasterizing
    if (KNOB_TOSS_BIN_TRIS)
    {
        return;
    }
#endif
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizeTriangle, pDC->drawId);
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BETriangleSetup, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_RASTSTATE& rastState = state.rastState;
    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;

    OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
    triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;

    __m128 vX, vY, vZ, vRecipW;

    // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
    // eg: vX = [x0 x1 x2 dc]
    vX = _mm_load_ps(workDesc.pTriBuffer);
    vY = _mm_load_ps(workDesc.pTriBuffer + 4);
    vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
    vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);

    // convert to fixed point
    static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value,
                  "Rasterizer expects 16.8 fixed point precision");
    __m128i vXi = fpToFixedPoint(vX);
    __m128i vYi = fpToFixedPoint(vY);

    // quantize floating point position to fixed point precision
    // to prevent attribute creep around the triangle vertices
    vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
    vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));

    // triangle setup - A and B edge equation coefs
    __m128 vA, vB;
    triangleSetupAB(vX, vY, vA, vB);

    __m128i vAi, vBi;
    triangleSetupABInt(vXi, vYi, vAi, vBi);

    // determinant
    float det = calcDeterminantInt(vAi, vBi);

    // Verts in Pixel Coordinate Space at this point
    // Det > 0 = CW winding order
    // Convert CW triangles to CCW
    if (det > 0.0)
    {
        // negate A/B coefs (float and integer forms) and the determinant
        vA = _mm_mul_ps(vA, _mm_set1_ps(-1));
        vB = _mm_mul_ps(vB, _mm_set1_ps(-1));
        vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
        vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
        det = -det;
    }

    __m128 vC;
    // Finish triangle setup - C edge coef
    triangleSetupC(vX, vY, vA, vB, vC);

    if (RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
    {
        // If we have degenerate edge(s) to rasterize, set I and J coefs
        // to 0 for constant interpolation of attributes
        triDesc.I[0] = 0.0f;
        triDesc.I[1] = 0.0f;
        triDesc.I[2] = 0.0f;
        triDesc.J[0] = 0.0f;
        triDesc.J[1] = 0.0f;
        triDesc.J[2] = 0.0f;

        // Degenerate triangles have no area
        triDesc.recipDet = 0.0f;
    }
    else
    {
        // only extract coefs for 2 of the barycentrics; the 3rd can be
        // determined from the barycentric equation:
        // i + j + k = 1 <=> k = 1 - j - i
        _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
        _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
        _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
        _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
        _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
        _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);

        // compute recipDet, used to calculate barycentric i and j in the backend
        triDesc.recipDet = 1.0f / det;
    }

    // Store 1/w terms relative to vertex 2 so the backend can interpolate
    // as base + i*delta0 + j*delta1
    OSALIGNSIMD(float) oneOverW[4];
    _mm_store_ps(oneOverW, vRecipW);
    triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
    triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
    triDesc.OneOverW[2] = oneOverW[2];

    // calculate perspective correct coefs per vertex attrib
    float* pPerspAttribs = perspAttribsTLS;
    float* pAttribs = workDesc.pAttribs;
    triDesc.pPerspAttribs = pPerspAttribs;
    triDesc.pAttribs = pAttribs;
    float* pRecipW = workDesc.pTriBuffer + 12;
    triDesc.pRecipW = pRecipW;
    __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
    __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW += 1);
    __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW += 1);
    // multiply each vertex attribute by its vertex's 1/w
    for (uint32_t i = 0; i < workDesc.numAttribs; i++)
    {
        __m128 attribA = _mm_load_ps(pAttribs);
        __m128 attribB = _mm_load_ps(pAttribs += 4);
        __m128 attribC = _mm_load_ps(pAttribs += 4);
        pAttribs += 4;

        attribA = _mm_mul_ps(attribA, vOneOverWV0);
        attribB = _mm_mul_ps(attribB, vOneOverWV1);
        attribC = _mm_mul_ps(attribC, vOneOverWV2);

        _mm_store_ps(pPerspAttribs, attribA);
        _mm_store_ps(pPerspAttribs += 4, attribB);
        _mm_store_ps(pPerspAttribs += 4, attribC);
        pPerspAttribs += 4;
    }

    // compute bary Z
    // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
    OSALIGNSIMD(float) a[4];
    _mm_store_ps(a, vZ);
    triDesc.Z[0] = a[0] - a[2];
    triDesc.Z[1] = a[1] - a[2];
    triDesc.Z[2] = a[2];

    // add depth bias
    triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);

    // Calc bounding box of triangle
    OSALIGNSIMD(SWR_RECT) bbox;
    calcBoundingBoxInt(vXi, vYi, bbox);

    const SWR_RECT& scissorInFixedPoint =
        state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];

    if (RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
    {
        // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is
        // valid
        bbox.xmin--;
        bbox.xmax++;
        bbox.ymin--;
        bbox.ymax++;
        SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
                   "Conservative rast degenerate handling requires a valid scissor rect");
    }

    // Intersect with scissor/viewport
    OSALIGNSIMD(SWR_RECT) intersect;
    intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
    intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
    intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
    intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);

    triDesc.triFlags = workDesc.triFlags;

    // further constrain backend to intersecting bounding box of macro tile and scissored triangle
    // bbox
    uint32_t macroX, macroY;
    MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
    int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
    int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
    int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
    int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;

    intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
    intersect.ymin = std::max(intersect.ymin, macroBoxTop);
    intersect.xmax = std::min(intersect.xmax, macroBoxRight);
    intersect.ymax = std::min(intersect.ymax, macroBoxBottom);

    SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax &&
               intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 &&
               intersect.ymax >= 0);

    RDTSC_END(pDC->pContext->pBucketMgr, BETriangleSetup, 0);

    // update triangle desc
    uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
    uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
    uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
    uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
    uint32_t numTilesX = maxTileX - minTileX + 1;
    uint32_t numTilesY = maxTileY - minTileY + 1;

    if (numTilesX == 0 || numTilesY == 0)
    {
        // nothing to rasterize in this macrotile
        RDTSC_EVENT(pDC->pContext->pBucketMgr, BEEmptyTriangle, 1, 0);
        RDTSC_END(pDC->pContext->pBucketMgr, BERasterizeTriangle, 1);
        return;
    }

    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEStepSetup, pDC->drawId);

    // Step to pixel center of top-left pixel of the triangle bbox
    // Align intersect bbox (top/left) to raster tile's (top/left).
    int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
    int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));

    // convenience typedef
    typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT;

    // single sample rasterization evaluates edges at pixel center,
    // multisample evaluates edges UL pixel corner and steps to each sample position
    if (std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
    {
        // Add 0.5, in fixed point, to offset to pixel center
        x += (FIXED_POINT_SCALE / 2);
        y += (FIXED_POINT_SCALE / 2);
    }

    __m128i vTopLeftX = _mm_set1_epi32(x);
    __m128i vTopLeftY = _mm_set1_epi32(y);

    // evaluate edge equations at top-left pixel using 64bit math
    //
    // line = Ax + By + C
    // solving for C:
    // C = -Ax - By
    // we know x0 and y0 are on the line; plug them in:
    // C = -Ax0 - By0
    // plug C back into line equation:
    // line = Ax - By - Ax0 - By0
    // line = A(x - x0) + B(y - y0)
    // dX = (x-x0), dY = (y-y0)
    // so all this simplifies to
    // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within

    __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
    __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);

    // evaluate A(dx) and B(dY) for all points
    __m256d vAipd = _mm256_cvtepi32_pd(vAi);
    __m256d vBipd = _mm256_cvtepi32_pd(vBi);
    __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
    __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);

    __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
    __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
    __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);

    // apply any edge adjustments(top-left, crast, etc)
    adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);

    // broadcast respective edge results to all lanes
    double* pEdge = (double*)&vEdge;
    __m256d vEdgeFix16[7];
    vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
    vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
    vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);

    OSALIGNSIMD(int32_t) aAi[4], aBi[4];
    _mm_store_si128((__m128i*)aAi, vAi);
    _mm_store_si128((__m128i*)aBi, vBi);
    EDGE rastEdges[RT::NumEdgesT::value];

    // Compute and store triangle edge data
    ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
    ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
    ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);

    // Compute and store triangle edge data if scissor needs to rasterized
    ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>(
        bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);

    // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
    // used to for testing if entire raster tile is inside a triangle
    for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
    {
        vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
    }

    // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
    // step sample positions to the raster tile bbox of multisample points
    // min(xSamples),min(ySamples)  ------  max(xSamples),min(ySamples)
    //                             |      |
    //                             |      |
    // min(xSamples),max(ySamples)  ------  max(xSamples),max(ySamples)
    __m256d vEdgeTileBbox[3];
    if (NumCoverageSamplesT::value > 1)
    {
        const SWR_MULTISAMPLE_POS& samplePos = rastState.samplePositions;
        const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX();
        const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY();

        __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
        __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);

        // step edge equation tests from Tile
        // used to for testing if entire raster tile is inside a triangle
        for (uint32_t e = 0; e < 3; ++e)
        {
            __m256d vResultAxFix16 =
                _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
            __m256d vResultByFix16 =
                _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
            vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);

            // adjust for msaa tile bbox edges outward for conservative rast, if enabled
            adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(
                vAi, vBi, vEdgeTileBbox[e]);
        }
    }

    RDTSC_END(pDC->pContext->pBucketMgr, BEStepSetup, 0);

    uint32_t tY = minTileY;
    uint32_t tX = minTileX;
    uint32_t maxY = maxTileY;
    uint32_t maxX = maxTileX;

    RenderOutputBuffers renderBuffers, currentRenderBufferRow;
    GetRenderHotTiles<RT::MT::numSamples>(pDC,
                                          workerId,
                                          macroTile,
                                          minTileX,
                                          minTileY,
                                          renderBuffers,
                                          triDesc.triFlags.renderTargetArrayIndex);
    currentRenderBufferRow = renderBuffers;

    // rasterize and generate coverage masks per sample
    for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
    {
        // remember edge values at the start of the row so the Y step can
        // restart from them after the X walk below mutates vEdgeFix16
        __m256d vStartOfRowEdge[RT::NumEdgesT::value];
        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
        {
            vStartOfRowEdge[e] = vEdgeFix16[e];
        }

        for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
        {
            triDesc.anyCoveredSamples = 0;

            // is the corner of the edge outside of the raster tile? (vEdge < 0)
            int mask0, mask1, mask2;
            UpdateEdgeMasks<NumCoverageSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);

            for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++)
            {
                // trivial reject, at least one edge has all 4 corners of raster tile outside
                bool trivialReject =
                    TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);

                if (!trivialReject)
                {
                    // trivial accept mask
                    triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;

                    // Update the raster tile edge masks based on inner conservative edge offsets,
                    // if enabled
                    UpdateEdgeMasksInnerConservative<RT,
                                                     typename RT::ValidEdgeMaskT,
                                                     typename RT::InputCoverageT>(
                        vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);

                    // @todo Make this a bit smarter to allow use of trivial accept when:
                    //   1) scissor/vp intersection rect is raster tile aligned
                    //   2) raster tile is entirely within scissor/vp intersection rect
                    if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2))
                    {
                        // trivial accept, all 4 corners of all 3 edges are negative
                        // i.e. raster tile completely inside triangle
                        triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
                        if (std::is_same<typename RT::InputCoverageT,
                                         InnerConservativeCoverageT>::value)
                        {
                            triDesc.innerCoverageMask = 0xffffffffffffffffULL;
                        }
                        RDTSC_EVENT(pDC->pContext->pBucketMgr, BETrivialAccept, 1, 0);
                    }
                    else
                    {
                        __m256d vEdgeAtSample[RT::NumEdgesT::value];
                        if (std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
                        {
                            // should get optimized out for single sample case (global value
                            // numbering or copy propagation)
                            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
                            {
                                vEdgeAtSample[e] = vEdgeFix16[e];
                            }
                        }
                        else
                        {
                            const SWR_MULTISAMPLE_POS& samplePos = rastState.samplePositions;
                            __m128i vSampleOffsetXh = samplePos.vXi(sampleNum);
                            __m128i vSampleOffsetYh = samplePos.vYi(sampleNum);
                            __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
                            __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);

                            // step edge equation tests from UL tile corner to pixel sample position
                            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
                            {
                                __m256d vResultAxFix16 =
                                    _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
                                __m256d vResultByFix16 =
                                    _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
                                vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
                                vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
                            }
                        }

                        double startQuadEdges[RT::NumEdgesT::value];
                        // mask selecting only the lowest double lane for the store below
                        const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
                        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
                        {
                            _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
                        }

                        // not trivial accept or reject, must rasterize full tile
                        RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizePartial, pDC->drawId);
                        triDesc.coverageMask[sampleNum] =
                            rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(
                                pDC, startQuadEdges, rastEdges);
                        RDTSC_END(pDC->pContext->pBucketMgr, BERasterizePartial, 0);

                        triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];

                        // Output SV InnerCoverage, if needed
                        GenerateSVInnerCoverage<RT,
                                                typename RT::ValidEdgeMaskT,
                                                typename RT::InputCoverageT>(
                            pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
                    }
                }
                else
                {
                    // if we're calculating coverage per sample, need to store it off. otherwise no
                    // covered samples, don't need to do anything
                    if (NumCoverageSamplesT::value > 1)
                    {
                        triDesc.coverageMask[sampleNum] = 0;
                    }
                    RDTSC_EVENT(pDC->pContext->pBucketMgr, BETrivialReject, 1, 0);
                }
            }

#if KNOB_ENABLE_TOSS_POINTS
            if (KNOB_TOSS_RS)
            {
                gToss = triDesc.coverageMask[0];
            }
            else
#endif
                if (triDesc.anyCoveredSamples)
            {
                // if conservative rast and MSAA are enabled, conservative coverage for a pixel
                // means all samples in that pixel are covered copy conservative coverage result to
                // all samples
                if (RT::IsConservativeT::value)
                {
                    auto copyCoverage = [&](int sample) {
                        triDesc.coverageMask[sample] = triDesc.coverageMask[0];
                    };
                    UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
                }

                // Track rasterized subspans
                AR_EVENT(RasterTileCount(pDC->drawId, 1));

                RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelBackend, pDC->drawId);
                backendFuncs.pfnBackend(pDC,
                                        workerId,
                                        tileX << KNOB_TILE_X_DIM_SHIFT,
                                        tileY << KNOB_TILE_Y_DIM_SHIFT,
                                        triDesc,
                                        renderBuffers);
                RDTSC_END(pDC->pContext->pBucketMgr, BEPixelBackend, 0);
            }

            // step to the next tile in X
            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
            {
                vEdgeFix16[e] =
                    _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
            }
            StepRasterTileX<RT>(state.colorHottileEnable, renderBuffers);
        }

        // step to the next tile in Y
        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
        {
            vEdgeFix16[e] =
                _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
        }
        StepRasterTileY<RT>(state.colorHottileEnable, renderBuffers, currentRenderBufferRow);
    }

    RDTSC_END(pDC->pContext->pBucketMgr, BERasterizeTriangle, 1);
}
1422
//////////////////////////////////////////////////////////////////////////
/// @brief Get pointers to hot tile memory for color RT, depth, stencil,
///        offset to the given raster tile within the macrotile. Depth and
///        stencil hot tiles are marked HOTTILE_DIRTY here; color hot tiles
///        are not marked in this function.
/// @tparam numSamples - sample count; scales the per-tile byte offset
/// @param pDC - draw context
/// @param workerId - worker thread id, used to look up worker-private data
/// @param macroID - macrotile id containing the raster tile
/// @param tileX - raster tile x in absolute tile coords (made macrotile-relative below)
/// @param tileY - raster tile y in absolute tile coords (made macrotile-relative below)
/// @param renderBuffers - output: buffer pointers offset to (tileX, tileY)
/// @param renderTargetArrayIndex - render target array slice to fetch
template <uint32_t numSamples>
void GetRenderHotTiles(DRAW_CONTEXT* pDC,
                       uint32_t workerId,
                       uint32_t macroID,
                       uint32_t tileX,
                       uint32_t tileY,
                       RenderOutputBuffers& renderBuffers,
                       uint32_t renderTargetArrayIndex)
{
    const API_STATE& state = GetApiState(pDC);
    SWR_CONTEXT* pContext = pDC->pContext;
    HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    // convert absolute raster tile coords to macrotile-relative coords
    uint32_t mx, my;
    MacroTileMgr::getTileIndices(macroID, mx, my);
    tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
    tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;

    // compute tile offset for active hottile buffers
    const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
    uint32_t offset = ComputeTileOffset2D<
        TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp>>(
        pitch, tileX, tileY);
    offset *= numSamples;

    // fetch a color hot tile for each enabled render target slot
    unsigned long rtSlot = 0;
    uint32_t colorHottileEnableMask = state.colorHottileEnable;
    while (_BitScanForward(&rtSlot, colorHottileEnableMask))
    {
        HOTTILE* pColor = pContext->pHotTileMgr->GetHotTile(
            pContext,
            pDC,
            hWorkerPrivateData,
            macroID,
            (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot),
            true,
            numSamples,
            renderTargetArrayIndex);
        renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
        renderBuffers.pColorHotTile[rtSlot] = pColor;

        colorHottileEnableMask &= ~(1 << rtSlot);
    }
    if (state.depthHottileEnable)
    {
        // depth hot tile uses its own format's pitch/offset
        const uint32_t pitch =
            KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
        uint32_t offset = ComputeTileOffset2D<
            TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp>>(
            pitch, tileX, tileY);
        offset *= numSamples;
        HOTTILE* pDepth = pContext->pHotTileMgr->GetHotTile(pContext,
                                                            pDC,
                                                            hWorkerPrivateData,
                                                            macroID,
                                                            SWR_ATTACHMENT_DEPTH,
                                                            true,
                                                            numSamples,
                                                            renderTargetArrayIndex);
        pDepth->state = HOTTILE_DIRTY;
        SWR_ASSERT(pDepth->pBuffer != nullptr);
        renderBuffers.pDepth = pDepth->pBuffer + offset;
        renderBuffers.pDepthHotTile = pDepth;
    }
    if (state.stencilHottileEnable)
    {
        // stencil hot tile uses its own format's pitch/offset
        const uint32_t pitch =
            KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
        uint32_t offset = ComputeTileOffset2D<
            TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp>>(
            pitch, tileX, tileY);
        offset *= numSamples;
        HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext,
                                                              pDC,
                                                              hWorkerPrivateData,
                                                              macroID,
                                                              SWR_ATTACHMENT_STENCIL,
                                                              true,
                                                              numSamples,
                                                              renderTargetArrayIndex);
        pStencil->state = HOTTILE_DIRTY;
        SWR_ASSERT(pStencil->pBuffer != nullptr);
        renderBuffers.pStencil = pStencil->pBuffer + offset;
        renderBuffers.pStencilHotTile = pStencil;
    }
}
1510
1511 template <typename RT>
1512 INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers)
1513 {
1514 DWORD rt = 0;
1515 while (_BitScanForward(&rt, colorHotTileMask))
1516 {
1517 colorHotTileMask &= ~(1 << rt);
1518 buffers.pColor[rt] += RT::colorRasterTileStep;
1519 }
1520
1521 buffers.pDepth += RT::depthRasterTileStep;
1522 buffers.pStencil += RT::stencilRasterTileStep;
1523 }
1524
1525 template <typename RT>
1526 INLINE void StepRasterTileY(uint32_t colorHotTileMask,
1527 RenderOutputBuffers& buffers,
1528 RenderOutputBuffers& startBufferRow)
1529 {
1530 DWORD rt = 0;
1531 while (_BitScanForward(&rt, colorHotTileMask))
1532 {
1533 colorHotTileMask &= ~(1 << rt);
1534 startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
1535 buffers.pColor[rt] = startBufferRow.pColor[rt];
1536 }
1537 startBufferRow.pDepth += RT::depthRasterTileRowStep;
1538 buffers.pDepth = startBufferRow.pDepth;
1539
1540 startBufferRow.pStencil += RT::stencilRasterTileRowStep;
1541 buffers.pStencil = startBufferRow.pStencil;
1542 }