swr: [rasterizer core] Add macros for mapping ArchRast to buckets
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "rasterizer.h"
36 #include "conservativeRast.h"
37 #include "utils.h"
38 #include "threads.h"
39 #include "pa.h"
40 #include "clip.h"
41 #include "tilemgr.h"
42 #include "tessellator.h"
43 #include <limits>
44
45 //////////////////////////////////////////////////////////////////////////
46 /// @brief Helper macro to generate a bitmask
47 static INLINE uint32_t GenMask(uint32_t numBits)
48 {
49 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
50 return ((1U << numBits) - 1);
51 }
52
//////////////////////////////////////////////////////////////////////////
/// @brief Offsets added to post-viewport vertex positions based on
///        raster state.
/// Indexed by SWR_PIXEL_LOCATION: CENTER applies no offset, UL applies a
/// half-pixel offset (presumably shifting evaluation from pixel center to
/// the upper-left corner convention -- confirm against raster usage).
static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
{
    _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
    _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
};
61
62 //////////////////////////////////////////////////////////////////////////
63 /// @brief FE handler for SwrSync.
64 /// @param pContext - pointer to SWR context.
65 /// @param pDC - pointer to draw context.
66 /// @param workerId - thread's worker id. Even thread has a unique id.
67 /// @param pUserData - Pointer to user data passed back to sync callback.
68 /// @todo This should go away when we switch this to use compute threading.
69 void ProcessSync(
70 SWR_CONTEXT *pContext,
71 DRAW_CONTEXT *pDC,
72 uint32_t workerId,
73 void *pUserData)
74 {
75 BE_WORK work;
76 work.type = SYNC;
77 work.pfnWork = ProcessSyncBE;
78
79 MacroTileMgr *pTileMgr = pDC->pTileMgr;
80 pTileMgr->enqueue(0, 0, &work);
81 }
82
83 //////////////////////////////////////////////////////////////////////////
84 /// @brief FE handler for SwrClearRenderTarget.
85 /// @param pContext - pointer to SWR context.
86 /// @param pDC - pointer to draw context.
87 /// @param workerId - thread's worker id. Even thread has a unique id.
88 /// @param pUserData - Pointer to user data passed back to clear callback.
89 /// @todo This should go away when we switch this to use compute threading.
90 void ProcessClear(
91 SWR_CONTEXT *pContext,
92 DRAW_CONTEXT *pDC,
93 uint32_t workerId,
94 void *pUserData)
95 {
96 CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
97 MacroTileMgr *pTileMgr = pDC->pTileMgr;
98
99 // queue a clear to each macro tile
100 // compute macro tile bounds for the specified rect
101 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
102 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
103 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
104 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
105
106 BE_WORK work;
107 work.type = CLEAR;
108 work.pfnWork = ProcessClearBE;
109 work.desc.clear = *pDesc;
110
111 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
112 {
113 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
114 {
115 pTileMgr->enqueue(x, y, &work);
116 }
117 }
118 }
119
120 //////////////////////////////////////////////////////////////////////////
121 /// @brief FE handler for SwrStoreTiles.
122 /// @param pContext - pointer to SWR context.
123 /// @param pDC - pointer to draw context.
124 /// @param workerId - thread's worker id. Even thread has a unique id.
125 /// @param pUserData - Pointer to user data passed back to callback.
126 /// @todo This should go away when we switch this to use compute threading.
127 void ProcessStoreTiles(
128 SWR_CONTEXT *pContext,
129 DRAW_CONTEXT *pDC,
130 uint32_t workerId,
131 void *pUserData)
132 {
133 AR_BEGIN(FEProcessStoreTiles, pDC->drawId);
134 MacroTileMgr *pTileMgr = pDC->pTileMgr;
135 STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
136
137 // queue a store to each macro tile
138 // compute macro tile bounds for the specified rect
139 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
140 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
141 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
142 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
143
144 // store tiles
145 BE_WORK work;
146 work.type = STORETILES;
147 work.pfnWork = ProcessStoreTileBE;
148 work.desc.storeTiles = *pDesc;
149
150 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
151 {
152 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
153 {
154 pTileMgr->enqueue(x, y, &work);
155 }
156 }
157
158 AR_END(FEProcessStoreTiles, 0);
159 }
160
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrInvalidateTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
    DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    // compute macro tile bounds for the specified rect
    // Default path: only macrotiles fully covered by the rect (xmin rounded
    // up to a tile boundary, xmax rounded down).
    // NOTE(review): if the rect spans less than one full tile, the "max"
    // computations here underflow as uint32_t. The fullTilesOnly == false
    // branch below overwrites them, so this only bites when fullTilesOnly
    // is set with a sub-tile rect -- confirm callers never do that.
    uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
    uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;

    if (pDesc->fullTilesOnly == false)
    {
        // include partial tiles
        macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
        macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
        macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
        macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    }

    SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);

    // NOTE(review): macroTileXMax/YMax are used inclusively in the loops
    // below, so clamping to KNOB_NUM_HOT_TILES_X rather than
    // KNOB_NUM_HOT_TILES_X - 1 looks like an off-by-one -- verify against
    // the hot-tile array bounds.
    macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
    macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);

    // load tiles
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pDesc;

    for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    {
        for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    AR_END(FEProcessInvalidateTiles, 0);
}
215
216 //////////////////////////////////////////////////////////////////////////
217 /// @brief Computes the number of primitives given the number of verts.
218 /// @param mode - primitive topology for draw operation.
219 /// @param numPrims - number of vertices or indices for draw.
220 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
221 uint32_t GetNumPrims(
222 PRIMITIVE_TOPOLOGY mode,
223 uint32_t numPrims)
224 {
225 switch (mode)
226 {
227 case TOP_POINT_LIST: return numPrims;
228 case TOP_TRIANGLE_LIST: return numPrims / 3;
229 case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
230 case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
231 case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
232 case TOP_QUAD_LIST: return numPrims / 4;
233 case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
234 case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
235 case TOP_LINE_LIST: return numPrims / 2;
236 case TOP_LINE_LOOP: return numPrims;
237 case TOP_RECT_LIST: return numPrims / 3;
238 case TOP_LINE_LIST_ADJ: return numPrims / 4;
239 case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
240 case TOP_TRI_LIST_ADJ: return numPrims / 6;
241 case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
242
243 case TOP_PATCHLIST_1:
244 case TOP_PATCHLIST_2:
245 case TOP_PATCHLIST_3:
246 case TOP_PATCHLIST_4:
247 case TOP_PATCHLIST_5:
248 case TOP_PATCHLIST_6:
249 case TOP_PATCHLIST_7:
250 case TOP_PATCHLIST_8:
251 case TOP_PATCHLIST_9:
252 case TOP_PATCHLIST_10:
253 case TOP_PATCHLIST_11:
254 case TOP_PATCHLIST_12:
255 case TOP_PATCHLIST_13:
256 case TOP_PATCHLIST_14:
257 case TOP_PATCHLIST_15:
258 case TOP_PATCHLIST_16:
259 case TOP_PATCHLIST_17:
260 case TOP_PATCHLIST_18:
261 case TOP_PATCHLIST_19:
262 case TOP_PATCHLIST_20:
263 case TOP_PATCHLIST_21:
264 case TOP_PATCHLIST_22:
265 case TOP_PATCHLIST_23:
266 case TOP_PATCHLIST_24:
267 case TOP_PATCHLIST_25:
268 case TOP_PATCHLIST_26:
269 case TOP_PATCHLIST_27:
270 case TOP_PATCHLIST_28:
271 case TOP_PATCHLIST_29:
272 case TOP_PATCHLIST_30:
273 case TOP_PATCHLIST_31:
274 case TOP_PATCHLIST_32:
275 return numPrims / (mode - TOP_PATCHLIST_BASE);
276
277 case TOP_POLYGON:
278 case TOP_POINT_LIST_BF:
279 case TOP_LINE_STRIP_CONT:
280 case TOP_LINE_STRIP_BF:
281 case TOP_LINE_STRIP_CONT_BF:
282 case TOP_TRIANGLE_FAN_NOSTIPPLE:
283 case TOP_TRI_STRIP_REVERSE:
284 case TOP_PATCHLIST_BASE:
285 case TOP_UNKNOWN:
286 SWR_ASSERT(false, "Unsupported topology: %d", mode);
287 return 0;
288 }
289
290 return 0;
291 }
292
293 //////////////////////////////////////////////////////////////////////////
294 /// @brief Computes the number of verts given the number of primitives.
295 /// @param mode - primitive topology for draw operation.
296 /// @param numPrims - number of primitives for draw.
297 uint32_t GetNumVerts(
298 PRIMITIVE_TOPOLOGY mode,
299 uint32_t numPrims)
300 {
301 switch (mode)
302 {
303 case TOP_POINT_LIST: return numPrims;
304 case TOP_TRIANGLE_LIST: return numPrims * 3;
305 case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
306 case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
307 case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
308 case TOP_QUAD_LIST: return numPrims * 4;
309 case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
310 case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
311 case TOP_LINE_LIST: return numPrims * 2;
312 case TOP_LINE_LOOP: return numPrims;
313 case TOP_RECT_LIST: return numPrims * 3;
314 case TOP_LINE_LIST_ADJ: return numPrims * 4;
315 case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
316 case TOP_TRI_LIST_ADJ: return numPrims * 6;
317 case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
318
319 case TOP_PATCHLIST_1:
320 case TOP_PATCHLIST_2:
321 case TOP_PATCHLIST_3:
322 case TOP_PATCHLIST_4:
323 case TOP_PATCHLIST_5:
324 case TOP_PATCHLIST_6:
325 case TOP_PATCHLIST_7:
326 case TOP_PATCHLIST_8:
327 case TOP_PATCHLIST_9:
328 case TOP_PATCHLIST_10:
329 case TOP_PATCHLIST_11:
330 case TOP_PATCHLIST_12:
331 case TOP_PATCHLIST_13:
332 case TOP_PATCHLIST_14:
333 case TOP_PATCHLIST_15:
334 case TOP_PATCHLIST_16:
335 case TOP_PATCHLIST_17:
336 case TOP_PATCHLIST_18:
337 case TOP_PATCHLIST_19:
338 case TOP_PATCHLIST_20:
339 case TOP_PATCHLIST_21:
340 case TOP_PATCHLIST_22:
341 case TOP_PATCHLIST_23:
342 case TOP_PATCHLIST_24:
343 case TOP_PATCHLIST_25:
344 case TOP_PATCHLIST_26:
345 case TOP_PATCHLIST_27:
346 case TOP_PATCHLIST_28:
347 case TOP_PATCHLIST_29:
348 case TOP_PATCHLIST_30:
349 case TOP_PATCHLIST_31:
350 case TOP_PATCHLIST_32:
351 return numPrims * (mode - TOP_PATCHLIST_BASE);
352
353 case TOP_POLYGON:
354 case TOP_POINT_LIST_BF:
355 case TOP_LINE_STRIP_CONT:
356 case TOP_LINE_STRIP_BF:
357 case TOP_LINE_STRIP_CONT_BF:
358 case TOP_TRIANGLE_FAN_NOSTIPPLE:
359 case TOP_TRI_STRIP_REVERSE:
360 case TOP_PATCHLIST_BASE:
361 case TOP_UNKNOWN:
362 SWR_ASSERT(false, "Unsupported topology: %d", mode);
363 return 0;
364 }
365
366 return 0;
367 }
368
369 //////////////////////////////////////////////////////////////////////////
370 /// @brief Return number of verts per primitive.
371 /// @param topology - topology
372 /// @param includeAdjVerts - include adjacent verts in primitive vertices
373 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
374 {
375 uint32_t numVerts = 0;
376 switch (topology)
377 {
378 case TOP_POINT_LIST:
379 case TOP_POINT_LIST_BF:
380 numVerts = 1;
381 break;
382 case TOP_LINE_LIST:
383 case TOP_LINE_STRIP:
384 case TOP_LINE_LIST_ADJ:
385 case TOP_LINE_LOOP:
386 case TOP_LINE_STRIP_CONT:
387 case TOP_LINE_STRIP_BF:
388 case TOP_LISTSTRIP_ADJ:
389 numVerts = 2;
390 break;
391 case TOP_TRIANGLE_LIST:
392 case TOP_TRIANGLE_STRIP:
393 case TOP_TRIANGLE_FAN:
394 case TOP_TRI_LIST_ADJ:
395 case TOP_TRI_STRIP_ADJ:
396 case TOP_TRI_STRIP_REVERSE:
397 case TOP_RECT_LIST:
398 numVerts = 3;
399 break;
400 case TOP_QUAD_LIST:
401 case TOP_QUAD_STRIP:
402 numVerts = 4;
403 break;
404 case TOP_PATCHLIST_1:
405 case TOP_PATCHLIST_2:
406 case TOP_PATCHLIST_3:
407 case TOP_PATCHLIST_4:
408 case TOP_PATCHLIST_5:
409 case TOP_PATCHLIST_6:
410 case TOP_PATCHLIST_7:
411 case TOP_PATCHLIST_8:
412 case TOP_PATCHLIST_9:
413 case TOP_PATCHLIST_10:
414 case TOP_PATCHLIST_11:
415 case TOP_PATCHLIST_12:
416 case TOP_PATCHLIST_13:
417 case TOP_PATCHLIST_14:
418 case TOP_PATCHLIST_15:
419 case TOP_PATCHLIST_16:
420 case TOP_PATCHLIST_17:
421 case TOP_PATCHLIST_18:
422 case TOP_PATCHLIST_19:
423 case TOP_PATCHLIST_20:
424 case TOP_PATCHLIST_21:
425 case TOP_PATCHLIST_22:
426 case TOP_PATCHLIST_23:
427 case TOP_PATCHLIST_24:
428 case TOP_PATCHLIST_25:
429 case TOP_PATCHLIST_26:
430 case TOP_PATCHLIST_27:
431 case TOP_PATCHLIST_28:
432 case TOP_PATCHLIST_29:
433 case TOP_PATCHLIST_30:
434 case TOP_PATCHLIST_31:
435 case TOP_PATCHLIST_32:
436 numVerts = topology - TOP_PATCHLIST_BASE;
437 break;
438 default:
439 SWR_ASSERT(false, "Unsupported topology: %d", topology);
440 break;
441 }
442
443 if (includeAdjVerts)
444 {
445 switch (topology)
446 {
447 case TOP_LISTSTRIP_ADJ:
448 case TOP_LINE_LIST_ADJ: numVerts = 4; break;
449 case TOP_TRI_STRIP_ADJ:
450 case TOP_TRI_LIST_ADJ: numVerts = 6; break;
451 default: break;
452 }
453 }
454
455 return numVerts;
456 }
457
458 //////////////////////////////////////////////////////////////////////////
459 /// @brief Generate mask from remaining work.
460 /// @param numWorkItems - Number of items being worked on by a SIMD.
461 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
462 {
463 uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
464 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
465 return _simd_castps_si(vMask(mask));
466 }
467
468
//////////////////////////////////////////////////////////////////////////
/// @brief Gather scissor rect data based on per-prim viewport indices.
/// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
/// @param pViewportIndex - array of per-primitive viewport indexes.
/// @param scisXmin - output vector of per-primitive scissor rect Xmin data.
/// @param scisYmin - output vector of per-primitive scissor rect Ymin data.
/// @param scisXmax - output vector of per-primitive scissor rect Xmax data.
/// @param scisYmax - output vector of per-primitive scissor rect Ymax data.
//
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
template<size_t SimdWidth>
struct GatherScissors
{
    // Primary template: only explicitly specialized SIMD widths are
    // supported; reaching this body means no specialization exists for
    // SimdWidth, so fail loudly.
    static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
        simdscalari &scisXmin, simdscalari &scisYmin,
        simdscalari &scisXmax, simdscalari &scisYmax)
    {
        SWR_ASSERT(0, "Unhandled Simd Width in Scissor Rect Gather");
    }
};
489
//////////////////////////////////////////////////////////////////////////
/// @brief 8-wide specialization: fetches each lane's scissor rect edges
///        via its viewport index and packs them into SIMD registers.
/// NOTE: _simd_set_epi32 takes arguments highest-lane-first here, so lane 0
/// of the output corresponds to pViewportIndex[7] per the intrinsic's
/// argument ordering convention -- confirm against _simd_set_epi32's
/// definition if lane order matters to the caller.
template<>
struct GatherScissors<8>
{
    static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
        simdscalari &scisXmin, simdscalari &scisYmin,
        simdscalari &scisXmax, simdscalari &scisYmax)
    {
        scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
            pScissorsInFixedPoint[pViewportIndex[1]].xmin,
            pScissorsInFixedPoint[pViewportIndex[2]].xmin,
            pScissorsInFixedPoint[pViewportIndex[3]].xmin,
            pScissorsInFixedPoint[pViewportIndex[4]].xmin,
            pScissorsInFixedPoint[pViewportIndex[5]].xmin,
            pScissorsInFixedPoint[pViewportIndex[6]].xmin,
            pScissorsInFixedPoint[pViewportIndex[7]].xmin);
        scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin,
            pScissorsInFixedPoint[pViewportIndex[1]].ymin,
            pScissorsInFixedPoint[pViewportIndex[2]].ymin,
            pScissorsInFixedPoint[pViewportIndex[3]].ymin,
            pScissorsInFixedPoint[pViewportIndex[4]].ymin,
            pScissorsInFixedPoint[pViewportIndex[5]].ymin,
            pScissorsInFixedPoint[pViewportIndex[6]].ymin,
            pScissorsInFixedPoint[pViewportIndex[7]].ymin);
        scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax,
            pScissorsInFixedPoint[pViewportIndex[1]].xmax,
            pScissorsInFixedPoint[pViewportIndex[2]].xmax,
            pScissorsInFixedPoint[pViewportIndex[3]].xmax,
            pScissorsInFixedPoint[pViewportIndex[4]].xmax,
            pScissorsInFixedPoint[pViewportIndex[5]].xmax,
            pScissorsInFixedPoint[pViewportIndex[6]].xmax,
            pScissorsInFixedPoint[pViewportIndex[7]].xmax);
        scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax,
            pScissorsInFixedPoint[pViewportIndex[1]].ymax,
            pScissorsInFixedPoint[pViewportIndex[2]].ymax,
            pScissorsInFixedPoint[pViewportIndex[3]].ymax,
            pScissorsInFixedPoint[pViewportIndex[4]].ymax,
            pScissorsInFixedPoint[pViewportIndex[5]].ymax,
            pScissorsInFixedPoint[pViewportIndex[6]].ymax,
            pScissorsInFixedPoint[pViewportIndex[7]].ymax);
    }
};
531
//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMDs worth of triangles.
/// @param pDC - pointer to draw context.
/// @param pa - primitive assembly object providing the prims to stream out.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pPrimData - scratch buffer the assembled attributes are written
///                    into before the streamout jit function consumes them.
/// @param streamIndex - which SO stream to write.
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
    uint32_t streamIndex)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEStreamout, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    // Verts per prim without adjacency -- streamout never sees adj verts.
    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    uint32_t numPrims = pa.NumPrims();
    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        // Iterate the set bits of the stream mask, lowest first.
        while (_BitScanForward(&slot, soMask))
        {
            __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is a dword offset relative to the start of
            // the vertex: 4 floats per attribute at index 'slot'.
            // (PA slots are offset by VERTEX_ATTRIB_START_SLOT above, but
            // prim data is written starting at slot 0.)
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }
            // Clear the bit we just handled so the scan terminates.
            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    AR_END(FEStreamout, 1);
}
622
623 //////////////////////////////////////////////////////////////////////////
624 /// @brief Computes number of invocations. The current index represents
625 /// the start of the SIMD. The max index represents how much work
626 /// items are remaining. If there is less then a SIMD's xmin of work
627 /// then return the remaining amount of work.
628 /// @param curIndex - The start index for the SIMD.
629 /// @param maxIndex - The last index for all work items.
630 static INLINE uint32_t GetNumInvocations(
631 uint32_t curIndex,
632 uint32_t maxIndex)
633 {
634 uint32_t remainder = (maxIndex - curIndex);
635 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
636 }
637
638 //////////////////////////////////////////////////////////////////////////
639 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
640 /// The geometry shader will loop over each active streamout buffer, assembling
641 /// primitives for the downstream stages. When multistream output is enabled,
642 /// the generated stream ID buffer from the GS needs to be converted to a cut
643 /// buffer for the primitive assembler.
644 /// @param stream - stream id to generate the cut buffer for
645 /// @param pStreamIdBase - pointer to the stream ID buffer
646 /// @param numEmittedVerts - Number of total verts emitted by the GS
647 /// @param pCutBuffer - output buffer to write cuts to
648 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
649 {
650 SWR_ASSERT(stream < MAX_SO_STREAMS);
651
652 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
653 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
654
655 for (uint32_t b = 0; b < numOutputBytes; ++b)
656 {
657 uint8_t curInputByte = pStreamIdBase[2*b];
658 uint8_t outByte = 0;
659 for (uint32_t i = 0; i < 4; ++i)
660 {
661 if ((curInputByte & 0x3) != stream)
662 {
663 outByte |= (1 << i);
664 }
665 curInputByte >>= 2;
666 }
667
668 curInputByte = pStreamIdBase[2 * b + 1];
669 for (uint32_t i = 0; i < 4; ++i)
670 {
671 if ((curInputByte & 0x3) != stream)
672 {
673 outByte |= (1 << (i + 4));
674 }
675 curInputByte >>= 2;
676 }
677
678 *pCutBuffer++ = outByte;
679 }
680 }
681
682 THREAD SWR_GS_CONTEXT tlsGsContext;
683
//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage: runs the GS over the assembled input prims,
///        then re-assembles the emitted verts and feeds them to streamout
///        and/or the clipper/binner.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - cut (single-stream) or streamID (multi-stream) buffer
///                     written by the GS.
/// @param pStreamCutBuffer - scratch buffer for per-stream cut conversion.
/// @param pSoPrimData - scratch buffer handed to StreamOut.
/// @param primID - per-lane primitive IDs for the input prims.
template <
    typename HasStreamOutT,
    typename HasRastT>
static void GeometryShaderStage(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pStreamCutBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEGeometryShader, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;

    SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
    SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");

    tlsGsContext.pStream = (uint8_t*)pGsOut;
    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
    tlsGsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_ATTRIBUTES];

    // assemble all attributes for the input primitive
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
    }

    // Layout of the GS output buffer: per input prim, enough SIMD vertex
    // batches to hold maxNumVerts; one such region per GS instance.
    const uint32_t vertexStride = sizeof(simdvertex);
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    const uint32_t inputPrimStride = numSimdBatches * vertexStride;
    const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
    uint32_t cutPrimStride;
    uint32_t cutInstanceStride;

    // Single stream: 1 cut bit per vert. Multi stream: 2 streamID bits per
    // vert, dword aligned. Must match AllocateGsBuffers' sizing.
    if (pState->isSingleStream)
    {
        cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }
    else
    {
        cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
    uint32_t numInputPrims = pa.NumPrims();

    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        tlsGsContext.InstanceID = instance;
        tlsGsContext.mask = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);

        tlsGsContext.pStream += instanceStride;
        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
    }

    // set up new binner and state for the GS output topology
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            // NOTE(review): vertexCount is indexed by inputPrim only, not
            // per-instance -- this assumes every GS instance emits the same
            // vert count (or that vertexCount reflects the last instance);
            // confirm against the GS jit's vertexCount semantics.
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * instanceStride;
            uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;

            uint32_t numAttribs = state.feNumAttributes;

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool processCutVerts = false;

                // NOTE(review): this local intentionally shadows the
                // pCutBuffer parameter for the remainder of the loop body.
                uint8_t* pCutBuffer = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID = pState->singleStreamID;
                    if (streamID != stream) continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
                    pCutBuffer = (uint8_t*)pStreamCutBuffer;
                    processCutVerts = false;
                }

                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);

                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
                                simdscalari vPrimId;
                                // pull primitiveID from the GS output if available
                                if (state.gsState.emitsPrimitiveID)
                                {
                                    simdvector primIdAttrib[3];
                                    gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
                                    vPrimId = _simd_castps_si(primIdAttrib[0].x);
                                }
                                else
                                {
                                    vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
                                }

                                // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
                                simdscalari vViewPortIdx;
                                if (state.gsState.emitsViewportArrayIndex)
                                {
                                    simdvector vpiAttrib[3];
                                    gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);

                                    // OOB indices => forced to zero.
                                    simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simdscalari vClearMask = _simd_cmplt_epi32(_simd_castps_si(vpiAttrib[0].x), vNumViewports);
                                    vpiAttrib[0].x = _simd_and_ps(_simd_castsi_ps(vClearMask), vpiAttrib[0].x);

                                    vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
                                }
                                else
                                {
                                    vViewPortIdx = _simd_set1_epi32(0);
                                }

                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);

    AR_END(FEGeometryShader, 1);
}
904
905 //////////////////////////////////////////////////////////////////////////
906 /// @brief Allocate GS buffers
907 /// @param pDC - pointer to draw context.
908 /// @param state - API state
909 /// @param ppGsOut - pointer to GS output buffer allocation
910 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
911 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
912 void **ppStreamCutBuffer)
913 {
914 auto pArena = pDC->pArena;
915 SWR_ASSERT(pArena != nullptr);
916 SWR_ASSERT(state.gsState.gsEnable);
917 // allocate arena space to hold GS output verts
918 // @todo pack attribs
919 // @todo support multiple streams
920 const uint32_t vertexStride = sizeof(simdvertex);
921 const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
922 uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
923 *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));
924
925 const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
926 const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
927 const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
928 const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
929
930 // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
931 // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
932
933 // allocate space for temporary per-stream cut buffer if multi-stream is enabled
934 if (state.gsState.isSingleStream)
935 {
936 *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
937 *ppStreamCutBuffer = nullptr;
938 }
939 else
940 {
941 *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
942 *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
943 }
944
945 }
946
947 //////////////////////////////////////////////////////////////////////////
948 /// @brief Contains all data generated by the HS and passed to the
949 /// tessellator and DS.
950 struct TessellationThreadLocalData
951 {
952 SWR_HS_CONTEXT hsContext;
953 ScalarPatch patchData[KNOB_SIMD_WIDTH];
954 void* pTxCtx;
955 size_t tsCtxSize;
956
957 simdscalar* pDSOutput;
958 size_t numDSOutputVectors;
959 };
960
961 THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
962
963 //////////////////////////////////////////////////////////////////////////
964 /// @brief Allocate tessellation data for this worker thread.
965 INLINE
966 static void AllocateTessellationData(SWR_CONTEXT* pContext)
967 {
968 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
969 if (gt_pTessellationThreadData == nullptr)
970 {
971 gt_pTessellationThreadData = (TessellationThreadLocalData*)
972 AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
973 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
974 }
975 }
976
977 //////////////////////////////////////////////////////////////////////////
978 /// @brief Implements Tessellation Stages.
979 /// @param pDC - pointer to draw context.
980 /// @param workerId - thread's worker id. Even thread has a unique id.
981 /// @param pa - The primitive assembly object.
982 /// @param pGsOut - output stream for GS
983 template <
984 typename HasGeometryShaderT,
985 typename HasStreamOutT,
986 typename HasRastT>
987 static void TessellationStages(
988 DRAW_CONTEXT *pDC,
989 uint32_t workerId,
990 PA_STATE& pa,
991 void* pGsOut,
992 void* pCutBuffer,
993 void* pCutStreamBuffer,
994 uint32_t* pSoPrimData,
995 simdscalari primID)
996 {
997 SWR_CONTEXT *pContext = pDC->pContext;
998 const API_STATE& state = GetApiState(pDC);
999 const SWR_TS_STATE& tsState = state.tsState;
1000
1001 SWR_ASSERT(gt_pTessellationThreadData);
1002
1003 HANDLE tsCtx = TSInitCtx(
1004 tsState.domain,
1005 tsState.partitioning,
1006 tsState.tsOutputTopology,
1007 gt_pTessellationThreadData->pTxCtx,
1008 gt_pTessellationThreadData->tsCtxSize);
1009 if (tsCtx == nullptr)
1010 {
1011 gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
1012 tsCtx = TSInitCtx(
1013 tsState.domain,
1014 tsState.partitioning,
1015 tsState.tsOutputTopology,
1016 gt_pTessellationThreadData->pTxCtx,
1017 gt_pTessellationThreadData->tsCtxSize);
1018 }
1019 SWR_ASSERT(tsCtx);
1020
1021 PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
1022 if (HasRastT::value)
1023 {
1024 switch (tsState.postDSTopology)
1025 {
1026 case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
1027 case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
1028 case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
1029 default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
1030 }
1031 }
1032
1033 SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
1034 hsContext.pCPout = gt_pTessellationThreadData->patchData;
1035 hsContext.PrimitiveID = primID;
1036
1037 uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
1038 // Max storage for one attribute for an entire simdprimitive
1039 simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];
1040
1041 // assemble all attributes for the input primitives
1042 for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
1043 {
1044 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
1045 pa.Assemble(attribSlot, simdattrib);
1046
1047 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
1048 {
1049 hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
1050 }
1051 }
1052
1053 #if defined(_DEBUG)
1054 memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
1055 #endif
1056
1057 uint32_t numPrims = pa.NumPrims();
1058 hsContext.mask = GenerateMask(numPrims);
1059
1060 // Run the HS
1061 AR_BEGIN(FEHullShader, pDC->drawId);
1062 state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
1063 AR_END(FEHullShader, 0);
1064
1065 UPDATE_STAT_FE(HsInvocations, numPrims);
1066
1067 const uint32_t* pPrimId = (const uint32_t*)&primID;
1068
1069 for (uint32_t p = 0; p < numPrims; ++p)
1070 {
1071 // Run Tessellator
1072 SWR_TS_TESSELLATED_DATA tsData = { 0 };
1073 AR_BEGIN(FETessellation, pDC->drawId);
1074 TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
1075 AR_END(FETessellation, 0);
1076
1077 if (tsData.NumPrimitives == 0)
1078 {
1079 continue;
1080 }
1081 SWR_ASSERT(tsData.NumDomainPoints);
1082
1083 // Allocate DS Output memory
1084 uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
1085 size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
1086 size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
1087 if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
1088 {
1089 AlignedFree(gt_pTessellationThreadData->pDSOutput);
1090 gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
1091 gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
1092 }
1093 SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
1094 SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);
1095
1096 #if defined(_DEBUG)
1097 memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
1098 #endif
1099
1100 // Run Domain Shader
1101 SWR_DS_CONTEXT dsContext;
1102 dsContext.PrimitiveID = pPrimId[p];
1103 dsContext.pCpIn = &hsContext.pCPout[p];
1104 dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
1105 dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
1106 dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
1107 dsContext.vectorStride = requiredDSVectorInvocations;
1108
1109 uint32_t dsInvocations = 0;
1110
1111 for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
1112 {
1113 dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
1114
1115 AR_BEGIN(FEDomainShader, pDC->drawId);
1116 state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
1117 AR_END(FEDomainShader, 0);
1118
1119 dsInvocations += KNOB_SIMD_WIDTH;
1120 }
1121 UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
1122
1123 PA_TESS tessPa(
1124 pDC,
1125 dsContext.pOutputData,
1126 dsContext.vectorStride,
1127 tsState.numDsOutputAttribs,
1128 tsData.ppIndices,
1129 tsData.NumPrimitives,
1130 tsState.postDSTopology);
1131
1132 while (tessPa.HasWork())
1133 {
1134 if (HasGeometryShaderT::value)
1135 {
1136 GeometryShaderStage<HasStreamOutT, HasRastT>(
1137 pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
1138 _simd_set1_epi32(dsContext.PrimitiveID));
1139 }
1140 else
1141 {
1142 if (HasStreamOutT::value)
1143 {
1144 StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
1145 }
1146
1147 if (HasRastT::value)
1148 {
1149 simdvector prim[3]; // Only deal with triangles, lines, or points
1150 AR_BEGIN(FEPAAssemble, pDC->drawId);
1151 #if SWR_ENABLE_ASSERTS
1152 bool assemble =
1153 #endif
1154 tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
1155 AR_END(FEPAAssemble, 1);
1156 SWR_ASSERT(assemble);
1157
1158 SWR_ASSERT(pfnClipFunc);
1159 pfnClipFunc(pDC, tessPa, workerId, prim,
1160 GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
1161 }
1162 }
1163
1164 tessPa.NextPrim();
1165
1166 } // while (tessPa.HasWork())
1167 } // for (uint32_t p = 0; p < numPrims; ++p)
1168
1169 TSDestroyCtx(tsCtx);
1170 }
1171
1172 //////////////////////////////////////////////////////////////////////////
1173 /// @brief FE handler for SwrDraw.
1174 /// @tparam IsIndexedT - Is indexed drawing enabled
1175 /// @tparam HasTessellationT - Is tessellation enabled
1176 /// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
1177 /// @tparam HasStreamOutT - Is stream-out enabled
1178 /// @tparam HasRastT - Is rasterization enabled
1179 /// @param pContext - pointer to SWR context.
1180 /// @param pDC - pointer to draw context.
1181 /// @param workerId - thread's worker id.
1182 /// @param pUserData - Pointer to DRAW_WORK
1183 template <
1184 typename IsIndexedT,
1185 typename IsCutIndexEnabledT,
1186 typename HasTessellationT,
1187 typename HasGeometryShaderT,
1188 typename HasStreamOutT,
1189 typename HasRastT>
1190 void ProcessDraw(
1191 SWR_CONTEXT *pContext,
1192 DRAW_CONTEXT *pDC,
1193 uint32_t workerId,
1194 void *pUserData)
1195 {
1196
1197 #if KNOB_ENABLE_TOSS_POINTS
1198 if (KNOB_TOSS_QUEUE_FE)
1199 {
1200 return;
1201 }
1202 #endif
1203
1204 AR_BEGIN(FEProcessDraw, pDC->drawId);
1205
1206 DRAW_WORK& work = *(DRAW_WORK*)pUserData;
1207 const API_STATE& state = GetApiState(pDC);
1208 __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1209 SWR_VS_CONTEXT vsContext;
1210 simdvertex vin;
1211
1212 int indexSize = 0;
1213 uint32_t endVertex = work.numVerts;
1214
1215 const int32_t* pLastRequestedIndex = nullptr;
1216 if (IsIndexedT::value)
1217 {
1218 switch (work.type)
1219 {
1220 case R32_UINT:
1221 indexSize = sizeof(uint32_t);
1222 pLastRequestedIndex = &(work.pIB[endVertex]);
1223 break;
1224 case R16_UINT:
1225 indexSize = sizeof(uint16_t);
1226 // nasty address offset to last index
1227 pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
1228 break;
1229 case R8_UINT:
1230 indexSize = sizeof(uint8_t);
1231 // nasty address offset to last index
1232 pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
1233 break;
1234 default:
1235 SWR_ASSERT(0);
1236 }
1237 }
1238 else
1239 {
1240 // No cuts, prune partial primitives.
1241 endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
1242 }
1243
1244 SWR_FETCH_CONTEXT fetchInfo = { 0 };
1245 fetchInfo.pStreams = &state.vertexBuffers[0];
1246 fetchInfo.StartInstance = work.startInstance;
1247 fetchInfo.StartVertex = 0;
1248
1249 vsContext.pVin = &vin;
1250
1251 if (IsIndexedT::value)
1252 {
1253 fetchInfo.BaseVertex = work.baseVertex;
1254
1255 // if the entire index buffer isn't being consumed, set the last index
1256 // so that fetches < a SIMD wide will be masked off
1257 fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
1258 if (pLastRequestedIndex < fetchInfo.pLastIndex)
1259 {
1260 fetchInfo.pLastIndex = pLastRequestedIndex;
1261 }
1262 }
1263 else
1264 {
1265 fetchInfo.StartVertex = work.startVertex;
1266 }
1267
1268 #ifdef KNOB_ENABLE_RDTSC
1269 uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
1270 #endif
1271
1272 void* pGsOut = nullptr;
1273 void* pCutBuffer = nullptr;
1274 void* pStreamCutBuffer = nullptr;
1275 if (HasGeometryShaderT::value)
1276 {
1277 AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
1278 }
1279
1280 if (HasTessellationT::value)
1281 {
1282 SWR_ASSERT(state.tsState.tsEnable == true);
1283 SWR_ASSERT(state.pfnHsFunc != nullptr);
1284 SWR_ASSERT(state.pfnDsFunc != nullptr);
1285
1286 AllocateTessellationData(pContext);
1287 }
1288 else
1289 {
1290 SWR_ASSERT(state.tsState.tsEnable == false);
1291 SWR_ASSERT(state.pfnHsFunc == nullptr);
1292 SWR_ASSERT(state.pfnDsFunc == nullptr);
1293 }
1294
1295 // allocate space for streamout input prim data
1296 uint32_t* pSoPrimData = nullptr;
1297 if (HasStreamOutT::value)
1298 {
1299 pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
1300 }
1301
1302 // choose primitive assembler
1303 PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
1304 PA_STATE& pa = paFactory.GetPA();
1305
1306 /// @todo: temporarily move instance loop in the FE to ensure SO ordering
1307 for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1308 {
1309 simdscalari vIndex;
1310 uint32_t i = 0;
1311
1312 if (IsIndexedT::value)
1313 {
1314 fetchInfo.pIndices = work.pIB;
1315 }
1316 else
1317 {
1318 vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
1319 fetchInfo.pIndices = (const int32_t*)&vIndex;
1320 }
1321
1322 fetchInfo.CurInstance = instanceNum;
1323 vsContext.InstanceID = instanceNum;
1324
1325 while (pa.HasWork())
1326 {
1327 // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
1328 // So we need to keep this outside of (i < endVertex) check.
1329 simdmask* pvCutIndices = nullptr;
1330 if (IsIndexedT::value)
1331 {
1332 pvCutIndices = &pa.GetNextVsIndices();
1333 }
1334
1335 simdvertex& vout = pa.GetNextVsOutput();
1336 vsContext.pVout = &vout;
1337
1338 if (i < endVertex)
1339 {
1340
1341 // 1. Execute FS/VS for a single SIMD.
1342 AR_BEGIN(FEFetchShader, pDC->drawId);
1343 state.pfnFetchFunc(fetchInfo, vin);
1344 AR_END(FEFetchShader, 0);
1345
1346 // forward fetch generated vertex IDs to the vertex shader
1347 vsContext.VertexID = fetchInfo.VertexID;
1348
1349 // Setup active mask for vertex shader.
1350 vsContext.mask = GenerateMask(endVertex - i);
1351
1352 // forward cut mask to the PA
1353 if (IsIndexedT::value)
1354 {
1355 *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
1356 }
1357
1358 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
1359
1360 #if KNOB_ENABLE_TOSS_POINTS
1361 if (!KNOB_TOSS_FETCH)
1362 #endif
1363 {
1364 AR_BEGIN(FEVertexShader, pDC->drawId);
1365 state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
1366 AR_END(FEVertexShader, 0);
1367
1368 UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
1369 }
1370 }
1371
1372 // 2. Assemble primitives given the last two SIMD.
1373 do
1374 {
1375 simdvector prim[MAX_NUM_VERTS_PER_PRIM];
1376 // PaAssemble returns false if there is not enough verts to assemble.
1377 AR_BEGIN(FEPAAssemble, pDC->drawId);
1378 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
1379 AR_END(FEPAAssemble, 1);
1380
1381 #if KNOB_ENABLE_TOSS_POINTS
1382 if (!KNOB_TOSS_FETCH)
1383 #endif
1384 {
1385 #if KNOB_ENABLE_TOSS_POINTS
1386 if (!KNOB_TOSS_VS)
1387 #endif
1388 {
1389 if (assemble)
1390 {
1391 UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
1392
1393 if (HasTessellationT::value)
1394 {
1395 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
1396 pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
1397 }
1398 else if (HasGeometryShaderT::value)
1399 {
1400 GeometryShaderStage<HasStreamOutT, HasRastT>(
1401 pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
1402 }
1403 else
1404 {
1405 // If streamout is enabled then stream vertices out to memory.
1406 if (HasStreamOutT::value)
1407 {
1408 StreamOut(pDC, pa, workerId, pSoPrimData, 0);
1409 }
1410
1411 if (HasRastT::value)
1412 {
1413 SWR_ASSERT(pDC->pState->pfnProcessPrims);
1414 pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
1415 GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
1416 }
1417 }
1418 }
1419 }
1420 }
1421 } while (pa.NextPrim());
1422
1423 i += KNOB_SIMD_WIDTH;
1424 if (IsIndexedT::value)
1425 {
1426 fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
1427 }
1428 else
1429 {
1430 vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
1431 }
1432 }
1433 pa.Reset();
1434 }
1435
1436 AR_END(FEProcessDraw, numPrims * work.numInstances);
1437 }
1438
// Adapter used by TemplateArgUnroller to map a pack of boolean template
// arguments onto the matching ProcessDraw instantiation.
struct FEDrawChooser
{
    typedef PFN_FE_WORK_FUNC FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessDraw<ArgsB...>;
    }
};
1449
1450
// Selector for correct templated Draw front-end function.
// Each runtime bool is turned into a compile-time template argument of
// ProcessDraw via TemplateArgUnroller/FEDrawChooser.
PFN_FE_WORK_FUNC GetProcessDrawFunc(
    bool IsIndexed,
    bool IsCutIndexEnabled,
    bool HasTessellation,
    bool HasGeometryShader,
    bool HasStreamOut,
    bool HasRasterization)
{
    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
}
1462
1463 //////////////////////////////////////////////////////////////////////////
1464 /// @brief Processes attributes for the backend based on linkage mask and
1465 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
1466 /// @param pDC - Draw context
1467 /// @param pa - Primitive Assembly state
1468 /// @param linkageMask - Specifies which VS outputs are routed to PS.
1469 /// @param pLinkageMap - maps VS attribute slot to PS slot
1470 /// @param triIndex - Triangle to process attributes for
1471 /// @param pBuffer - Output result
1472 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
1473 INLINE void ProcessAttributes(
1474 DRAW_CONTEXT *pDC,
1475 PA_STATE&pa,
1476 uint32_t triIndex,
1477 uint32_t primId,
1478 float *pBuffer)
1479 {
1480 static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
1481 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1482 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
1483 LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
1484 const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
1485 const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
1486
1487 static const float constTable[3][4] = {
1488 {0.0f, 0.0f, 0.0f, 0.0f},
1489 {0.0f, 0.0f, 0.0f, 1.0f},
1490 {1.0f, 1.0f, 1.0f, 1.0f}
1491 };
1492
1493 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
1494 {
1495 uint32_t inputSlot;
1496 if (IsSwizzledT::value)
1497 {
1498 SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
1499 inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib;
1500
1501 }
1502 else
1503 {
1504 inputSlot = VERTEX_ATTRIB_START_SLOT + i;
1505 }
1506
1507 __m128 attrib[3]; // triangle attribs (always 4 wide)
1508 float* pAttribStart = pBuffer;
1509
1510 if (HasConstantInterpT::value || IsDegenerate::value)
1511 {
1512 if (_bittest(&constantInterpMask, i))
1513 {
1514 uint32_t vid;
1515 uint32_t adjustedTriIndex;
1516 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
1517 static const int32_t quadProvokingTri[2][4] = { {0, 0, 0, 1}, {0, -1, 0, 0} };
1518 static const uint32_t quadProvokingVertex[2][4] = { {0, 1, 2, 2}, {0, 1, 1, 2} };
1519 static const int32_t qstripProvokingTri[2][4] = { {0, 0, 0, 1}, {-1, 0, 0, 0} };
1520 static const uint32_t qstripProvokingVertex[2][4] = { {0, 1, 2, 1}, {0, 0, 2, 1} };
1521
1522 switch (topo) {
1523 case TOP_QUAD_LIST:
1524 adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
1525 vid = quadProvokingVertex[triIndex & 1][provokingVertex];
1526 break;
1527 case TOP_QUAD_STRIP:
1528 adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
1529 vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
1530 break;
1531 case TOP_TRIANGLE_STRIP:
1532 adjustedTriIndex = triIndex;
1533 vid = (triIndex & 1)
1534 ? tristripProvokingVertex[provokingVertex]
1535 : provokingVertex;
1536 break;
1537 default:
1538 adjustedTriIndex = triIndex;
1539 vid = provokingVertex;
1540 break;
1541 }
1542
1543 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
1544
1545 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1546 {
1547 _mm_store_ps(pBuffer, attrib[vid]);
1548 pBuffer += 4;
1549 }
1550 }
1551 else
1552 {
1553 pa.AssembleSingle(inputSlot, triIndex, attrib);
1554
1555 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1556 {
1557 _mm_store_ps(pBuffer, attrib[i]);
1558 pBuffer += 4;
1559 }
1560 }
1561 }
1562 else
1563 {
1564 pa.AssembleSingle(inputSlot, triIndex, attrib);
1565
1566 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1567 {
1568 _mm_store_ps(pBuffer, attrib[i]);
1569 pBuffer += 4;
1570 }
1571 }
1572
1573 // pad out the attrib buffer to 3 verts to ensure the triangle
1574 // interpolation code in the pixel shader works correctly for the
1575 // 3 topologies - point, line, tri. This effectively zeros out the
1576 // effect of the missing vertices in the triangle interpolation.
1577 for (uint32_t v = NumVertsT::value; v < 3; ++v)
1578 {
1579 _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
1580 pBuffer += 4;
1581 }
1582
1583 // check for constant source overrides
1584 if (IsSwizzledT::value)
1585 {
1586 uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
1587 if (mask)
1588 {
1589 DWORD comp;
1590 while (_BitScanForward(&comp, mask))
1591 {
1592 mask &= ~(1 << comp);
1593
1594 float constantValue = 0.0f;
1595 switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
1596 {
1597 case SWR_CONSTANT_SOURCE_CONST_0000:
1598 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
1599 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
1600 constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
1601 break;
1602 case SWR_CONSTANT_SOURCE_PRIM_ID:
1603 constantValue = *(float*)&primId;
1604 break;
1605 }
1606
1607 // apply constant value to all 3 vertices
1608 for (uint32_t v = 0; v < 3; ++v)
1609 {
1610 pAttribStart[comp + v * 4] = constantValue;
1611 }
1612 }
1613 }
1614 }
1615 }
1616 }
1617
1618
// Signature of a fully-instantiated ProcessAttributes specialization.
typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);

// Adapter used by TemplateArgUnroller to map template arguments onto the
// matching ProcessAttributes instantiation.
struct ProcessAttributesChooser
{
    typedef PFN_PROCESS_ATTRIBUTES FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessAttributes<ArgsB...>;
    }
};
1631
// Selects the ProcessAttributes template instantiation matching the given
// runtime parameters (NumVerts is clamped to the template's 1..3 range by
// the IntArg unroller).
PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
{
    return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
}
1636
1637 //////////////////////////////////////////////////////////////////////////
1638 /// @brief Processes enabled user clip distances. Loads the active clip
1639 /// distances from the PA, sets up barycentric equations, and
1640 /// stores the results to the output buffer
1641 /// @param pa - Primitive Assembly state
1642 /// @param primIndex - primitive index to process
1643 /// @param clipDistMask - mask of enabled clip distances
1644 /// @param pUserClipBuffer - buffer to store results
1645 template<uint32_t NumVerts>
1646 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
1647 {
1648 DWORD clipDist;
1649 while (_BitScanForward(&clipDist, clipDistMask))
1650 {
1651 clipDistMask &= ~(1 << clipDist);
1652 uint32_t clipSlot = clipDist >> 2;
1653 uint32_t clipComp = clipDist & 0x3;
1654 uint32_t clipAttribSlot = clipSlot == 0 ?
1655 VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
1656
1657 __m128 primClipDist[3];
1658 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
1659
1660 float vertClipDist[NumVerts];
1661 for (uint32_t e = 0; e < NumVerts; ++e)
1662 {
1663 OSALIGNSIMD(float) aVertClipDist[4];
1664 _mm_store_ps(aVertClipDist, primClipDist[e]);
1665 vertClipDist[e] = aVertClipDist[clipComp];
1666 };
1667
1668 // setup plane equations for barycentric interpolation in the backend
1669 float baryCoeff[NumVerts];
1670 for (uint32_t e = 0; e < NumVerts - 1; ++e)
1671 {
1672 baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1];
1673 }
1674 baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1];
1675
1676 for (uint32_t e = 0; e < NumVerts; ++e)
1677 {
1678 *(pUserClipBuffer++) = baryCoeff[e];
1679 }
1680 }
1681 }
1682
1683 //////////////////////////////////////////////////////////////////////////
1684 /// @brief Convert the X,Y coords of a triangle to the requested Fixed
1685 /// Point precision from FP32.
1686 template <typename PT = FixedPointTraits<Fixed_16_8>>
1687 INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
1688 {
1689 simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));
1690 return _simd_cvtps_epi32(vFixed);
1691 }
1692
1693 //////////////////////////////////////////////////////////////////////////
1694 /// @brief Helper function to set the X,Y coords of a triangle to the
1695 /// requested Fixed Point precision from FP32.
1696 /// @param tri: simdvector[3] of FP triangle verts
1697 /// @param vXi: fixed point X coords of tri verts
1698 /// @param vYi: fixed point Y coords of tri verts
1699 INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari (&vXi)[3], simdscalari (&vYi)[3])
1700 {
1701 vXi[0] = fpToFixedPointVertical(tri[0].x);
1702 vYi[0] = fpToFixedPointVertical(tri[0].y);
1703 vXi[1] = fpToFixedPointVertical(tri[1].x);
1704 vYi[1] = fpToFixedPointVertical(tri[1].y);
1705 vXi[2] = fpToFixedPointVertical(tri[2].x);
1706 vYi[2] = fpToFixedPointVertical(tri[2].y);
1707 }
1708
1709 //////////////////////////////////////////////////////////////////////////
1710 /// @brief Calculate bounding box for current triangle
1711 /// @tparam CT: ConservativeRastFETraits type
1712 /// @param vX: fixed point X position for triangle verts
1713 /// @param vY: fixed point Y position for triangle verts
1714 /// @param bbox: fixed point bbox
1715 /// *Note*: expects vX, vY to be in the correct precision for the type
1716 /// of rasterization. This avoids unnecessary FP->fixed conversions.
1717 template <typename CT>
1718 INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1719 {
1720 simdscalari vMinX = vX[0];
1721 vMinX = _simd_min_epi32(vMinX, vX[1]);
1722 vMinX = _simd_min_epi32(vMinX, vX[2]);
1723
1724 simdscalari vMaxX = vX[0];
1725 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1726 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1727
1728 simdscalari vMinY = vY[0];
1729 vMinY = _simd_min_epi32(vMinY, vY[1]);
1730 vMinY = _simd_min_epi32(vMinY, vY[2]);
1731
1732 simdscalari vMaxY = vY[0];
1733 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1734 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1735
1736 bbox.xmin = vMinX;
1737 bbox.xmax = vMaxX;
1738 bbox.ymin = vMinY;
1739 bbox.ymax = vMaxY;
1740 }
1741
1742 //////////////////////////////////////////////////////////////////////////
1743 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
1744 /// Offsets BBox for conservative rast
1745 template <>
1746 INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1747 {
1748 // FE conservative rast traits
1749 typedef FEConservativeRastT CT;
1750
1751 simdscalari vMinX = vX[0];
1752 vMinX = _simd_min_epi32(vMinX, vX[1]);
1753 vMinX = _simd_min_epi32(vMinX, vX[2]);
1754
1755 simdscalari vMaxX = vX[0];
1756 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1757 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1758
1759 simdscalari vMinY = vY[0];
1760 vMinY = _simd_min_epi32(vMinY, vY[1]);
1761 vMinY = _simd_min_epi32(vMinY, vY[2]);
1762
1763 simdscalari vMaxY = vY[0];
1764 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1765 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1766
1767 /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
1768 /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
1769 bbox.xmin = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1770 bbox.xmax = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1771 bbox.ymin = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1772 bbox.ymax = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1773 }
1774
//////////////////////////////////////////////////////////////////////////
/// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
///        culling, viewport transform, etc.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param tri - Contains triangle position data for SIMDs worth of triangles.
/// @param triMask - Bitmask with one bit set per valid triangle lane in tri.
/// @param primID - Primitive ID for each triangle.
/// @param viewportIdx - viewport array index for each triangle.
/// @tparam CT - ConservativeRastFETraits
template <typename CT>
void BinTriangles(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector tri[3],
    uint32_t triMask,
    simdscalari primID,
    simdscalari viewportIdx)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEBinTriangles, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_RASTSTATE& rastState = state.rastState;
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;


    // 1/w per vertex; defaults to 1 for the VP-transform-disabled path
    simdscalar vRecipW0 = _simd_set1_ps(1.0f);
    simdscalar vRecipW1 = _simd_set1_ps(1.0f);
    simdscalar vRecipW2 = _simd_set1_ps(1.0f);

    if (feState.vpTransformDisable)
    {
        // RHW is passed in directly when VP transform is disabled
        vRecipW0 = tri[0].v[3];
        vRecipW1 = tri[1].v[3];
        vRecipW2 = tri[2].v[3];
    }
    else
    {
        // Perspective divide
        vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
        vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
        vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);

        tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
        tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
        tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);

        tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
        tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
        tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);

        tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
        tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
        tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);

        // Viewport transform to screen space coords
        if (state.gsState.emitsViewportArrayIndex)
        {
            // per-lane viewport selection when the GS emits a VP array index
            viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
        }
        else
        {
            viewportTransform<3>(tri, state.vpMatrices);
        }
    }

    // Adjust for pixel center location
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    tri[0].x = _simd_add_ps(tri[0].x, offset);
    tri[0].y = _simd_add_ps(tri[0].y, offset);

    tri[1].x = _simd_add_ps(tri[1].x, offset);
    tri[1].y = _simd_add_ps(tri[1].y, offset);

    tri[2].x = _simd_add_ps(tri[2].x, offset);
    tri[2].y = _simd_add_ps(tri[2].y, offset);

    simdscalari vXi[3], vYi[3];
    // Set vXi, vYi to required fixed point precision
    FPToFixedPoint(tri, vXi, vYi);

    // triangle setup - compute A and B edge coefficients in fixed point
    simdscalari vAi[3], vBi[3];
    triangleSetupABIntVertical(vXi, vYi, vAi, vBi);

    // determinant (2x signed area); 64-bit per triangle, split across two registers
    simdscalari vDet[2];
    calcDeterminantIntVertical(vAi, vBi, vDet);

    // cull zero area
    int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
    int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));

    int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));

    uint32_t origTriMask = triMask;
    // don't cull degenerate triangles if we're conservatively rasterizing
    if(!CT::IsConservativeT::value)
    {
        triMask &= ~cullZeroAreaMask;
    }

    // determine front winding tris
    // CW  +det
    // CCW det <= 0; 0 area triangles are marked as backfacing, which is required behavior for conservative rast
    maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
    maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
    int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) );

    uint32_t frontWindingTris;
    if (rastState.frontWinding == SWR_FRONTWINDING_CW)
    {
        frontWindingTris = cwTriMask;
    }
    else
    {
        frontWindingTris = ~cwTriMask;
    }

    // cull
    uint32_t cullTris;
    switch ((SWR_CULLMODE)rastState.cullMode)
    {
    case SWR_CULLMODE_BOTH:  cullTris = 0xffffffff; break;
    case SWR_CULLMODE_NONE:  cullTris = 0x0; break;
    case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
    // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
    case SWR_CULLMODE_BACK:  cullTris = ~frontWindingTris; break;
    default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
    }

    triMask &= ~cullTris;

    if (origTriMask ^ triMask)
    {
        RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
    }

    /// Note: these variable initializations must stay above any 'goto endBinTriangles'
    // compute per tri backface
    uint32_t frontFaceMask = frontWindingTris;
    uint32_t *pPrimID = (uint32_t *)&primID;
    const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
    DWORD triIndex = 0;
    // for center sample pattern, all samples are at pixel center; calculate coverage
    // once at center and broadcast the results in the backend
    const SWR_MULTISAMPLE_COUNT sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
    uint32_t edgeEnable;
    PFN_WORK_FUNC pfnWork;
    if(CT::IsConservativeT::value)
    {
        // determine which edges of the degenerate tri, if any, are valid to rasterize.
        // used to call the appropriate templated rasterizer function
        if(cullZeroAreaMask > 0)
        {
            // e0 = v1-v0; edge is degenerate when both endpoints coincide
            simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]);
            simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]);
            uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask)));

            // e1 = v2-v1
            simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]);
            simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]);
            uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask)));

            // e2 = v0-v2
            // if v0 == v1 & v1 == v2, v0 == v2
            uint32_t e2Mask = e0Mask & e1Mask;
            SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");

            // scatter the per-lane edge bits into 3-bit-per-triangle groups via pdep
            // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
            // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
            e0Mask = pdep_u32(e0Mask, 0x00249249);
            // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
            e1Mask = pdep_u32(e1Mask, 0x00492492);
            // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
            e2Mask = pdep_u32(e2Mask, 0x00924924);

            // a set bit enables the edge; degenerate edges are cleared
            edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
        }
        else
        {
            edgeEnable = 0x00FFFFFF;
        }
    }
    else
    {
        // degenerate triangles won't be sent to rasterizer; just enable all edges
        pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
                                    (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID,
                                    (state.scissorsTileAligned == false));
    }

    if (!triMask)
    {
        goto endBinTriangles;
    }

    // Calc bounding box of triangles
    simdBBox bbox;
    calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);

    // determine if triangle falls between pixel centers and discard
    // only discard for non-MSAA case and when conservative rast is disabled
    // (xmin + 127) & ~255
    // (xmax + 128) & ~255
    if(rastState.sampleCount == SWR_MULTISAMPLE_1X && (!CT::IsConservativeT::value))
    {
        origTriMask = triMask;

        int cullCenterMask;
        {
            // snap bbox edges to the nearest pixel-center grid (x.8 fixed point);
            // equal min/max after snapping means no pixel center is covered
            simdscalari xmin = _simd_add_epi32(bbox.xmin, _simd_set1_epi32(127));
            xmin = _simd_and_si(xmin, _simd_set1_epi32(~255));
            simdscalari xmax = _simd_add_epi32(bbox.xmax, _simd_set1_epi32(128));
            xmax = _simd_and_si(xmax, _simd_set1_epi32(~255));

            simdscalari vMaskH = _simd_cmpeq_epi32(xmin, xmax);

            simdscalari ymin = _simd_add_epi32(bbox.ymin, _simd_set1_epi32(127));
            ymin = _simd_and_si(ymin, _simd_set1_epi32(~255));
            simdscalari ymax = _simd_add_epi32(bbox.ymax, _simd_set1_epi32(128));
            ymax = _simd_and_si(ymax, _simd_set1_epi32(~255));

            simdscalari vMaskV = _simd_cmpeq_epi32(ymin, ymax);
            vMaskV = _simd_or_si(vMaskH, vMaskV);
            cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
        }

        triMask &= ~cullCenterMask;

        if(origTriMask ^ triMask)
        {
            RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
        }
    }

    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
    // Gather the AOS effective scissor rects based on the per-prim VP index.
    /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
    simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
    if (state.gsState.emitsViewportArrayIndex)
    {
        GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
            scisXmin, scisYmin, scisXmax, scisYmax);
    }
    else // broadcast fast path for non-VPAI case.
    {
        scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
        scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
        scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
        scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
    }

    bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
    bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
    bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
    bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);

    if(CT::IsConservativeT::value)
    {
        // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
        // some area. Bump the xmax/ymax edges out
        simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.ymin, bbox.ymax);
        bbox.ymax = _simd_blendv_epi32(bbox.ymax, _simd_add_epi32(bbox.ymax, _simd_set1_epi32(1)), topEqualsBottom);
        simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.xmin, bbox.xmax);
        bbox.xmax = _simd_blendv_epi32(bbox.xmax, _simd_add_epi32(bbox.xmax, _simd_set1_epi32(1)), leftEqualsRight);
    }

    // Cull tris completely outside scissor
    {
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        triMask = triMask & ~maskOutsideScissor;
    }

    if (!triMask)
    {
        goto endBinTriangles;
    }

    // Convert triangle bbox to macrotile units.
    bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

    // spill macrotile extents to scalar arrays for the per-triangle bin loop below
    OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
    _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
    _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
    _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
    _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);

    // transpose verts needed for backend
    /// @todo modify BE to take non-transformed verts
    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
    vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
    vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
    vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
    vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);

    // store render target array index
    OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
    {
        simdvector vRtai[3];
        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
        simdscalari vRtaii;
        vRtaii = _simd_castps_si(vRtai[0].x);
        _simd_store_si((simdscalari*)aRTAI, vRtaii);
    }
    else
    {
        _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
    }

    // scan remaining valid triangles and bin each separately
    while (_BitScanForward(&triIndex, triMask))
    {
        uint32_t linkageCount = state.backendState.numAttributes;
        uint32_t numScalarAttribs = linkageCount * 4;

        BE_WORK work;
        work.type = DRAW;

        bool isDegenerate;
        if(CT::IsConservativeT::value)
        {
            // only rasterize valid edges if we have a degenerate primitive
            int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
            work.pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
                                        (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable,
                                        (state.scissorsTileAligned == false));

            // Degenerate triangles are required to be constant interpolated
            isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
        }
        else
        {
            isDegenerate = false;
            work.pfnWork = pfnWork;
        }

        // Select attribute processor
        PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
            state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);

        TRIANGLE_WORK_DESC &desc = work.desc.tri;

        desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
        desc.triFlags.primID = pPrimID[triIndex];
        desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
        desc.triFlags.viewportIndex = pViewportIndex[triIndex];

        auto pArena = pDC->pArena;
        SWR_ASSERT(pArena != nullptr);

        // store active attribs
        float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
        desc.pAttribs = pAttribs;
        desc.numAttribs = linkageCount;
        pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);

        // store triangle vertex data: 4 floats each of x, y, z, 1/w
        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);

        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);

        // store user clip distances
        if (rastState.clipDistanceMask)
        {
            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
            ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
        }

        // enqueue the same work item to every macrotile the triangle's bbox overlaps
        for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
        {
            for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
            {
#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_SETUP_TRIS)
#endif
                {
                    pTileMgr->enqueue(x, y, &work);
                }
            }
        }
        triMask &= ~(1 << triIndex);
    }

endBinTriangles:
    AR_END(FEBinTriangles, 1);
}
2180
//////////////////////////////////////////////////////////////////////////
/// @brief Chooser functor used by TemplateArgUnroller to map runtime
///        boolean arguments onto the matching BinTriangles template
///        instantiation (via ConservativeRastFETraits).
struct FEBinTrianglesChooser
{
    typedef PFN_PROCESS_PRIMS FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
    }
};
2191
// Selector for correct templated BinTriangles function
PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
{
    return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
}
2197
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD points to the backend. Supports both the fast path for
///        simple 1-pixel points and the general path for sized points,
///        which are bloated to a bbox and binned like triangles.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param prim - Contains point position data for SIMDs worth of points
///               (only prim[0] is used).
/// @param primMask - Bitmask with one bit set per valid point lane.
/// @param primID - Primitive ID for each point.
/// @param viewportIdx - viewport array index for each point.
void BinPoints(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector prim[3],
    uint32_t primMask,
    simdscalari primID,
    simdscalari viewportIdx)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEBinPoints, pDC->drawId);

    // points only have a single vertex
    simdvector& primVerts = prim[0];

    const API_STATE& state = GetApiState(pDC);
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;
    const SWR_RASTSTATE& rastState = state.rastState;
    const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;

    // Select attribute processor
    PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
        state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);

    if (!feState.vpTransformDisable)
    {
        // perspective divide
        simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
        primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
        primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
        primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);

        // viewport transform to screen coords
        if (state.gsState.emitsViewportArrayIndex)
        {
            viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
        }
        else
        {
            viewportTransform<1>(&primVerts, state.vpMatrices);
        }
    }

    // adjust for pixel center location
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    primVerts.x = _simd_add_ps(primVerts.x, offset);
    primVerts.y = _simd_add_ps(primVerts.y, offset);

    // convert to fixed point
    simdscalari vXi, vYi;
    vXi = fpToFixedPointVertical(primVerts.x);
    vYi = fpToFixedPointVertical(primVerts.y);

    if (CanUseSimplePoints(pDC))
    {
        // adjust for ymin-xmin rule
        vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
        vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));

        // cull points off the ymin-xmin edge of the viewport
        // (negative coords have the sign bit set, which movemask extracts)
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));

        // compute macro tile coordinates
        simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMacroX, macroX);
        _simd_store_si((simdscalari*)aMacroY, macroY);

        // compute raster tile coordinates
        simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
        simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);

        // compute raster tile relative x,y for coverage mask
        simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
        simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);

        simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
        simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);

        OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
        _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);

        OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
        _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);

        OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aZ, primVerts.z);

        // store render target array index
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simdvector vRtai;
            pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai.x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        uint32_t *pPrimID = (uint32_t *)&primID;
        DWORD primIndex = 0;

        const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;

        // scan remaining valid points and bin each separately
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = backendState.numAttributes;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
            desc.triFlags.viewportIndex = pViewportIndex[primIndex];

            work.pfnWork = RasterizeSimplePoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store attributes
            float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
            desc.pAttribs = pAttribs;
            desc.numAttribs = linkageCount;

            pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);

            // store raster tile aligned x, y, perspective correct z
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
            *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
            *pTriBuffer = aZ[primIndex];

            uint32_t tX = aTileRelativeX[primIndex];
            uint32_t tY = aTileRelativeY[primIndex];

            // pack the relative x,y into the coverageMask, the rasterizer will
            // generate the true coverage mask from it
            work.desc.tri.triFlags.coverageMask = tX | (tY << 4);

            // bin it - a simple point hits exactly one macrotile
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
#if KNOB_ENABLE_TOSS_POINTS
            if (!KNOB_TOSS_SETUP_TRIS)
#endif
            {
                pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
            }
            primMask &= ~(1 << primIndex);
        }
    }
    else
    {
        // non simple points need to be potentially binned to multiple macro tiles
        simdscalar vPointSize;
        if (rastState.pointParam)
        {
            // per-vertex point size supplied by the shader
            simdvector size[3];
            pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
            vPointSize = size[0].x;
        }
        else
        {
            vPointSize = _simd_set1_ps(rastState.pointSize);
        }

        // bloat point to bbox
        simdBBox bbox;
        bbox.xmin = bbox.xmax = vXi;
        bbox.ymin = bbox.ymax = vYi;

        simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
        simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
        bbox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
        bbox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
        bbox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
        bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);

        // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
        // Gather the AOS effective scissor rects based on the per-prim VP index.
        /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
        simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
        if (state.gsState.emitsViewportArrayIndex)
        {
            GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
                scisXmin, scisYmin, scisXmax, scisYmax);
        }
        else // broadcast fast path for non-VPAI case.
        {
            scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
            scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
            scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
            scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
        }

        bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
        bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
        bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
        bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);

        // Cull bloated points completely outside scissor
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        primMask = primMask & ~maskOutsideScissor;

        // Convert bbox to macrotile units.
        bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
        bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        // spill macrotile extents to scalar arrays for the per-point bin loop below
        OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
        _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
        _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
        _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);

        // store render target array index
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simdvector vRtai[2];
            pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aPointSize, vPointSize);

        uint32_t *pPrimID = (uint32_t *)&primID;

        OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];

        _simd_store_ps((float*)aPrimVertsX, primVerts.x);
        _simd_store_ps((float*)aPrimVertsY, primVerts.y);
        _simd_store_ps((float*)aPrimVertsZ, primVerts.z);

        // scan remaining valid prims and bin each separately
        const SWR_BACKEND_STATE& backendState = state.backendState;
        DWORD primIndex;
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = backendState.numAttributes;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            desc.triFlags.pointSize = aPointSize[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
            desc.triFlags.viewportIndex = pViewportIndex[primIndex];

            work.pfnWork = RasterizeTriPoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store active attribs
            desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
            desc.numAttribs = linkageCount;
            pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);

            // store point vertex data
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *pTriBuffer++ = aPrimVertsX[primIndex];
            *pTriBuffer++ = aPrimVertsY[primIndex];
            *pTriBuffer = aPrimVertsZ[primIndex];

            // store user clip distances
            if (rastState.clipDistanceMask)
            {
                uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
                desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
                ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
            }

            // enqueue the same work item to every macrotile the bloated bbox overlaps
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
            for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
            {
                for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_SETUP_TRIS)
#endif
                    {
                        pTileMgr->enqueue(x, y, &work);
                    }
                }
            }

            primMask &= ~(1 << primIndex);
        }
    }

    AR_END(FEBinPoints, 1);
}
2533
2534 //////////////////////////////////////////////////////////////////////////
2535 /// @brief Bin SIMD lines to the backend.
2536 /// @param pDC - pointer to draw context.
2537 /// @param pa - The primitive assembly object.
2538 /// @param workerId - thread's worker id. Even thread has a unique id.
2539 /// @param tri - Contains line position data for SIMDs worth of points.
2540 /// @param primID - Primitive ID for each line.
2541 /// @param viewportIdx - Viewport Array Index for each line.
2542 void BinLines(
2543 DRAW_CONTEXT *pDC,
2544 PA_STATE& pa,
2545 uint32_t workerId,
2546 simdvector prim[],
2547 uint32_t primMask,
2548 simdscalari primID,
2549 simdscalari viewportIdx)
2550 {
2551 SWR_CONTEXT *pContext = pDC->pContext;
2552
2553 AR_BEGIN(FEBinLines, pDC->drawId);
2554
2555 const API_STATE& state = GetApiState(pDC);
2556 const SWR_RASTSTATE& rastState = state.rastState;
2557 const SWR_FRONTEND_STATE& feState = state.frontendState;
2558 const SWR_GS_STATE& gsState = state.gsState;
2559
2560 // Select attribute processor
2561 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
2562 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
2563
2564 simdscalar vRecipW0 = _simd_set1_ps(1.0f);
2565 simdscalar vRecipW1 = _simd_set1_ps(1.0f);
2566
2567 if (!feState.vpTransformDisable)
2568 {
2569 // perspective divide
2570 vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
2571 vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);
2572
2573 prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0);
2574 prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1);
2575
2576 prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0);
2577 prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1);
2578
2579 prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0);
2580 prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1);
2581
2582 // viewport transform to screen coords
2583 if (state.gsState.emitsViewportArrayIndex)
2584 {
2585 viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
2586 }
2587 else
2588 {
2589 viewportTransform<2>(prim, state.vpMatrices);
2590 }
2591 }
2592
2593 // adjust for pixel center location
2594 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
2595 prim[0].x = _simd_add_ps(prim[0].x, offset);
2596 prim[0].y = _simd_add_ps(prim[0].y, offset);
2597
2598 prim[1].x = _simd_add_ps(prim[1].x, offset);
2599 prim[1].y = _simd_add_ps(prim[1].y, offset);
2600
2601 // convert to fixed point
2602 simdscalari vXi[2], vYi[2];
2603 vXi[0] = fpToFixedPointVertical(prim[0].x);
2604 vYi[0] = fpToFixedPointVertical(prim[0].y);
2605 vXi[1] = fpToFixedPointVertical(prim[1].x);
2606 vYi[1] = fpToFixedPointVertical(prim[1].y);
2607
2608 // compute x-major vs y-major mask
2609 simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
2610 simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
2611 simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
2612 uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);
2613
2614 // cull zero-length lines
2615 simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
2616 vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));
2617
2618 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
2619
2620 uint32_t *pPrimID = (uint32_t *)&primID;
2621 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
2622
2623 simdscalar vUnused = _simd_setzero_ps();
2624
2625 // Calc bounding box of lines
2626 simdBBox bbox;
2627 bbox.xmin = _simd_min_epi32(vXi[0], vXi[1]);
2628 bbox.xmax = _simd_max_epi32(vXi[0], vXi[1]);
2629 bbox.ymin = _simd_min_epi32(vYi[0], vYi[1]);
2630 bbox.ymax = _simd_max_epi32(vYi[0], vYi[1]);
2631
2632 // bloat bbox by line width along minor axis
2633 simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
2634 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2635 simdBBox bloatBox;
2636 bloatBox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
2637 bloatBox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
2638 bloatBox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
2639 bloatBox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
2640
2641 bbox.xmin = _simd_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
2642 bbox.xmax = _simd_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
2643 bbox.ymin = _simd_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
2644 bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
2645
2646 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2647 simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
2648 if (state.gsState.emitsViewportArrayIndex)
2649 {
2650 GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
2651 scisXmin, scisYmin, scisXmax, scisYmax);
2652 }
2653 else // broadcast fast path for non-VPAI case.
2654 {
2655 scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
2656 scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
2657 scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
2658 scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
2659 }
2660
2661 bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
2662 bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
2663 bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
2664 bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
2665
2666 // Cull prims completely outside scissor
2667 {
2668 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
2669 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
2670 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
2671 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
2672 primMask = primMask & ~maskOutsideScissor;
2673 }
2674
2675 if (!primMask)
2676 {
2677 goto endBinLines;
2678 }
2679
        // Convert line bbox to macrotile units.
2681 bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2682 bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2683 bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2684 bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2685
2686 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
2687 _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
2688 _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
2689 _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
2690 _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
2691
2692 // transpose verts needed for backend
2693 /// @todo modify BE to take non-transformed verts
2694 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
2695 vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
2696 vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
2697 vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
2698 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);
2699
2700 // store render target array index
2701 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2702 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2703 {
2704 simdvector vRtai[2];
2705 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
2706 simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
2707 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2708 }
2709 else
2710 {
2711 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2712 }
2713
2714 // scan remaining valid prims and bin each separately
2715 DWORD primIndex;
2716 while (_BitScanForward(&primIndex, primMask))
2717 {
2718 uint32_t linkageCount = state.backendState.numAttributes;
2719 uint32_t numScalarAttribs = linkageCount * 4;
2720
2721 BE_WORK work;
2722 work.type = DRAW;
2723
2724 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2725
2726 desc.triFlags.frontFacing = 1;
2727 desc.triFlags.primID = pPrimID[primIndex];
2728 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
2729 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2730 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
2731
2732 work.pfnWork = RasterizeLine;
2733
2734 auto pArena = pDC->pArena;
2735 SWR_ASSERT(pArena != nullptr);
2736
2737 // store active attribs
2738 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2739 desc.numAttribs = linkageCount;
2740 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
2741
2742 // store line vertex data
2743 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
2744 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
2745 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
2746 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
2747 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
2748
2749 // store user clip distances
2750 if (rastState.clipDistanceMask)
2751 {
2752 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2753 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
2754 ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
2755 }
2756
2757 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2758 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2759 {
2760 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2761 {
2762 #if KNOB_ENABLE_TOSS_POINTS
2763 if (!KNOB_TOSS_SETUP_TRIS)
2764 #endif
2765 {
2766 pTileMgr->enqueue(x, y, &work);
2767 }
2768 }
2769 }
2770
2771 primMask &= ~(1 << primIndex);
2772 }
2773
2774 endBinLines:
2775
2776 AR_END(FEBinLines, 1);
2777 }