a49ec7a9fbb2f2f7e57dc32664ccffa2b7c20a6a
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "rasterizer.h"
36 #include "conservativeRast.h"
37 #include "utils.h"
38 #include "threads.h"
39 #include "pa.h"
40 #include "clip.h"
41 #include "tilemgr.h"
42 #include "tessellator.h"
43 #include <limits>
44
45 //////////////////////////////////////////////////////////////////////////
46 /// @brief Helper macro to generate a bitmask
47 static INLINE uint32_t GenMask(uint32_t numBits)
48 {
49 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
50 return ((1U << numBits) - 1);
51 }
52
//////////////////////////////////////////////////////////////////////////
/// @brief Offsets added to post-viewport vertex positions based on
///        raster state. Indexed by SWR_PIXEL_LOCATION: pixel-center
///        rasterization adds no offset; upper-left adds half a pixel,
///        broadcast across all SIMD lanes.
static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
{
    _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
    _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
};
61
62 //////////////////////////////////////////////////////////////////////////
63 /// @brief FE handler for SwrSync.
64 /// @param pContext - pointer to SWR context.
65 /// @param pDC - pointer to draw context.
66 /// @param workerId - thread's worker id. Even thread has a unique id.
67 /// @param pUserData - Pointer to user data passed back to sync callback.
68 /// @todo This should go away when we switch this to use compute threading.
69 void ProcessSync(
70 SWR_CONTEXT *pContext,
71 DRAW_CONTEXT *pDC,
72 uint32_t workerId,
73 void *pUserData)
74 {
75 BE_WORK work;
76 work.type = SYNC;
77 work.pfnWork = ProcessSyncBE;
78
79 MacroTileMgr *pTileMgr = pDC->pTileMgr;
80 pTileMgr->enqueue(0, 0, &work);
81 }
82
83 //////////////////////////////////////////////////////////////////////////
84 /// @brief FE handler for SwrClearRenderTarget.
85 /// @param pContext - pointer to SWR context.
86 /// @param pDC - pointer to draw context.
87 /// @param workerId - thread's worker id. Even thread has a unique id.
88 /// @param pUserData - Pointer to user data passed back to clear callback.
89 /// @todo This should go away when we switch this to use compute threading.
90 void ProcessClear(
91 SWR_CONTEXT *pContext,
92 DRAW_CONTEXT *pDC,
93 uint32_t workerId,
94 void *pUserData)
95 {
96 CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
97 MacroTileMgr *pTileMgr = pDC->pTileMgr;
98
99 // queue a clear to each macro tile
100 // compute macro tile bounds for the specified rect
101 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
102 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
103 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
104 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
105
106 BE_WORK work;
107 work.type = CLEAR;
108 work.pfnWork = ProcessClearBE;
109 work.desc.clear = *pDesc;
110
111 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
112 {
113 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
114 {
115 pTileMgr->enqueue(x, y, &work);
116 }
117 }
118 }
119
120 //////////////////////////////////////////////////////////////////////////
121 /// @brief FE handler for SwrStoreTiles.
122 /// @param pContext - pointer to SWR context.
123 /// @param pDC - pointer to draw context.
124 /// @param workerId - thread's worker id. Even thread has a unique id.
125 /// @param pUserData - Pointer to user data passed back to callback.
126 /// @todo This should go away when we switch this to use compute threading.
127 void ProcessStoreTiles(
128 SWR_CONTEXT *pContext,
129 DRAW_CONTEXT *pDC,
130 uint32_t workerId,
131 void *pUserData)
132 {
133 RDTSC_START(FEProcessStoreTiles);
134 MacroTileMgr *pTileMgr = pDC->pTileMgr;
135 STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
136
137 // queue a store to each macro tile
138 // compute macro tile bounds for the specified rect
139 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
140 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
141 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
142 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
143
144 // store tiles
145 BE_WORK work;
146 work.type = STORETILES;
147 work.pfnWork = ProcessStoreTileBE;
148 work.desc.storeTiles = *pDesc;
149
150 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
151 {
152 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
153 {
154 pTileMgr->enqueue(x, y, &work);
155 }
156 }
157
158 RDTSC_STOP(FEProcessStoreTiles, 0, pDC->drawId);
159 }
160
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrInvalidateTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pUserData - Pointer to user data (DISCARD_INVALIDATE_TILES_DESC).
/// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    RDTSC_START(FEProcessInvalidateTiles);
    DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    // compute macro tile bounds for the specified rect
    // Default: only macrotiles fully contained in the rect (min rounded up,
    // max rounded down).
    // NOTE(review): these unsigned subtractions wrap when the rect is
    // narrower/shorter than one macrotile (e.g. rect.xmax < KNOB_MACROTILE_X_DIM
    // makes macroTileXMax huge) -- presumably callers pass tile-aligned rects
    // when fullTilesOnly is set; verify against callers.
    uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
    uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;

    if (pDesc->fullTilesOnly == false)
    {
        // include partial tiles
        macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
        macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
        macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
        macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    }

    SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);

    // Clamp to the hot-tile array bounds.
    // NOTE(review): clamping to KNOB_NUM_HOT_TILES_X/Y (a count) rather than
    // count - 1 (the last valid index) looks suspicious, as does the
    // signed-int32 min on an unsigned value -- confirm the intended range.
    macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
    macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);

    // load tiles
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pDesc;

    // Enqueue a work item to each macrotile in the inclusive range.
    for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    {
        for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_STOP(FEProcessInvalidateTiles, 0, pDC->drawId);
}
215
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of primitives given the number of verts.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of vertices or indices for draw.
/// @return primitive count assembled from that many verts; 0 when there
///         are too few verts for one primitive or the topology is
///         unsupported.
/// @todo Frontend needs to be refactored. This will go in appropriate place then.
uint32_t GetNumPrims(
    PRIMITIVE_TOPOLOGY mode,
    uint32_t numPrims)
{
    switch (mode)
    {
    case TOP_POINT_LIST: return numPrims;
    case TOP_TRIANGLE_LIST: return numPrims / 3;
    case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_QUAD_LIST: return numPrims / 4;
    case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
    case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_LINE_LIST: return numPrims / 2;
    case TOP_LINE_LOOP: return numPrims;
    case TOP_RECT_LIST: return numPrims / 3;
    case TOP_LINE_LIST_ADJ: return numPrims / 4;
    case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
    case TOP_TRI_LIST_ADJ: return numPrims / 6;
    case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;

    // Patch lists: the control-point count per patch is encoded in the
    // topology enum as the offset from TOP_PATCHLIST_BASE.
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims / (mode - TOP_PATCHLIST_BASE);

    // Topologies this frontend does not assemble.
    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_ASSERT(false, "Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}
292
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of verts given the number of primitives.
///        Inverse of GetNumPrims.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of primitives for draw.
/// @return vertex count needed to produce numPrims primitives; 0 for
///         unsupported topologies.
uint32_t GetNumVerts(
    PRIMITIVE_TOPOLOGY mode,
    uint32_t numPrims)
{
    switch (mode)
    {
    case TOP_POINT_LIST: return numPrims;
    case TOP_TRIANGLE_LIST: return numPrims * 3;
    case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
    case TOP_QUAD_LIST: return numPrims * 4;
    case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
    case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
    case TOP_LINE_LIST: return numPrims * 2;
    case TOP_LINE_LOOP: return numPrims;
    case TOP_RECT_LIST: return numPrims * 3;
    case TOP_LINE_LIST_ADJ: return numPrims * 4;
    case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
    case TOP_TRI_LIST_ADJ: return numPrims * 6;
    case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;

    // Patch lists: the control-point count per patch is encoded in the
    // topology enum as the offset from TOP_PATCHLIST_BASE.
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims * (mode - TOP_PATCHLIST_BASE);

    // Topologies this frontend does not assemble.
    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_ASSERT(false, "Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}
368
//////////////////////////////////////////////////////////////////////////
/// @brief Return number of verts per primitive.
/// @param topology - topology
/// @param includeAdjVerts - include adjacent verts in primitive vertices
/// @return verts per primitive for the topology (strips count as their
///         list-equivalent primitive size); 0 for unsupported topologies.
INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
{
    uint32_t numVerts = 0;
    switch (topology)
    {
    case TOP_POINT_LIST:
    case TOP_POINT_LIST_BF:
        numVerts = 1;
        break;
    case TOP_LINE_LIST:
    case TOP_LINE_STRIP:
    case TOP_LINE_LIST_ADJ:
    case TOP_LINE_LOOP:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LISTSTRIP_ADJ:
        numVerts = 2;
        break;
    case TOP_TRIANGLE_LIST:
    case TOP_TRIANGLE_STRIP:
    case TOP_TRIANGLE_FAN:
    case TOP_TRI_LIST_ADJ:
    case TOP_TRI_STRIP_ADJ:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_RECT_LIST:
        numVerts = 3;
        break;
    case TOP_QUAD_LIST:
    case TOP_QUAD_STRIP:
        numVerts = 4;
        break;
    // Patch lists: the control-point count per patch is encoded in the
    // topology enum as the offset from TOP_PATCHLIST_BASE.
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        numVerts = topology - TOP_PATCHLIST_BASE;
        break;
    default:
        SWR_ASSERT(false, "Unsupported topology: %d", topology);
        break;
    }

    // Adjacency topologies carry extra neighbor verts per primitive; widen
    // the count when the caller wants those included.
    if (includeAdjVerts)
    {
        switch (topology)
        {
        case TOP_LISTSTRIP_ADJ:
        case TOP_LINE_LIST_ADJ: numVerts = 4; break;
        case TOP_TRI_STRIP_ADJ:
        case TOP_TRI_LIST_ADJ: numVerts = 6; break;
        default: break;
        }
    }

    return numVerts;
}
457
458 //////////////////////////////////////////////////////////////////////////
459 /// @brief Generate mask from remaining work.
460 /// @param numWorkItems - Number of items being worked on by a SIMD.
461 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
462 {
463 uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
464 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
465 return _simd_castps_si(vMask(mask));
466 }
467
468
//////////////////////////////////////////////////////////////////////////
/// @brief Gather scissor rect data based on per-prim viewport indices.
/// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
/// @param pViewportIndex - array of per-primitive viewport indexes.
/// @param scisXmin - output vector of per-primitive scissor rect Xmin data.
/// @param scisYmin - output vector of per-primitive scissor rect Ymin data.
/// @param scisXmax - output vector of per-primitive scissor rect Xmax data.
/// @param scisYmax - output vector of per-primitive scissor rect Ymax data.
//
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
template<size_t SimdWidth>
struct GatherScissors
{
    // Primary template: only explicit specializations for supported SIMD
    // widths are implemented; instantiating any other width asserts.
    static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
        simdscalari &scisXmin, simdscalari &scisYmin,
        simdscalari &scisXmax, simdscalari &scisYmax)
    {
        SWR_ASSERT(0, "Unhandled Simd Width in Scissor Rect Gather");
    }
};
489
//////////////////////////////////////////////////////////////////////////
/// @brief 8-wide specialization: for each of 8 primitives, loads the
///        component of the scissor rect selected by that primitive's
///        viewport index into the corresponding SIMD lane.
template<>
struct GatherScissors<8>
{
    static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
        simdscalari &scisXmin, simdscalari &scisYmin,
        simdscalari &scisXmax, simdscalari &scisYmax)
    {
        // One _simd_set_epi32 per rect component; element i reads the
        // scissor rect indexed by pViewportIndex[i].
        scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
            pScissorsInFixedPoint[pViewportIndex[1]].xmin,
            pScissorsInFixedPoint[pViewportIndex[2]].xmin,
            pScissorsInFixedPoint[pViewportIndex[3]].xmin,
            pScissorsInFixedPoint[pViewportIndex[4]].xmin,
            pScissorsInFixedPoint[pViewportIndex[5]].xmin,
            pScissorsInFixedPoint[pViewportIndex[6]].xmin,
            pScissorsInFixedPoint[pViewportIndex[7]].xmin);
        scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin,
            pScissorsInFixedPoint[pViewportIndex[1]].ymin,
            pScissorsInFixedPoint[pViewportIndex[2]].ymin,
            pScissorsInFixedPoint[pViewportIndex[3]].ymin,
            pScissorsInFixedPoint[pViewportIndex[4]].ymin,
            pScissorsInFixedPoint[pViewportIndex[5]].ymin,
            pScissorsInFixedPoint[pViewportIndex[6]].ymin,
            pScissorsInFixedPoint[pViewportIndex[7]].ymin);
        scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax,
            pScissorsInFixedPoint[pViewportIndex[1]].xmax,
            pScissorsInFixedPoint[pViewportIndex[2]].xmax,
            pScissorsInFixedPoint[pViewportIndex[3]].xmax,
            pScissorsInFixedPoint[pViewportIndex[4]].xmax,
            pScissorsInFixedPoint[pViewportIndex[5]].xmax,
            pScissorsInFixedPoint[pViewportIndex[6]].xmax,
            pScissorsInFixedPoint[pViewportIndex[7]].xmax);
        scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax,
            pScissorsInFixedPoint[pViewportIndex[1]].ymax,
            pScissorsInFixedPoint[pViewportIndex[2]].ymax,
            pScissorsInFixedPoint[pViewportIndex[3]].ymax,
            pScissorsInFixedPoint[pViewportIndex[4]].ymax,
            pScissorsInFixedPoint[pViewportIndex[5]].ymax,
            pScissorsInFixedPoint[pViewportIndex[6]].ymax,
            pScissorsInFixedPoint[pViewportIndex[7]].ymax);
    }
};
531
//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMDs worth of triangles.
/// @param pDC - pointer to draw context.
/// @param pa - primitive assembly object supplying the assembled prims.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pPrimData - scratch buffer the SO shader reads vertex attribs from.
/// @param streamIndex - which SO stream to write.
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
    uint32_t streamIndex)
{
    RDTSC_START(FEStreamout);

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    uint32_t numPrims = pa.NumPrims();
    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        // soMask has one bit per attribute slot enabled for this stream.
        while (_BitScanForward(&slot, soMask))
        {
            __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is relative dword offset from start of vertex.
            // NOTE(review): an earlier comment said this should write at
            // (slot - 1) because PA attributes start at slot 1, but the code
            // offsets by `slot` directly (noted then as the GL convention) --
            // confirm which layout the SO shader expects.
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }
            // Clear the processed slot and move to the next set bit.
            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    RDTSC_STOP(FEStreamout, 1, 0);
}
620
621 //////////////////////////////////////////////////////////////////////////
622 /// @brief Computes number of invocations. The current index represents
623 /// the start of the SIMD. The max index represents how much work
624 /// items are remaining. If there is less then a SIMD's xmin of work
625 /// then return the remaining amount of work.
626 /// @param curIndex - The start index for the SIMD.
627 /// @param maxIndex - The last index for all work items.
628 static INLINE uint32_t GetNumInvocations(
629 uint32_t curIndex,
630 uint32_t maxIndex)
631 {
632 uint32_t remainder = (maxIndex - curIndex);
633 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
634 }
635
636 //////////////////////////////////////////////////////////////////////////
637 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
638 /// The geometry shader will loop over each active streamout buffer, assembling
639 /// primitives for the downstream stages. When multistream output is enabled,
640 /// the generated stream ID buffer from the GS needs to be converted to a cut
641 /// buffer for the primitive assembler.
642 /// @param stream - stream id to generate the cut buffer for
643 /// @param pStreamIdBase - pointer to the stream ID buffer
644 /// @param numEmittedVerts - Number of total verts emitted by the GS
645 /// @param pCutBuffer - output buffer to write cuts to
646 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
647 {
648 SWR_ASSERT(stream < MAX_SO_STREAMS);
649
650 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
651 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
652
653 for (uint32_t b = 0; b < numOutputBytes; ++b)
654 {
655 uint8_t curInputByte = pStreamIdBase[2*b];
656 uint8_t outByte = 0;
657 for (uint32_t i = 0; i < 4; ++i)
658 {
659 if ((curInputByte & 0x3) != stream)
660 {
661 outByte |= (1 << i);
662 }
663 curInputByte >>= 2;
664 }
665
666 curInputByte = pStreamIdBase[2 * b + 1];
667 for (uint32_t i = 0; i < 4; ++i)
668 {
669 if ((curInputByte & 0x3) != stream)
670 {
671 outByte |= (1 << (i + 4));
672 }
673 curInputByte >>= 2;
674 }
675
676 *pCutBuffer++ = outByte;
677 }
678 }
679
680 THREAD SWR_GS_CONTEXT tlsGsContext;
681
//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage: assembles input-primitive attributes into
///        the thread-local GS context, invokes the GS jit once per GS
///        instance, then re-assembles the emitted verts per stream and
///        forwards them to streamout and/or the clipper/binner.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - cut (or stream-ID) buffer written by the GS.
/// @param pStreamCutBuffer - scratch cut buffer for multi-stream translation.
/// @param pSoPrimData - scratch buffer for streamout prim data.
/// @param primID - per-lane primitive IDs of the input prims.
template <
    typename HasStreamOutT,
    typename HasRastT>
static void GeometryShaderStage(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pStreamCutBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    RDTSC_START(FEGeometryShader);

    const API_STATE& state = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;

    SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
    SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");

    tlsGsContext.pStream = (uint8_t*)pGsOut;
    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
    tlsGsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_ATTRIBUTES];

    // assemble all attributes for the input primitive
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
    }

    // Strides through the GS output and cut buffers. Per input prim the GS
    // may emit up to maxNumVerts verts, stored in SIMD batches.
    const uint32_t vertexStride = sizeof(simdvertex);
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    const uint32_t inputPrimStride = numSimdBatches * vertexStride;
    const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
    uint32_t cutPrimStride;
    uint32_t cutInstanceStride;

    if (pState->isSingleStream)
    {
        // single stream: 1 cut bit per emitted vert
        cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }
    else
    {
        // multi-stream: 2 stream-ID bits per emitted vert, dword-aligned
        cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
    uint32_t numInputPrims = pa.NumPrims();

    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        tlsGsContext.InstanceID = instance;
        tlsGsContext.mask = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);

        tlsGsContext.pStream += instanceStride;
        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
    }

    // set up new binner and state for the GS output topology
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    // NOTE(review): tlsGsContext.vertexCount is overwritten by each GS
    // instance invocation above, yet pVertexCount[inputPrim] is read for
    // every instance below -- presumably each instance emits the same
    // count, or only the last instance's counts are intended; verify.
    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * instanceStride;
            uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;

            uint32_t numAttribs = state.feNumAttributes;

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool processCutVerts = false;

                // NOTE(review): this local intentionally(?) shadows the
                // function parameter `pCutBuffer` -- consider renaming.
                uint8_t* pCutBuffer = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID = pState->singleStreamID;
                    if (streamID != stream) continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
                    pCutBuffer = (uint8_t*)pStreamCutBuffer;
                    processCutVerts = false;
                }

                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);

                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
                                simdscalari vPrimId;
                                // pull primitiveID from the GS output if available
                                if (state.gsState.emitsPrimitiveID)
                                {
                                    simdvector primIdAttrib[3];
                                    gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
                                    vPrimId = _simd_castps_si(primIdAttrib[0].x);
                                }
                                else
                                {
                                    // otherwise replicate the input prim's ID
                                    vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
                                }

                                // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
                                simdscalari vViewPortIdx;
                                if (state.gsState.emitsViewportArrayIndex)
                                {
                                    simdvector vpiAttrib[3];
                                    gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);

                                    // OOB indices => forced to zero.
                                    simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simdscalari vClearMask = _simd_cmplt_epi32(_simd_castps_si(vpiAttrib[0].x), vNumViewports);
                                    vpiAttrib[0].x = _simd_and_ps(_simd_castsi_ps(vClearMask), vpiAttrib[0].x);

                                    vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
                                }
                                else
                                {
                                    vViewPortIdx = _simd_set1_epi32(0);
                                }

                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);

    RDTSC_STOP(FEGeometryShader, 1, 0);
}
900
901 //////////////////////////////////////////////////////////////////////////
902 /// @brief Allocate GS buffers
903 /// @param pDC - pointer to draw context.
904 /// @param state - API state
905 /// @param ppGsOut - pointer to GS output buffer allocation
906 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
907 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
908 void **ppStreamCutBuffer)
909 {
910 auto pArena = pDC->pArena;
911 SWR_ASSERT(pArena != nullptr);
912 SWR_ASSERT(state.gsState.gsEnable);
913 // allocate arena space to hold GS output verts
914 // @todo pack attribs
915 // @todo support multiple streams
916 const uint32_t vertexStride = sizeof(simdvertex);
917 const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
918 uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
919 *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));
920
921 const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
922 const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
923 const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
924 const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
925
926 // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
927 // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
928
929 // allocate space for temporary per-stream cut buffer if multi-stream is enabled
930 if (state.gsState.isSingleStream)
931 {
932 *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
933 *ppStreamCutBuffer = nullptr;
934 }
935 else
936 {
937 *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
938 *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
939 }
940
941 }
942
//////////////////////////////////////////////////////////////////////////
/// @brief Contains all data generated by the HS and passed to the
/// tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;                   // input/output context for the hull shader
    ScalarPatch patchData[KNOB_SIMD_WIDTH];     // HS control-point output, one patch per SIMD lane

    void* pTxCtx;                               // backing storage for the tessellator context
    size_t tsCtxSize;                           // size in bytes of pTxCtx (written by TSInitCtx)

    simdscalar* pDSOutput;                      // domain shader output buffer, grown on demand
    size_t numDSOutputVectors;                  // current capacity of pDSOutput in simd vectors
};
956
// Per-worker-thread tessellation scratch storage; lazily allocated on first use.
THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
958
959 //////////////////////////////////////////////////////////////////////////
960 /// @brief Allocate tessellation data for this worker thread.
961 INLINE
962 static void AllocateTessellationData(SWR_CONTEXT* pContext)
963 {
964 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
965 if (gt_pTessellationThreadData == nullptr)
966 {
967 gt_pTessellationThreadData = (TessellationThreadLocalData*)
968 AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
969 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
970 }
971 }
972
//////////////////////////////////////////////////////////////////////////
/// @brief Implements Tessellation Stages.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - cut buffer for GS output (stream-id buffer when multi-stream)
/// @param pCutStreamBuffer - temporary per-stream cut buffer (multi-stream GS only)
/// @param pSoPrimData - scratch space for stream-out primitive data
/// @param primID - per-lane primitive IDs for the input SIMD of patches
template <
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
static void TessellationStages(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pCutStreamBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    const API_STATE& state = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;

    SWR_ASSERT(gt_pTessellationThreadData);

    HANDLE tsCtx = TSInitCtx(
        tsState.domain,
        tsState.partitioning,
        tsState.tsOutputTopology,
        gt_pTessellationThreadData->pTxCtx,
        gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        // first call reported the required context size in tsCtxSize;
        // allocate backing memory and retry
        gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(
            tsState.domain,
            tsState.partitioning,
            tsState.tsOutputTopology,
            gt_pTessellationThreadData->pTxCtx,
            gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

    // select the binner matching the DS output topology when rasterizing
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.pCPout = gt_pTessellationThreadData->patchData;
    hsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
        }
    }

#if defined(_DEBUG)
    // poison HS output so stale reads are obvious in debug builds
    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

    uint32_t numPrims = pa.NumPrims();
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    RDTSC_START(FEHullShader);
    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
    RDTSC_STOP(FEHullShader, 0, 0);

    UPDATE_STAT_FE(HsInvocations, numPrims);

    const uint32_t* pPrimId = (const uint32_t*)&primID;

    // tessellate, domain-shade, and bin each patch of the SIMD individually
    for (uint32_t p = 0; p < numPrims; ++p)
    {
        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = { 0 };
        RDTSC_START(FETessellation);
        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
        RDTSC_STOP(FETessellation, 0, 0);

        if (tsData.NumPrimitives == 0)
        {
            // patch culled by its tess factors; nothing to shade or bin
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory
        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
        if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
        {
            // grow the persistent per-thread DS output buffer
            AlignedFree(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);

#if defined(_DEBUG)
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID = pPrimId[p];
        dsContext.pCpIn = &hsContext.pCPout[p];
        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
        dsContext.vectorStride = requiredDSVectorInvocations;

        uint32_t dsInvocations = 0;

        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
        {
            // mask off lanes beyond the last domain point
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            RDTSC_START(FEDomainShader);
            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
            RDTSC_STOP(FEDomainShader, 0, 0);

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);

        // assemble the tessellator-generated primitives from the DS output
        PA_TESS tessPa(
            pDC,
            dsContext.pOutputData,
            dsContext.vectorStride,
            tsState.numDsOutputAttribs,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology);

        while (tessPa.HasWork())
        {
            if (HasGeometryShaderT::value)
            {
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
            }
            else
            {
                if (HasStreamOutT::value)
                {
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
                    simdvector prim[3]; // Only deal with triangles, lines, or points
                    RDTSC_START(FEPAAssemble);
#if SWR_ENABLE_ASSERTS
                    bool assemble =
#endif
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
                    RDTSC_STOP(FEPAAssemble, 1, 0);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
                    pfnClipFunc(pDC, tessPa, workerId, prim,
                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    } // for (uint32_t p = 0; p < numPrims; ++p)

    TSDestroyCtx(tsCtx);
}
1166
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDraw.
/// @tparam IsIndexedT - Is indexed drawing enabled
/// @tparam IsCutIndexEnabledT - Is the cut (primitive restart) index enabled
/// @tparam HasTessellationT - Is tessellation enabled
/// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
/// @tparam HasStreamOutT - Is stream-out enabled
/// @tparam HasRastT - Is rasterization enabled
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id.
/// @param pUserData - Pointer to DRAW_WORK
template <
    typename IsIndexedT,
    typename IsCutIndexEnabledT,
    typename HasTessellationT,
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
void ProcessDraw(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{

#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif

    RDTSC_START(FEProcessDraw);

    DRAW_WORK& work = *(DRAW_WORK*)pUserData;
    const API_STATE& state = GetApiState(pDC);
    // per-lane offsets 0..7 used to synthesize sequential vertex indices
    __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    SWR_VS_CONTEXT vsContext;
    simdvertex vin;

    int indexSize = 0;
    uint32_t endVertex = work.numVerts;

    // for indexed draws, compute the address one past the last index consumed
    const int32_t* pLastRequestedIndex = nullptr;
    if (IsIndexedT::value)
    {
        switch (work.type)
        {
        case R32_UINT:
            indexSize = sizeof(uint32_t);
            pLastRequestedIndex = &(work.pIB[endVertex]);
            break;
        case R16_UINT:
            indexSize = sizeof(uint16_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
            break;
        case R8_UINT:
            indexSize = sizeof(uint8_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
            break;
        default:
            SWR_ASSERT(0);
        }
    }
    else
    {
        // No cuts, prune partial primitives.
        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
    }

    SWR_FETCH_CONTEXT fetchInfo = { 0 };
    fetchInfo.pStreams = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex = 0;

    vsContext.pVin = &vin;

    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }

#ifdef KNOB_ENABLE_RDTSC
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    // GS scratch buffers; arena-backed, released with the draw context
    void* pGsOut = nullptr;
    void* pCutBuffer = nullptr;
    void* pStreamCutBuffer = nullptr;
    if (HasGeometryShaderT::value)
    {
        AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
    }

    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
    }

    // choose primitive assembler
    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
    PA_STATE& pa = paFactory.GetPA();

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        uint32_t i = 0;     // vertex counter for the current instance

        if (IsIndexedT::value)
        {
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            // non-indexed: synthesize sequential per-lane indices
            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVout = &vout;

            if (i < endVertex)
            {

                // 1. Execute FS/VS for a single SIMD.
                RDTSC_START(FEFetchShader);
                state.pfnFetchFunc(fetchInfo, vin);
                RDTSC_STOP(FEFetchShader, 0, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Setup active mask for vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    RDTSC_START(FEVertexShader);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
                    RDTSC_STOP(FEVertexShader, 0, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // PaAssemble returns false if there is not enough verts to assemble.
                RDTSC_START(FEPAAssemble);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                RDTSC_STOP(FEPAAssemble, 1, 0);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            // route assembled prims through the enabled pipeline stages
                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);
                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            // advance to the next SIMD of vertices
            i += KNOB_SIMD_WIDTH;
            if (IsIndexedT::value)
            {
                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }
        }
        pa.Reset();
    }

    RDTSC_STOP(FEProcessDraw, numPrims * work.numInstances, pDC->drawId);
}
1433
// Functor consumed by TemplateArgUnroller to instantiate the
// correctly-specialized templated ProcessDraw front-end function.
struct FEDrawChooser
{
    typedef PFN_FE_WORK_FUNC FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessDraw<ArgsB...>;
    }
};
1444
1445
// Selector for correct templated Draw front-end function
// Maps the runtime pipeline flags to the matching ProcessDraw template
// instantiation via TemplateArgUnroller.
PFN_FE_WORK_FUNC GetProcessDrawFunc(
    bool IsIndexed,
    bool IsCutIndexEnabled,
    bool HasTessellation,
    bool HasGeometryShader,
    bool HasStreamOut,
    bool HasRasterization)
{
    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
}
1457
1458 //////////////////////////////////////////////////////////////////////////
1459 /// @brief Processes attributes for the backend based on linkage mask and
1460 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
1461 /// @param pDC - Draw context
1462 /// @param pa - Primitive Assembly state
1463 /// @param linkageMask - Specifies which VS outputs are routed to PS.
1464 /// @param pLinkageMap - maps VS attribute slot to PS slot
1465 /// @param triIndex - Triangle to process attributes for
1466 /// @param pBuffer - Output result
1467 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
1468 INLINE void ProcessAttributes(
1469 DRAW_CONTEXT *pDC,
1470 PA_STATE&pa,
1471 uint32_t triIndex,
1472 uint32_t primId,
1473 float *pBuffer)
1474 {
1475 static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
1476 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1477 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
1478 LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
1479 const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
1480 const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
1481
1482 static const float constTable[3][4] = {
1483 {0.0f, 0.0f, 0.0f, 0.0f},
1484 {0.0f, 0.0f, 0.0f, 1.0f},
1485 {1.0f, 1.0f, 1.0f, 1.0f}
1486 };
1487
1488 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
1489 {
1490 uint32_t inputSlot;
1491 if (IsSwizzledT::value)
1492 {
1493 SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
1494 inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib;
1495
1496 }
1497 else
1498 {
1499 inputSlot = VERTEX_ATTRIB_START_SLOT + i;
1500 }
1501
1502 __m128 attrib[3]; // triangle attribs (always 4 wide)
1503 float* pAttribStart = pBuffer;
1504
1505 if (HasConstantInterpT::value || IsDegenerate::value)
1506 {
1507 if (_bittest(&constantInterpMask, i))
1508 {
1509 uint32_t vid;
1510 uint32_t adjustedTriIndex;
1511 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
1512 static const int32_t quadProvokingTri[2][4] = { {0, 0, 0, 1}, {0, -1, 0, 0} };
1513 static const uint32_t quadProvokingVertex[2][4] = { {0, 1, 2, 2}, {0, 1, 1, 2} };
1514 static const int32_t qstripProvokingTri[2][4] = { {0, 0, 0, 1}, {-1, 0, 0, 0} };
1515 static const uint32_t qstripProvokingVertex[2][4] = { {0, 1, 2, 1}, {0, 0, 2, 1} };
1516
1517 switch (topo) {
1518 case TOP_QUAD_LIST:
1519 adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
1520 vid = quadProvokingVertex[triIndex & 1][provokingVertex];
1521 break;
1522 case TOP_QUAD_STRIP:
1523 adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
1524 vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
1525 break;
1526 case TOP_TRIANGLE_STRIP:
1527 adjustedTriIndex = triIndex;
1528 vid = (triIndex & 1)
1529 ? tristripProvokingVertex[provokingVertex]
1530 : provokingVertex;
1531 break;
1532 default:
1533 adjustedTriIndex = triIndex;
1534 vid = provokingVertex;
1535 break;
1536 }
1537
1538 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
1539
1540 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1541 {
1542 _mm_store_ps(pBuffer, attrib[vid]);
1543 pBuffer += 4;
1544 }
1545 }
1546 else
1547 {
1548 pa.AssembleSingle(inputSlot, triIndex, attrib);
1549
1550 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1551 {
1552 _mm_store_ps(pBuffer, attrib[i]);
1553 pBuffer += 4;
1554 }
1555 }
1556 }
1557 else
1558 {
1559 pa.AssembleSingle(inputSlot, triIndex, attrib);
1560
1561 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1562 {
1563 _mm_store_ps(pBuffer, attrib[i]);
1564 pBuffer += 4;
1565 }
1566 }
1567
1568 // pad out the attrib buffer to 3 verts to ensure the triangle
1569 // interpolation code in the pixel shader works correctly for the
1570 // 3 topologies - point, line, tri. This effectively zeros out the
1571 // effect of the missing vertices in the triangle interpolation.
1572 for (uint32_t v = NumVertsT::value; v < 3; ++v)
1573 {
1574 _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
1575 pBuffer += 4;
1576 }
1577
1578 // check for constant source overrides
1579 if (IsSwizzledT::value)
1580 {
1581 uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
1582 if (mask)
1583 {
1584 DWORD comp;
1585 while (_BitScanForward(&comp, mask))
1586 {
1587 mask &= ~(1 << comp);
1588
1589 float constantValue = 0.0f;
1590 switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
1591 {
1592 case SWR_CONSTANT_SOURCE_CONST_0000:
1593 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
1594 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
1595 constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
1596 break;
1597 case SWR_CONSTANT_SOURCE_PRIM_ID:
1598 constantValue = *(float*)&primId;
1599 break;
1600 }
1601
1602 // apply constant value to all 3 vertices
1603 for (uint32_t v = 0; v < 3; ++v)
1604 {
1605 pAttribStart[comp + v * 4] = constantValue;
1606 }
1607 }
1608 }
1609 }
1610 }
1611 }
1612
1613
// Function-pointer type of a specialized ProcessAttributes instantiation:
// (pDC, pa, triIndex, primId, pBuffer)
typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);

// Functor consumed by TemplateArgUnroller to instantiate the
// correctly-specialized ProcessAttributes function.
struct ProcessAttributesChooser
{
    typedef PFN_PROCESS_ATTRIBUTES FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessAttributes<ArgsB...>;
    }
};
1626
// Selects the ProcessAttributes specialization matching the runtime flags.
// NumVerts must be in [1, 3] (enforced by IntArg<1, 3>).
PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
{
    return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
}
1631
1632 //////////////////////////////////////////////////////////////////////////
1633 /// @brief Processes enabled user clip distances. Loads the active clip
1634 /// distances from the PA, sets up barycentric equations, and
1635 /// stores the results to the output buffer
1636 /// @param pa - Primitive Assembly state
1637 /// @param primIndex - primitive index to process
1638 /// @param clipDistMask - mask of enabled clip distances
1639 /// @param pUserClipBuffer - buffer to store results
1640 template<uint32_t NumVerts>
1641 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
1642 {
1643 DWORD clipDist;
1644 while (_BitScanForward(&clipDist, clipDistMask))
1645 {
1646 clipDistMask &= ~(1 << clipDist);
1647 uint32_t clipSlot = clipDist >> 2;
1648 uint32_t clipComp = clipDist & 0x3;
1649 uint32_t clipAttribSlot = clipSlot == 0 ?
1650 VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
1651
1652 __m128 primClipDist[3];
1653 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
1654
1655 float vertClipDist[NumVerts];
1656 for (uint32_t e = 0; e < NumVerts; ++e)
1657 {
1658 OSALIGNSIMD(float) aVertClipDist[4];
1659 _mm_store_ps(aVertClipDist, primClipDist[e]);
1660 vertClipDist[e] = aVertClipDist[clipComp];
1661 };
1662
1663 // setup plane equations for barycentric interpolation in the backend
1664 float baryCoeff[NumVerts];
1665 for (uint32_t e = 0; e < NumVerts - 1; ++e)
1666 {
1667 baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1];
1668 }
1669 baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1];
1670
1671 for (uint32_t e = 0; e < NumVerts; ++e)
1672 {
1673 *(pUserClipBuffer++) = baryCoeff[e];
1674 }
1675 }
1676 }
1677
1678 //////////////////////////////////////////////////////////////////////////
1679 /// @brief Convert the X,Y coords of a triangle to the requested Fixed
1680 /// Point precision from FP32.
1681 template <typename PT = FixedPointTraits<Fixed_16_8>>
1682 INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
1683 {
1684 simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));
1685 return _simd_cvtps_epi32(vFixed);
1686 }
1687
1688 //////////////////////////////////////////////////////////////////////////
1689 /// @brief Helper function to set the X,Y coords of a triangle to the
1690 /// requested Fixed Point precision from FP32.
1691 /// @param tri: simdvector[3] of FP triangle verts
1692 /// @param vXi: fixed point X coords of tri verts
1693 /// @param vYi: fixed point Y coords of tri verts
1694 INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari (&vXi)[3], simdscalari (&vYi)[3])
1695 {
1696 vXi[0] = fpToFixedPointVertical(tri[0].x);
1697 vYi[0] = fpToFixedPointVertical(tri[0].y);
1698 vXi[1] = fpToFixedPointVertical(tri[1].x);
1699 vYi[1] = fpToFixedPointVertical(tri[1].y);
1700 vXi[2] = fpToFixedPointVertical(tri[2].x);
1701 vYi[2] = fpToFixedPointVertical(tri[2].y);
1702 }
1703
1704 //////////////////////////////////////////////////////////////////////////
1705 /// @brief Calculate bounding box for current triangle
1706 /// @tparam CT: ConservativeRastFETraits type
1707 /// @param vX: fixed point X position for triangle verts
1708 /// @param vY: fixed point Y position for triangle verts
1709 /// @param bbox: fixed point bbox
1710 /// *Note*: expects vX, vY to be in the correct precision for the type
1711 /// of rasterization. This avoids unnecessary FP->fixed conversions.
1712 template <typename CT>
1713 INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1714 {
1715 simdscalari vMinX = vX[0];
1716 vMinX = _simd_min_epi32(vMinX, vX[1]);
1717 vMinX = _simd_min_epi32(vMinX, vX[2]);
1718
1719 simdscalari vMaxX = vX[0];
1720 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1721 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1722
1723 simdscalari vMinY = vY[0];
1724 vMinY = _simd_min_epi32(vMinY, vY[1]);
1725 vMinY = _simd_min_epi32(vMinY, vY[2]);
1726
1727 simdscalari vMaxY = vY[0];
1728 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1729 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1730
1731 bbox.xmin = vMinX;
1732 bbox.xmax = vMaxX;
1733 bbox.ymin = vMinY;
1734 bbox.ymax = vMaxY;
1735 }
1736
1737 //////////////////////////////////////////////////////////////////////////
1738 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
1739 /// Offsets BBox for conservative rast
1740 template <>
1741 INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1742 {
1743 // FE conservative rast traits
1744 typedef FEConservativeRastT CT;
1745
1746 simdscalari vMinX = vX[0];
1747 vMinX = _simd_min_epi32(vMinX, vX[1]);
1748 vMinX = _simd_min_epi32(vMinX, vX[2]);
1749
1750 simdscalari vMaxX = vX[0];
1751 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1752 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1753
1754 simdscalari vMinY = vY[0];
1755 vMinY = _simd_min_epi32(vMinY, vY[1]);
1756 vMinY = _simd_min_epi32(vMinY, vY[2]);
1757
1758 simdscalari vMaxY = vY[0];
1759 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1760 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1761
1762 /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
1763 /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
1764 bbox.xmin = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1765 bbox.xmax = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1766 bbox.ymin = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1767 bbox.ymax = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1768 }
1769
1770 //////////////////////////////////////////////////////////////////////////
1771 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
1772 /// culling, viewport transform, etc.
1773 /// @param pDC - pointer to draw context.
1774 /// @param pa - The primitive assembly object.
1775 /// @param workerId - thread's worker id. Even thread has a unique id.
1776 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
1777 /// @param primID - Primitive ID for each triangle.
1778 /// @param viewportIdx - viewport array index for each triangle.
1779 /// @tparam CT - ConservativeRastFETraits
template <typename CT>
void BinTriangles(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector tri[3],
    uint32_t triMask,
    simdscalari primID,
    simdscalari viewportIdx)
{
    RDTSC_START(FEBinTriangles);

    const API_STATE& state = GetApiState(pDC);
    const SWR_RASTSTATE& rastState = state.rastState;
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;


    simdscalar vRecipW0 = _simd_set1_ps(1.0f);
    simdscalar vRecipW1 = _simd_set1_ps(1.0f);
    simdscalar vRecipW2 = _simd_set1_ps(1.0f);

    if (feState.vpTransformDisable)
    {
        // RHW is passed in directly when VP transform is disabled
        vRecipW0 = tri[0].v[3];
        vRecipW1 = tri[1].v[3];
        vRecipW2 = tri[2].v[3];
    }
    else
    {
        // Perspective divide: 1/w per vertex, then scale x/y/z by it
        vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
        vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
        vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);

        tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
        tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
        tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);

        tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
        tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
        tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);

        tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
        tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
        tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);

        // Viewport transform to screen space coords
        if (state.gsState.emitsViewportArrayIndex)
        {
            // per-lane viewport selection when the GS emits a VP array index
            viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
        }
        else
        {
            viewportTransform<3>(tri, state.vpMatrices);
        }
    }

    // Adjust for pixel center location
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    tri[0].x = _simd_add_ps(tri[0].x, offset);
    tri[0].y = _simd_add_ps(tri[0].y, offset);

    tri[1].x = _simd_add_ps(tri[1].x, offset);
    tri[1].y = _simd_add_ps(tri[1].y, offset);

    tri[2].x = _simd_add_ps(tri[2].x, offset);
    tri[2].y = _simd_add_ps(tri[2].y, offset);

    simdscalari vXi[3], vYi[3];
    // Set vXi, vYi to required fixed point precision
    FPToFixedPoint(tri, vXi, vYi);

    // triangle setup: edge A/B coefficients per triangle
    simdscalari vAi[3], vBi[3];
    triangleSetupABIntVertical(vXi, vYi, vAi, vBi);

    // determinant (signed area * 2); split across two registers of 64-bit lanes
    simdscalari vDet[2];
    calcDeterminantIntVertical(vAi, vBi, vDet);

    // cull zero area
    int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
    int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));

    int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));

    uint32_t origTriMask = triMask;
    // don't cull degenerate triangles if we're conservatively rasterizing
    if(!CT::IsConservativeT::value)
    {
        triMask &= ~cullZeroAreaMask;
    }

    // determine front winding tris
    // CW  +det
    // CCW det <= 0; 0 area triangles are marked as backfacing, which is required behavior for conservative rast
    maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
    maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
    int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) );

    uint32_t frontWindingTris;
    if (rastState.frontWinding == SWR_FRONTWINDING_CW)
    {
        frontWindingTris = cwTriMask;
    }
    else
    {
        frontWindingTris = ~cwTriMask;
    }

    // cull
    uint32_t cullTris;
    switch ((SWR_CULLMODE)rastState.cullMode)
    {
    case SWR_CULLMODE_BOTH:  cullTris = 0xffffffff; break;
    case SWR_CULLMODE_NONE:  cullTris = 0x0; break;
    case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
    // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
    case SWR_CULLMODE_BACK:  cullTris = ~frontWindingTris; break;
    default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
    }

    triMask &= ~cullTris;

    if (origTriMask ^ triMask)
    {
        RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
    }

    /// Note: these variable initializations must stay above any 'goto endBinTriangles'
    // compute per tri backface
    uint32_t frontFaceMask = frontWindingTris;
    uint32_t *pPrimID = (uint32_t *)&primID;
    const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
    DWORD triIndex = 0;
    // for center sample pattern, all samples are at pixel center; calculate coverage
    // once at center and broadcast the results in the backend
    const SWR_MULTISAMPLE_COUNT sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
    uint32_t edgeEnable;
    PFN_WORK_FUNC pfnWork;
    if(CT::IsConservativeT::value)
    {
        // determine which edges of the degenerate tri, if any, are valid to rasterize.
        // used to call the appropriate templated rasterizer function
        if(cullZeroAreaMask > 0)
        {
            // e0 = v1-v0; an edge is degenerate when both endpoints coincide
            simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]);
            simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]);
            uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask)));

            // e1 = v2-v1
            simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]);
            simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]);
            uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask)));

            // e2 = v0-v2
            // if v0 == v1 & v1 == v2, v0 == v2
            uint32_t e2Mask = e0Mask & e1Mask;
            SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");

            // scatter the per-lane edge bits into 3-bits-per-triangle positions via pdep
            // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
            // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
            e0Mask = pdep_u32(e0Mask, 0x00249249);
            // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
            e1Mask = pdep_u32(e1Mask, 0x00492492);
            // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
            e2Mask = pdep_u32(e2Mask, 0x00924924);

            // a set bit enables an edge; degenerate edges are masked off
            edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
        }
        else
        {
            edgeEnable = 0x00FFFFFF;
        }
    }
    else
    {
        // degenerate triangles won't be sent to rasterizer; just enable all edges
        pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
                                    (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID,
                                    (rastState.scissorEnable > 0));
    }

    if (!triMask)
    {
        goto endBinTriangles;
    }

    // Calc bounding box of triangles
    simdBBox bbox;
    calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);

    // determine if triangle falls between pixel centers and discard
    // only discard for non-MSAA case and when conservative rast is disabled
    // (xmin + 127) & ~255
    // (xmax + 128) & ~255
    if(rastState.sampleCount == SWR_MULTISAMPLE_1X && (!CT::IsConservativeT::value))
    {
        origTriMask = triMask;

        int cullCenterMask;
        {
            // round min up / max down to the nearest pixel center (x.8 fixed point);
            // equal results mean the bbox spans no pixel center
            simdscalari xmin = _simd_add_epi32(bbox.xmin, _simd_set1_epi32(127));
            xmin = _simd_and_si(xmin, _simd_set1_epi32(~255));
            simdscalari xmax = _simd_add_epi32(bbox.xmax, _simd_set1_epi32(128));
            xmax = _simd_and_si(xmax, _simd_set1_epi32(~255));

            simdscalari vMaskH = _simd_cmpeq_epi32(xmin, xmax);

            simdscalari ymin = _simd_add_epi32(bbox.ymin, _simd_set1_epi32(127));
            ymin = _simd_and_si(ymin, _simd_set1_epi32(~255));
            simdscalari ymax = _simd_add_epi32(bbox.ymax, _simd_set1_epi32(128));
            ymax = _simd_and_si(ymax, _simd_set1_epi32(~255));

            simdscalari vMaskV = _simd_cmpeq_epi32(ymin, ymax);
            vMaskV = _simd_or_si(vMaskH, vMaskV);
            cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
        }

        triMask &= ~cullCenterMask;

        if(origTriMask ^ triMask)
        {
            RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
        }
    }

    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
    // Gather the AOS effective scissor rects based on the per-prim VP index.
    /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
    simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
    if (state.gsState.emitsViewportArrayIndex)
    {
        GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
                                                scisXmin, scisYmin, scisXmax, scisYmax);
    }
    else // broadcast fast path for non-VPAI case.
    {
        scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
        scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
        scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
        scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
    }

    bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
    bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
    bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
    bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);

    if(CT::IsConservativeT::value)
    {
        // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
        // some area. Bump the xmax/ymax edges out
        simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.ymin, bbox.ymax);
        bbox.ymax = _simd_blendv_epi32(bbox.ymax, _simd_add_epi32(bbox.ymax, _simd_set1_epi32(1)), topEqualsBottom);
        simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.xmin, bbox.xmax);
        bbox.xmax = _simd_blendv_epi32(bbox.xmax, _simd_add_epi32(bbox.xmax, _simd_set1_epi32(1)), leftEqualsRight);
    }

    // Cull tris completely outside scissor
    {
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        triMask = triMask & ~maskOutsideScissor;
    }

    if (!triMask)
    {
        goto endBinTriangles;
    }

    // Convert triangle bbox to macrotile units.
    bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

    OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
    _simd_store_si((simdscalari*)aMTLeft,   bbox.xmin);
    _simd_store_si((simdscalari*)aMTRight,  bbox.xmax);
    _simd_store_si((simdscalari*)aMTTop,    bbox.ymin);
    _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);

    // transpose verts needed for backend
    /// @todo modify BE to take non-transformed verts
    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
    vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
    vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
    vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
    vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);

    // store render target array index
    OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
    {
        simdvector vRtai[3];
        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
        simdscalari vRtaii;
        vRtaii = _simd_castps_si(vRtai[0].x);
        _simd_store_si((simdscalari*)aRTAI, vRtaii);
    }
    else
    {
        _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
    }

    // scan remaining valid triangles and bin each separately
    while (_BitScanForward(&triIndex, triMask))
    {
        uint32_t linkageCount = state.backendState.numAttributes;
        uint32_t numScalarAttribs = linkageCount * 4;

        BE_WORK work;
        work.type = DRAW;

        bool isDegenerate;
        if(CT::IsConservativeT::value)
        {
            // only rasterize valid edges if we have a degenerate primitive
            int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
            work.pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
                                             (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable,
                                             (rastState.scissorEnable > 0));

            // Degenerate triangles are required to be constant interpolated
            isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
        }
        else
        {
            isDegenerate = false;
            work.pfnWork = pfnWork;
        }

        // Select attribute processor
        PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
            state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);

        TRIANGLE_WORK_DESC &desc = work.desc.tri;

        desc.triFlags.frontFacing            = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
        desc.triFlags.primID                 = pPrimID[triIndex];
        desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
        desc.triFlags.viewportIndex          = pViewportIndex[triIndex];

        auto pArena = pDC->pArena;
        SWR_ASSERT(pArena != nullptr);

        // store active attribs
        float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
        desc.pAttribs = pAttribs;
        desc.numAttribs = linkageCount;
        pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);

        // store triangle vertex data (x[3], y[3], z[3], 1/w[3] as 4-float rows)
        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);

        _mm_store_ps(&desc.pTriBuffer[0],  vHorizX[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[4],  vHorizY[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[8],  vHorizZ[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);

        // store user clip distances
        if (rastState.clipDistanceMask)
        {
            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
            ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
        }

        // enqueue the triangle to every macrotile its bbox touches
        for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
        {
            for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
            {
#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_SETUP_TRIS)
#endif
                {
                    pTileMgr->enqueue(x, y, &work);
                }
            }
        }
        triMask &= ~(1 << triIndex);
    }

endBinTriangles:
    RDTSC_STOP(FEBinTriangles, 1, 0);
}
2173
2174 struct FEBinTrianglesChooser
2175 {
2176 typedef PFN_PROCESS_PRIMS FuncType;
2177
2178 template <typename... ArgsB>
2179 static FuncType GetFunc()
2180 {
2181 return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
2182 }
2183 };
2184
2185 // Selector for correct templated BinTrinagles function
2186 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
2187 {
2188 return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
2189 }
2190
2191 //////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD points to the backend. Only supports point size of 1
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param prim - Contains point position data for SIMDs worth of points.
/// @param primID - Primitive ID for each point.
/// @param viewportIdx - Viewport Array Index for each point.
void BinPoints(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector prim[3],
    uint32_t primMask,
    simdscalari primID,
    simdscalari viewportIdx)
{
    RDTSC_START(FEBinPoints);

    // a point primitive has a single vertex
    simdvector& primVerts = prim[0];

    const API_STATE& state = GetApiState(pDC);
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;
    const SWR_RASTSTATE& rastState = state.rastState;
    const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;

    // Select attribute processor
    PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
        state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);

    if (!feState.vpTransformDisable)
    {
        // perspective divide
        simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
        primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
        primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
        primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);

        // viewport transform to screen coords
        if (state.gsState.emitsViewportArrayIndex)
        {
            viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
        }
        else
        {
            viewportTransform<1>(&primVerts, state.vpMatrices);
        }
    }

    // adjust for pixel center location
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    primVerts.x = _simd_add_ps(primVerts.x, offset);
    primVerts.y = _simd_add_ps(primVerts.y, offset);

    // convert to fixed point
    simdscalari vXi, vYi;
    vXi = fpToFixedPointVertical(primVerts.x);
    vYi = fpToFixedPointVertical(primVerts.y);

    // fast path for 1-pixel points: each point fits in a single raster tile
    if (CanUseSimplePoints(pDC))
    {
        // adjust for ymin-xmin rule
        vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
        vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));

        // cull points off the ymin-xmin edge of the viewport
        // (negative fixed-point coords have the sign bit set, which movemask extracts)
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));

        // compute macro tile coordinates
        simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMacroX, macroX);
        _simd_store_si((simdscalari*)aMacroY, macroY);

        // compute raster tile coordinates
        simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
        simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);

        // compute raster tile relative x,y for coverage mask
        simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
        simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);

        simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
        simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);

        OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
        _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);

        OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
        _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);

        OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aZ, primVerts.z);

        // store render target array index
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simdvector vRtai;
            pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai.x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        uint32_t *pPrimID = (uint32_t *)&primID;
        DWORD primIndex = 0;

        const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;

        // scan remaining valid points and bin each separately
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = backendState.numAttributes;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing            = 1;
            desc.triFlags.primID                 = pPrimID[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
            desc.triFlags.viewportIndex          = pViewportIndex[primIndex];

            work.pfnWork = RasterizeSimplePoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store attributes
            float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
            desc.pAttribs = pAttribs;
            desc.numAttribs = linkageCount;

            pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);

            // store raster tile aligned x, y, perspective correct z
            // NOTE: x/y are stored as raw uint32_t bits in the float buffer;
            // the rasterizer reinterprets them on the other side
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
            *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
            *pTriBuffer = aZ[primIndex];

            uint32_t tX = aTileRelativeX[primIndex];
            uint32_t tY = aTileRelativeY[primIndex];

            // pack the relative x,y into the coverageMask, the rasterizer will
            // generate the true coverage mask from it
            work.desc.tri.triFlags.coverageMask = tX | (tY << 4);

            // bin it
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
#if KNOB_ENABLE_TOSS_POINTS
            if (!KNOB_TOSS_SETUP_TRIS)
#endif
            {
                pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
            }
            primMask &= ~(1 << primIndex);
        }
    }
    else
    {
        // non simple points need to be potentially binned to multiple macro tiles
        simdscalar vPointSize;
        if (rastState.pointParam)
        {
            // per-vertex point size supplied by the shader
            simdvector size[3];
            pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
            vPointSize = size[0].x;
        }
        else
        {
            // constant point size from raster state
            vPointSize = _simd_set1_ps(rastState.pointSize);
        }

        // bloat point to bbox
        simdBBox bbox;
        bbox.xmin = bbox.xmax = vXi;
        bbox.ymin = bbox.ymax = vYi;

        simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
        simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
        bbox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
        bbox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
        bbox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
        bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);

        // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
        // Gather the AOS effective scissor rects based on the per-prim VP index.
        /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
        simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
        if (state.gsState.emitsViewportArrayIndex)
        {
            GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
                                                    scisXmin, scisYmin, scisXmax, scisYmax);
        }
        else // broadcast fast path for non-VPAI case.
        {
            scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
            scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
            scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
            scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
        }

        bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
        bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
        bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
        bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);

        // Cull bloated points completely outside scissor
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        primMask = primMask & ~maskOutsideScissor;

        // Convert bbox to macrotile units.
        bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
        bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMTLeft,   bbox.xmin);
        _simd_store_si((simdscalari*)aMTRight,  bbox.xmax);
        _simd_store_si((simdscalari*)aMTTop,    bbox.ymin);
        _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);

        // store render target array index
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simdvector vRtai[2];
            pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aPointSize, vPointSize);

        uint32_t *pPrimID = (uint32_t *)&primID;

        OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];

        _simd_store_ps((float*)aPrimVertsX, primVerts.x);
        _simd_store_ps((float*)aPrimVertsY, primVerts.y);
        _simd_store_ps((float*)aPrimVertsZ, primVerts.z);

        // scan remaining valid prims and bin each separately
        const SWR_BACKEND_STATE& backendState = state.backendState;
        DWORD primIndex;
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = backendState.numAttributes;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            desc.triFlags.frontFacing            = 1;
            desc.triFlags.primID                 = pPrimID[primIndex];
            desc.triFlags.pointSize              = aPointSize[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
            desc.triFlags.viewportIndex          = pViewportIndex[primIndex];

            work.pfnWork = RasterizeTriPoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store active attribs
            desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
            desc.numAttribs = linkageCount;
            pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);

            // store point vertex data
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *pTriBuffer++ = aPrimVertsX[primIndex];
            *pTriBuffer++ = aPrimVertsY[primIndex];
            *pTriBuffer = aPrimVertsZ[primIndex];

            // store user clip distances
            if (rastState.clipDistanceMask)
            {
                uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
                desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
                ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
            }

            // enqueue the point to every macrotile its bloated bbox touches
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
            for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
            {
                for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_SETUP_TRIS)
#endif
                    {
                        pTileMgr->enqueue(x, y, &work);
                    }
                }
            }

            primMask &= ~(1 << primIndex);
        }
    }




    RDTSC_STOP(FEBinPoints, 1, 0);
}
2527
2528 //////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD lines to the backend.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param prim - Contains line position data for SIMDs worth of lines.
/// @param primID - Primitive ID for each line.
/// @param viewportIdx - Viewport Array Index for each line.
2536 void BinLines(
2537 DRAW_CONTEXT *pDC,
2538 PA_STATE& pa,
2539 uint32_t workerId,
2540 simdvector prim[],
2541 uint32_t primMask,
2542 simdscalari primID,
2543 simdscalari viewportIdx)
2544 {
2545 RDTSC_START(FEBinLines);
2546
2547 const API_STATE& state = GetApiState(pDC);
2548 const SWR_RASTSTATE& rastState = state.rastState;
2549 const SWR_FRONTEND_STATE& feState = state.frontendState;
2550 const SWR_GS_STATE& gsState = state.gsState;
2551
2552 // Select attribute processor
2553 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
2554 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
2555
2556 simdscalar vRecipW0 = _simd_set1_ps(1.0f);
2557 simdscalar vRecipW1 = _simd_set1_ps(1.0f);
2558
2559 if (!feState.vpTransformDisable)
2560 {
2561 // perspective divide
2562 vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
2563 vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);
2564
2565 prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0);
2566 prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1);
2567
2568 prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0);
2569 prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1);
2570
2571 prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0);
2572 prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1);
2573
2574 // viewport transform to screen coords
2575 if (state.gsState.emitsViewportArrayIndex)
2576 {
2577 viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
2578 }
2579 else
2580 {
2581 viewportTransform<2>(prim, state.vpMatrices);
2582 }
2583 }
2584
2585 // adjust for pixel center location
2586 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
2587 prim[0].x = _simd_add_ps(prim[0].x, offset);
2588 prim[0].y = _simd_add_ps(prim[0].y, offset);
2589
2590 prim[1].x = _simd_add_ps(prim[1].x, offset);
2591 prim[1].y = _simd_add_ps(prim[1].y, offset);
2592
2593 // convert to fixed point
2594 simdscalari vXi[2], vYi[2];
2595 vXi[0] = fpToFixedPointVertical(prim[0].x);
2596 vYi[0] = fpToFixedPointVertical(prim[0].y);
2597 vXi[1] = fpToFixedPointVertical(prim[1].x);
2598 vYi[1] = fpToFixedPointVertical(prim[1].y);
2599
2600 // compute x-major vs y-major mask
2601 simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
// NOTE(review): this is the tail of the SIMD line-binning routine (BinLines);
// the prologue (vXi/vYi endpoint setup, xLength, primMask, vRecipW0/1,
// viewportIdx, rastState/gsState locals) is above this excerpt.
// A lane is "y-major" when |dy| > |dx|; movemask packs one sign bit per lane.
2602 simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
2603 simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
2604 uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);
2605
2606 // cull zero-length lines
// Both |dx| == 0 AND |dy| == 0: degenerate line, drop its lane from primMask.
2607 simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
2608 vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));
2609
2610 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
2611
// Scalar views of per-lane primitive IDs / viewport indices for the bin loop below.
2612 uint32_t *pPrimID = (uint32_t *)&primID;
2613 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
2614
// Zero filler passed as the (nonexistent) third vertex to the 3x8 transposes below.
2615 simdscalar vUnused = _simd_setzero_ps();
2616
2617 // Calc bounding box of lines
// Per-lane min/max of the two endpoints, in x.8 fixed-point screen space.
2618 simdBBox bbox;
2619 bbox.xmin = _simd_min_epi32(vXi[0], vXi[1]);
2620 bbox.xmax = _simd_max_epi32(vXi[0], vXi[1]);
2621 bbox.ymin = _simd_min_epi32(vYi[0], vYi[1]);
2622 bbox.ymax = _simd_max_epi32(vYi[0], vYi[1]);
2623
2624 // bloat bbox by line width along minor axis
// Expand by half the line width in fixed point; the blends below apply the
// bloat only along each lane's minor axis (x for y-major lanes, y for x-major).
2625 simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
2626 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2627 simdBBox bloatBox;
2628 bloatBox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
2629 bloatBox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
2630 bloatBox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
2631 bloatBox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
2632
// blendv selects the bloated extents where vYmajorMask is set:
// y-major lanes widen in x; x-major lanes (mask clear) widen in y.
2633 bbox.xmin = _simd_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
2634 bbox.xmax = _simd_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
2635 bbox.ymin = _simd_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
2636 bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
2637
2638 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2639 simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
// When the GS writes a viewport array index, gather each lane's scissor rect;
// otherwise broadcast scissor 0 to all lanes (common fast path).
2640 if (state.gsState.emitsViewportArrayIndex)
2641 {
2642 GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
2643 scisXmin, scisYmin, scisXmax, scisYmax);
2644 }
2645 else // broadcast fast path for non-VPAI case.
2646 {
2647 scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
2648 scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
2649 scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
2650 scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
2651 }
2652
2653 bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
2654 bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
2655 bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
2656 bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
2657
2658 // Cull prims completely outside scissor
// An inverted bbox (min > max on either axis) means no overlap; clear the lane.
2659 {
2660 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
2661 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
2662 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
2663 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
2664 primMask = primMask & ~maskOutsideScissor;
2665 }
2666
// Early out: every line was culled; skip straight to the timing epilogue.
2667 if (!primMask)
2668 {
2669 goto endBinLines;
2670 }
2671
2672 // Convert triangle bbox to macrotile units.
// Arithmetic shift turns fixed-point screen coords into macrotile indices.
2673 bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2674 bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2675 bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2676 bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2677
// Spill the per-lane macrotile extents to scalar arrays for the bin loop below.
2678 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
2679 _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
2680 _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
2681 _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
2682 _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
2683
2684 // transpose verts needed for backend
2685 /// @todo modify BE to take non-transformed verts
// SoA -> AoS: one __m128 per primitive holding (v0, v1, unused) per component.
// vUnused zero-fills the third slot since a line has only two vertices.
2686 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
2687 vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
2688 vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
2689 vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
2690 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);
2691
2692 // store render target array index
// Pulled from the GS-written RTAI vertex slot when enabled, else zeroed.
2693 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2694 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2695 {
2696 simdvector vRtai[2];
2697 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
2698 simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
2699 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2700 }
2701 else
2702 {
2703 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2704 }
2705
2706 // scan remaining valid prims and bin each separately
2707 DWORD primIndex;
2708 while (_BitScanForward(&primIndex, primMask))
2709 {
2710 uint32_t linkageCount = state.backendState.numAttributes;
2711 uint32_t numScalarAttribs = linkageCount * 4;
2712
// Lines reuse the triangle work descriptor; the backend work function is
// RasterizeLine (set below) rather than the triangle rasterizer.
2713 BE_WORK work;
2714 work.type = DRAW;
2715
2716 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2717
2718 desc.triFlags.frontFacing = 1;
2719 desc.triFlags.primID = pPrimID[primIndex];
2720 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
2721 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2722 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
2723
2724 work.pfnWork = RasterizeLine;
2725
2726 auto pArena = pDC->pArena;
2727 SWR_ASSERT(pArena != nullptr);
2728
2729 // store active attribs
// NOTE(review): sized for 3 vertices of 4 floats per attribute — presumably
// the backend consumes a triangle-shaped attribute layout even for lines;
// confirm against pfnProcessAttribs / RasterizeLine.
2730 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2731 desc.numAttribs = linkageCount;
2732 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
2733
2734 // store line vertex data
// 4x4 floats: x, y, z, w planes (each an __m128 lane-transpose of the verts).
2735 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
2736 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
2737 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
2738 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
2739 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
2740
2741 // store user clip distances
// 2 values per enabled clip distance — one per line endpoint.
2742 if (rastState.clipDistanceMask)
2743 {
2744 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2745 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
2746 ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
2747 }
2748
// Enqueue the work item into every macrotile the line's bbox touches
// (inclusive ranges; the max edge already had 1 ULP subtracted above).
2749 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2750 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2751 {
2752 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2753 {
2754 #if KNOB_ENABLE_TOSS_POINTS
// Perf-testing knob: optionally toss work instead of enqueueing it.
2755 if (!KNOB_TOSS_SETUP_TRIS)
2756 #endif
2757 {
2758 pTileMgr->enqueue(x, y, &work);
2759 }
2760 }
2761 }
2762
// Clear this lane and continue with the next surviving primitive.
2763 primMask &= ~(1 << primIndex);
2764 }
2765
2766 endBinLines:
2767
2768 RDTSC_STOP(FEBinLines, 1, 0);
2769 }