swr/rast: simdlib cleanup, clipper stack space fixes
src/gallium/drivers/swr/rasterizer/core/frontend.cpp (mesa.git)
1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "utils.h"
36 #include "threads.h"
37 #include "pa.h"
38 #include "clip.h"
39 #include "tilemgr.h"
40 #include "tessellator.h"
41 #include <limits>
42 #include <iostream>
43
44 //////////////////////////////////////////////////////////////////////////
45 /// @brief Helper macro to generate a bitmask
46 static INLINE uint32_t GenMask(uint32_t numBits)
47 {
48 SWR_ASSERT(
49 numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
50 return ((1U << numBits) - 1);
51 }
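// Example (editor's note): GenMask builds a contiguous low-bit mask, e.g.
// GenMask(0) == 0x0, GenMask(3) == 0b111, GenMask(8) == 0xFF. Note that
// GenMask(32) passes the assert but shifts 1U by the full operand width,
// which is undefined behavior in C++; callers here stay well below 32 bits.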
52
53 //////////////////////////////////////////////////////////////////////////
54 /// @brief FE handler for SwrSync.
55 /// @param pContext - pointer to SWR context.
56 /// @param pDC - pointer to draw context.
57 /// @param workerId - thread's worker id. Every thread has a unique id.
58 /// @param pUserData - Pointer to user data passed back to sync callback.
59 /// @todo This should go away when we switch this to use compute threading.
60 void ProcessSync(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
61 {
62 BE_WORK work;
63 work.type = SYNC;
64 work.pfnWork = ProcessSyncBE;
65
66 MacroTileMgr* pTileMgr = pDC->pTileMgr;
67 pTileMgr->enqueue(0, 0, &work);
68 }
69
70 //////////////////////////////////////////////////////////////////////////
71 /// @brief FE handler for SwrDestroyContext.
72 /// @param pContext - pointer to SWR context.
73 /// @param pDC - pointer to draw context.
74 /// @param workerId - thread's worker id. Every thread has a unique id.
75 /// @param pUserData - Pointer to user data passed back to sync callback.
76 void ProcessShutdown(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
77 {
78 BE_WORK work;
79 work.type = SHUTDOWN;
80 work.pfnWork = ProcessShutdownBE;
81
82 MacroTileMgr* pTileMgr = pDC->pTileMgr;
83 // Enqueue at least 1 work item for each worker thread,
84 // accounting for the number of NUMA nodes
85 uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;
86
87 for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
88 {
89 for (uint32_t n = 0; n < numNumaNodes; ++n)
90 {
91 pTileMgr->enqueue(i, n, &work);
92 }
93 }
94 }
95
96 //////////////////////////////////////////////////////////////////////////
97 /// @brief FE handler for SwrClearRenderTarget.
98 /// @param pContext - pointer to SWR context.
99 /// @param pDC - pointer to draw context.
100 /// @param workerId - thread's worker id. Every thread has a unique id.
101 /// @param pUserData - Pointer to user data passed back to clear callback.
102 /// @todo This should go away when we switch this to use compute threading.
103 void ProcessClear(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
104 {
105 CLEAR_DESC* pDesc = (CLEAR_DESC*)pUserData;
106 MacroTileMgr* pTileMgr = pDC->pTileMgr;
107
108 // queue a clear to each macro tile
109 // compute macro tile bounds for the specified rect
110 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
111 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
112 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
113 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
114
115 BE_WORK work;
116 work.type = CLEAR;
117 work.pfnWork = ProcessClearBE;
118 work.desc.clear = *pDesc;
119
120 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
121 {
122 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
123 {
124 pTileMgr->enqueue(x, y, &work);
125 }
126 }
127 }
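// Worked example (editor's note; the macro tile dims are build-time knobs,
// 32x32 assumed here): a clear rect of (xmin=8, ymin=8, xmax=72, ymax=40) yields
//   macroTileXMin = 8/32 = 0,  macroTileXMax = (72-1)/32 = 2,
//   macroTileYMin = 8/32 = 0,  macroTileYMax = (40-1)/32 = 1,
// enqueueing the clear to the 3x2 grid of touched macro tiles. The "-1" keeps
// an exclusive max edge that lands exactly on a tile boundary from enqueueing
// one extra tile.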
128
129 //////////////////////////////////////////////////////////////////////////
130 /// @brief FE handler for SwrStoreTiles.
131 /// @param pContext - pointer to SWR context.
132 /// @param pDC - pointer to draw context.
133 /// @param workerId - thread's worker id. Every thread has a unique id.
134 /// @param pUserData - Pointer to user data passed back to callback.
135 /// @todo This should go away when we switch this to use compute threading.
136 void ProcessStoreTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
137 {
138 RDTSC_BEGIN(FEProcessStoreTiles, pDC->drawId);
139 MacroTileMgr* pTileMgr = pDC->pTileMgr;
140 STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
141
142 // queue a store to each macro tile
143 // compute macro tile bounds for the specified rect
144 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
145 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
146 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
147 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
148
149 // store tiles
150 BE_WORK work;
151 work.type = STORETILES;
152 work.pfnWork = ProcessStoreTilesBE;
153 work.desc.storeTiles = *pDesc;
154
155 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
156 {
157 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
158 {
159 pTileMgr->enqueue(x, y, &work);
160 }
161 }
162
163 RDTSC_END(FEProcessStoreTiles, 0);
164 }
165
166 //////////////////////////////////////////////////////////////////////////
167 /// @brief FE handler for SwrInvalidateTiles.
168 /// @param pContext - pointer to SWR context.
169 /// @param pDC - pointer to draw context.
170 /// @param workerId - thread's worker id. Every thread has a unique id.
171 /// @param pUserData - Pointer to user data passed back to callback.
172 /// @todo This should go away when we switch this to use compute threading.
173 void ProcessDiscardInvalidateTiles(SWR_CONTEXT* pContext,
174 DRAW_CONTEXT* pDC,
175 uint32_t workerId,
176 void* pUserData)
177 {
178 RDTSC_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
179 DISCARD_INVALIDATE_TILES_DESC* pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
180 MacroTileMgr* pTileMgr = pDC->pTileMgr;
181
182 // compute macro tile bounds for the specified rect
183 uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
184 uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
185 uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
186 uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;
187
188 if (pDesc->fullTilesOnly == false)
189 {
190 // include partial tiles
191 macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
192 macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
193 macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
194 macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
195 }
196
197 SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
198 SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);
199
200 macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
201 macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);
202
203 // load tiles
204 BE_WORK work;
205 work.type = DISCARDINVALIDATETILES;
206 work.pfnWork = ProcessDiscardInvalidateTilesBE;
207 work.desc.discardInvalidateTiles = *pDesc;
208
209 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
210 {
211 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
212 {
213 pTileMgr->enqueue(x, y, &work);
214 }
215 }
216
217 RDTSC_END(FEProcessInvalidateTiles, 0);
218 }
219
220 //////////////////////////////////////////////////////////////////////////
221 /// @brief Computes the number of primitives given the number of verts.
222 /// @param mode - primitive topology for draw operation.
223 /// @param numPrims - number of vertices or indices for draw.
224 /// @todo Frontend needs to be refactored. This will go in the appropriate place then.
225 uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
226 {
227 switch (mode)
228 {
229 case TOP_POINT_LIST:
230 return numPrims;
231 case TOP_TRIANGLE_LIST:
232 return numPrims / 3;
233 case TOP_TRIANGLE_STRIP:
234 return numPrims < 3 ? 0 : numPrims - 2;
235 case TOP_TRIANGLE_FAN:
236 return numPrims < 3 ? 0 : numPrims - 2;
237 case TOP_TRIANGLE_DISC:
238 return numPrims < 2 ? 0 : numPrims - 1;
239 case TOP_QUAD_LIST:
240 return numPrims / 4;
241 case TOP_QUAD_STRIP:
242 return numPrims < 4 ? 0 : (numPrims - 2) / 2;
243 case TOP_LINE_STRIP:
244 return numPrims < 2 ? 0 : numPrims - 1;
245 case TOP_LINE_LIST:
246 return numPrims / 2;
247 case TOP_LINE_LOOP:
248 return numPrims;
249 case TOP_RECT_LIST:
250 return numPrims / 3;
251 case TOP_LINE_LIST_ADJ:
252 return numPrims / 4;
253 case TOP_LISTSTRIP_ADJ:
254 return numPrims < 3 ? 0 : numPrims - 3;
255 case TOP_TRI_LIST_ADJ:
256 return numPrims / 6;
257 case TOP_TRI_STRIP_ADJ:
258 return numPrims < 4 ? 0 : (numPrims / 2) - 2;
259
260 case TOP_PATCHLIST_1:
261 case TOP_PATCHLIST_2:
262 case TOP_PATCHLIST_3:
263 case TOP_PATCHLIST_4:
264 case TOP_PATCHLIST_5:
265 case TOP_PATCHLIST_6:
266 case TOP_PATCHLIST_7:
267 case TOP_PATCHLIST_8:
268 case TOP_PATCHLIST_9:
269 case TOP_PATCHLIST_10:
270 case TOP_PATCHLIST_11:
271 case TOP_PATCHLIST_12:
272 case TOP_PATCHLIST_13:
273 case TOP_PATCHLIST_14:
274 case TOP_PATCHLIST_15:
275 case TOP_PATCHLIST_16:
276 case TOP_PATCHLIST_17:
277 case TOP_PATCHLIST_18:
278 case TOP_PATCHLIST_19:
279 case TOP_PATCHLIST_20:
280 case TOP_PATCHLIST_21:
281 case TOP_PATCHLIST_22:
282 case TOP_PATCHLIST_23:
283 case TOP_PATCHLIST_24:
284 case TOP_PATCHLIST_25:
285 case TOP_PATCHLIST_26:
286 case TOP_PATCHLIST_27:
287 case TOP_PATCHLIST_28:
288 case TOP_PATCHLIST_29:
289 case TOP_PATCHLIST_30:
290 case TOP_PATCHLIST_31:
291 case TOP_PATCHLIST_32:
292 return numPrims / (mode - TOP_PATCHLIST_BASE);
293
294 case TOP_POLYGON:
295 case TOP_POINT_LIST_BF:
296 case TOP_LINE_STRIP_CONT:
297 case TOP_LINE_STRIP_BF:
298 case TOP_LINE_STRIP_CONT_BF:
299 case TOP_TRIANGLE_FAN_NOSTIPPLE:
300 case TOP_TRI_STRIP_REVERSE:
301 case TOP_PATCHLIST_BASE:
302 case TOP_UNKNOWN:
303 SWR_INVALID("Unsupported topology: %d", mode);
304 return 0;
305 }
306
307 return 0;
308 }
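// Worked examples (editor's note), for a 7-vertex draw:
//   TOP_TRIANGLE_LIST  -> 7 / 3 = 2 prims (trailing vertex pruned),
//   TOP_TRIANGLE_STRIP -> 7 - 2 = 5 prims,
//   TOP_LINE_STRIP     -> 7 - 1 = 6 prims,
//   TOP_PATCHLIST_4    -> 7 / 4 = 1 patch.
// As the @param note says, numPrims is actually a vertex/index count here.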
309
310 //////////////////////////////////////////////////////////////////////////
311 /// @brief Computes the number of verts given the number of primitives.
312 /// @param mode - primitive topology for draw operation.
313 /// @param numPrims - number of primitives for draw.
314 uint32_t GetNumVerts(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
315 {
316 switch (mode)
317 {
318 case TOP_POINT_LIST:
319 return numPrims;
320 case TOP_TRIANGLE_LIST:
321 return numPrims * 3;
322 case TOP_TRIANGLE_STRIP:
323 return numPrims ? numPrims + 2 : 0;
324 case TOP_TRIANGLE_FAN:
325 return numPrims ? numPrims + 2 : 0;
326 case TOP_TRIANGLE_DISC:
327 return numPrims ? numPrims + 1 : 0;
328 case TOP_QUAD_LIST:
329 return numPrims * 4;
330 case TOP_QUAD_STRIP:
331 return numPrims ? numPrims * 2 + 2 : 0;
332 case TOP_LINE_STRIP:
333 return numPrims ? numPrims + 1 : 0;
334 case TOP_LINE_LIST:
335 return numPrims * 2;
336 case TOP_LINE_LOOP:
337 return numPrims;
338 case TOP_RECT_LIST:
339 return numPrims * 3;
340 case TOP_LINE_LIST_ADJ:
341 return numPrims * 4;
342 case TOP_LISTSTRIP_ADJ:
343 return numPrims ? numPrims + 3 : 0;
344 case TOP_TRI_LIST_ADJ:
345 return numPrims * 6;
346 case TOP_TRI_STRIP_ADJ:
347 return numPrims ? (numPrims + 2) * 2 : 0;
348
349 case TOP_PATCHLIST_1:
350 case TOP_PATCHLIST_2:
351 case TOP_PATCHLIST_3:
352 case TOP_PATCHLIST_4:
353 case TOP_PATCHLIST_5:
354 case TOP_PATCHLIST_6:
355 case TOP_PATCHLIST_7:
356 case TOP_PATCHLIST_8:
357 case TOP_PATCHLIST_9:
358 case TOP_PATCHLIST_10:
359 case TOP_PATCHLIST_11:
360 case TOP_PATCHLIST_12:
361 case TOP_PATCHLIST_13:
362 case TOP_PATCHLIST_14:
363 case TOP_PATCHLIST_15:
364 case TOP_PATCHLIST_16:
365 case TOP_PATCHLIST_17:
366 case TOP_PATCHLIST_18:
367 case TOP_PATCHLIST_19:
368 case TOP_PATCHLIST_20:
369 case TOP_PATCHLIST_21:
370 case TOP_PATCHLIST_22:
371 case TOP_PATCHLIST_23:
372 case TOP_PATCHLIST_24:
373 case TOP_PATCHLIST_25:
374 case TOP_PATCHLIST_26:
375 case TOP_PATCHLIST_27:
376 case TOP_PATCHLIST_28:
377 case TOP_PATCHLIST_29:
378 case TOP_PATCHLIST_30:
379 case TOP_PATCHLIST_31:
380 case TOP_PATCHLIST_32:
381 return numPrims * (mode - TOP_PATCHLIST_BASE);
382
383 case TOP_POLYGON:
384 case TOP_POINT_LIST_BF:
385 case TOP_LINE_STRIP_CONT:
386 case TOP_LINE_STRIP_BF:
387 case TOP_LINE_STRIP_CONT_BF:
388 case TOP_TRIANGLE_FAN_NOSTIPPLE:
389 case TOP_TRI_STRIP_REVERSE:
390 case TOP_PATCHLIST_BASE:
391 case TOP_UNKNOWN:
392 SWR_INVALID("Unsupported topology: %d", mode);
393 return 0;
394 }
395
396 return 0;
397 }
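// Editor's note: GetNumVerts inverts GetNumPrims for whole primitives, e.g.
// GetNumVerts(TOP_TRIANGLE_STRIP, GetNumPrims(TOP_TRIANGLE_STRIP, 7)) == 7,
// while list topologies round down (7 tri-list verts -> 2 prims -> 6 verts).
// The non-indexed path in ProcessDraw composes the two exactly this way to
// prune trailing partial primitives.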
398
399 //////////////////////////////////////////////////////////////////////////
400 /// @brief Return number of verts per primitive.
401 /// @param topology - topology
402 /// @param includeAdjVerts - include adjacent verts in primitive vertices
403 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
404 {
405 uint32_t numVerts = 0;
406 switch (topology)
407 {
408 case TOP_POINT_LIST:
409 case TOP_POINT_LIST_BF:
410 numVerts = 1;
411 break;
412 case TOP_LINE_LIST:
413 case TOP_LINE_STRIP:
414 case TOP_LINE_LIST_ADJ:
415 case TOP_LINE_LOOP:
416 case TOP_LINE_STRIP_CONT:
417 case TOP_LINE_STRIP_BF:
418 case TOP_LISTSTRIP_ADJ:
419 numVerts = 2;
420 break;
421 case TOP_TRIANGLE_LIST:
422 case TOP_TRIANGLE_STRIP:
423 case TOP_TRIANGLE_FAN:
424 case TOP_TRI_LIST_ADJ:
425 case TOP_TRI_STRIP_ADJ:
426 case TOP_TRI_STRIP_REVERSE:
427 case TOP_RECT_LIST:
428 numVerts = 3;
429 break;
430 case TOP_QUAD_LIST:
431 case TOP_QUAD_STRIP:
432 numVerts = 4;
433 break;
434 case TOP_PATCHLIST_1:
435 case TOP_PATCHLIST_2:
436 case TOP_PATCHLIST_3:
437 case TOP_PATCHLIST_4:
438 case TOP_PATCHLIST_5:
439 case TOP_PATCHLIST_6:
440 case TOP_PATCHLIST_7:
441 case TOP_PATCHLIST_8:
442 case TOP_PATCHLIST_9:
443 case TOP_PATCHLIST_10:
444 case TOP_PATCHLIST_11:
445 case TOP_PATCHLIST_12:
446 case TOP_PATCHLIST_13:
447 case TOP_PATCHLIST_14:
448 case TOP_PATCHLIST_15:
449 case TOP_PATCHLIST_16:
450 case TOP_PATCHLIST_17:
451 case TOP_PATCHLIST_18:
452 case TOP_PATCHLIST_19:
453 case TOP_PATCHLIST_20:
454 case TOP_PATCHLIST_21:
455 case TOP_PATCHLIST_22:
456 case TOP_PATCHLIST_23:
457 case TOP_PATCHLIST_24:
458 case TOP_PATCHLIST_25:
459 case TOP_PATCHLIST_26:
460 case TOP_PATCHLIST_27:
461 case TOP_PATCHLIST_28:
462 case TOP_PATCHLIST_29:
463 case TOP_PATCHLIST_30:
464 case TOP_PATCHLIST_31:
465 case TOP_PATCHLIST_32:
466 numVerts = topology - TOP_PATCHLIST_BASE;
467 break;
468 default:
469 SWR_INVALID("Unsupported topology: %d", topology);
470 break;
471 }
472
473 if (includeAdjVerts)
474 {
475 switch (topology)
476 {
477 case TOP_LISTSTRIP_ADJ:
478 case TOP_LINE_LIST_ADJ:
479 numVerts = 4;
480 break;
481 case TOP_TRI_STRIP_ADJ:
482 case TOP_TRI_LIST_ADJ:
483 numVerts = 6;
484 break;
485 default:
486 break;
487 }
488 }
489
490 return numVerts;
491 }
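// Example (editor's note): NumVertsPerPrim(TOP_TRI_LIST_ADJ, false) == 3,
// just the assembled triangle, while NumVertsPerPrim(TOP_TRI_LIST_ADJ, true)
// == 6, counting the adjacent vertices a geometry shader can read.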
492
493 //////////////////////////////////////////////////////////////////////////
494 /// @brief Generate mask from remaining work.
495 /// @param numWorkItems - Number of items being worked on by a SIMD.
496 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
497 {
498 uint32_t numActive =
499 (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
500 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
501 return _simd_castps_si(_simd_vmask_ps(mask));
502 }
503
504 static INLINE simd16scalari GenerateMask16(uint32_t numItemsRemaining)
505 {
506 uint32_t numActive =
507 (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining;
508 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
509 return _simd16_castps_si(_simd16_vmask_ps(mask));
510 }
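// Example (editor's note): with an 8-wide SIMD, GenerateMask(3) activates
// lanes 0-2 (mask 0b00000111) and GenerateMask(11) saturates to all eight
// lanes (0xFF); GenerateMask16 does the same for sixteen lanes.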
511
512 //////////////////////////////////////////////////////////////////////////
513 /// @brief StreamOut - Streams vertex data out to SO buffers.
514 /// Generally, we are only streaming out a SIMD's worth of triangles.
515 /// @param pDC - pointer to draw context.
516 /// @param workerId - thread's worker id. Every thread has a unique id.
517 /// @param numPrims - Number of prims to stream out (e.g. points, lines, tris)
518 static void StreamOut(
519 DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t workerId, uint32_t* pPrimData, uint32_t streamIndex)
520 {
521 RDTSC_BEGIN(FEStreamout, pDC->drawId);
522
523 const API_STATE& state = GetApiState(pDC);
524 const SWR_STREAMOUT_STATE& soState = state.soState;
525
526 uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
527
528 // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each
529 // vertex.
530 uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);
531
532 SWR_STREAMOUT_CONTEXT soContext = {0};
533
534 // Setup buffer state pointers.
535 for (uint32_t i = 0; i < 4; ++i)
536 {
537 soContext.pBuffer[i] = &state.soBuffer[i];
538 }
539
540 uint32_t numPrims = pa.NumPrims();
541
542 for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
543 {
544 DWORD slot = 0;
545 uint64_t soMask = soState.streamMasks[streamIndex];
546
547 // Write all entries into primitive data buffer for SOS.
548 while (_BitScanForward64(&slot, soMask))
549 {
550 simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
551 uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
552 pa.AssembleSingle(paSlot, primIndex, attrib);
553
554 // Attribute offset is relative offset from start of vertex.
555 // Note that attributes start at slot 1 in the PA buffer for DX, which
556 // would write prim data starting at slot 0 via (slot - 1).
557 // GL works slightly differently and needs slot 0, which is what this code uses.
558 uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);
559
560 // Store each vertex's attrib at appropriate locations in pPrimData buffer.
561 for (uint32_t v = 0; v < soVertsPerPrim; ++v)
562 {
563 uint32_t* pPrimDataAttrib =
564 pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
565
566 _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
567 }
568
569 soMask &= ~(uint64_t(1) << slot);
570 }
571
572 // Update pPrimData pointer
573 soContext.pPrimData = pPrimData;
574
575 // Call SOS
576 SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr,
577 "Trying to execute uninitialized streamout jit function.");
578 state.pfnSoFunc[streamIndex](soContext);
579 }
580
581 // Update SO write offset. The driver provides memory for the update.
582 for (uint32_t i = 0; i < 4; ++i)
583 {
584 if (state.soBuffer[i].pWriteOffset)
585 {
586 *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
587 }
588
589 if (state.soBuffer[i].soWriteEnable)
590 {
591 pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
592 pDC->dynState.SoWriteOffsetDirty[i] = true;
593 }
594 }
595
596 UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
597 UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
598
599 RDTSC_END(FEStreamout, 1);
600 }
601
602 #if USE_SIMD16_FRONTEND
603 //////////////////////////////////////////////////////////////////////////
604 /// Is value an even number (a multiple of two)
605 ///
606 template <typename T>
607 INLINE static bool IsEven(T value)
608 {
609 return (value & 1) == 0;
610 }
611
612 //////////////////////////////////////////////////////////////////////////
613 /// Round up value to an even number (a multiple of two)
614 ///
615 template <typename T>
616 INLINE static T RoundUpEven(T value)
617 {
618 return (value + 1) & ~1;
619 }
620
621 //////////////////////////////////////////////////////////////////////////
622 /// Round down value to an even number (a multiple of two)
623 ///
624 template <typename T>
625 INLINE static T RoundDownEven(T value)
626 {
627 return value & ~1;
628 }
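// Examples (editor's note): IsEven(6) == true, IsEven(7) == false;
// RoundUpEven(7) == 8, RoundUpEven(8) == 8; RoundDownEven(7) == 6.
// These pad simd8 counts and strides up to simd16-friendly multiples below.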
629
630 //////////////////////////////////////////////////////////////////////////
631 /// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
632 ///
633 /// vertexCount is in terms of the source simdvertexes; an odd final vertex leaves the upper eight lanes zeroed
634 ///
635 /// attribCount will limit the vector copies to those attribs specified
636 ///
637 /// note: the stride between vertexes is determined by SWR_VTX_NUM_SLOTS
638 ///
639 void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex* vertex_simd16,
640 const simdvertex* vertex,
641 uint32_t vertexCount,
642 uint32_t attribCount)
643 {
644 SWR_ASSERT(vertex);
645 SWR_ASSERT(vertex_simd16);
646 SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);
647
648 simd16vertex temp;
649
650 for (uint32_t i = 0; i < vertexCount; i += 2)
651 {
652 for (uint32_t j = 0; j < attribCount; j += 1)
653 {
654 for (uint32_t k = 0; k < 4; k += 1)
655 {
656 temp.attrib[j][k] =
657 _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);
658
659 if ((i + 1) < vertexCount)
660 {
661 temp.attrib[j][k] =
662 _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
663 }
664 }
665 }
666
667 for (uint32_t j = 0; j < attribCount; j += 1)
668 {
669 vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
670 }
671 }
672 }
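// Layout sketch (editor's note): for each attribute component, the source
// pair (vertex[i], vertex[i + 1]) becomes one simd16 register in
// vertex_simd16[i / 2], with vertex[i] in lanes 0-7 and vertex[i + 1] in
// lanes 8-15 (lanes 8-15 are zero when vertexCount is odd).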
673
674 #endif
675 //////////////////////////////////////////////////////////////////////////
676 /// @brief Computes the number of invocations. The current index represents
677 /// the start of the SIMD. The max index represents how many work
678 /// items remain. If fewer than a SIMD's worth of work items remain,
679 /// this returns the remaining amount of work.
680 /// @param curIndex - The start index for the SIMD.
681 /// @param maxIndex - The last index for all work items.
682 static INLINE uint32_t GetNumInvocations(uint32_t curIndex, uint32_t maxIndex)
683 {
684 uint32_t remainder = (maxIndex - curIndex);
685 #if USE_SIMD16_FRONTEND
686 return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
687 #else
688 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
689 #endif
690 }
691
692 //////////////////////////////////////////////////////////////////////////
693 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
694 /// The geometry shader will loop over each active streamout buffer, assembling
695 /// primitives for the downstream stages. When multistream output is enabled,
696 /// the generated stream ID buffer from the GS needs to be converted to a cut
697 /// buffer for the primitive assembler.
698 /// @param stream - stream id to generate the cut buffer for
699 /// @param pStreamIdBase - pointer to the stream ID buffer
700 /// @param numEmittedVerts - Number of total verts emitted by the GS
701 /// @param pCutBuffer - output buffer to write cuts to
702 void ProcessStreamIdBuffer(uint32_t stream,
703 uint8_t* pStreamIdBase,
704 uint32_t numEmittedVerts,
705 uint8_t* pCutBuffer)
706 {
707 SWR_ASSERT(stream < MAX_SO_STREAMS);
708
709 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
710 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
711
712 for (uint32_t b = 0; b < numOutputBytes; ++b)
713 {
714 uint8_t curInputByte = pStreamIdBase[2 * b];
715 uint8_t outByte = 0;
716 for (uint32_t i = 0; i < 4; ++i)
717 {
718 if ((curInputByte & 0x3) != stream)
719 {
720 outByte |= (1 << i);
721 }
722 curInputByte >>= 2;
723 }
724
725 curInputByte = pStreamIdBase[2 * b + 1];
726 for (uint32_t i = 0; i < 4; ++i)
727 {
728 if ((curInputByte & 0x3) != stream)
729 {
730 outByte |= (1 << (i + 4));
731 }
732 curInputByte >>= 2;
733 }
734
735 *pCutBuffer++ = outByte;
736 }
737 }
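// Worked example (editor's note): each vertex owns 2 stream-ID bits, so one
// input byte covers 4 verts. For stream == 1 and input byte 0b01000111
// (vert0..vert3 carry stream IDs 3, 1, 0, 1, reading from the low bits), the
// low output nibble is 0b0101: a 1 ("cut") for each vertex NOT on stream 1.
// The primitive assembler then treats cut verts as primitive breaks.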
738
739 // Buffers that are allocated if GS is enabled
740 struct GsBuffers
741 {
742 uint8_t* pGsIn; // assembled GS input vertices (simdvector SOA)
743 uint8_t* pGsOut[KNOB_SIMD_WIDTH]; // one GS output vertex stream per SIMD lane
744 uint8_t* pGsTransposed; // GS output transposed for the primitive assembler
745 void* pStreamCutBuffer; // scratch cut buffer for multistream GS output
746 };
747
748 //////////////////////////////////////////////////////////////////////////
749 /// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
750 /// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive
751 /// assembler
752 /// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
753 /// @param numVerts - Number of vertices output by the GS
754 /// @param numAttribs - Number of attributes per vertex
755 template <typename SIMD_T, uint32_t SimdWidth>
756 void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
757 {
758 uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
759 uint32_t dstVertexStride = numAttribs * sizeof(Float<SIMD_T>) * 4;
760
761 OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];
762
763 for (uint32_t i = 0; i < SimdWidth; ++i)
764 {
765 gatherOffsets[i] = srcVertexStride * i;
766 }
767 auto vGatherOffsets = SIMD_T::load_si((Integer<SIMD_T>*)&gatherOffsets[0]);
768
769 uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;
770 uint32_t remainingVerts = numVerts;
771
772 for (uint32_t s = 0; s < numSimd; ++s)
773 {
774 uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
775 uint8_t* pDstBase = pDst + s * dstVertexStride;
776
777 // Compute mask to prevent src overflow
778 uint32_t mask = std::min(remainingVerts, SimdWidth);
779 mask = GenMask(mask);
780 auto vMask = SIMD_T::vmask_ps(mask);
781 auto viMask = SIMD_T::castps_si(vMask);
782
783 for (uint32_t a = 0; a < numAttribs; ++a)
784 {
785 auto attribGatherX = SIMD_T::mask_i32gather_ps(
786 SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
787 auto attribGatherY = SIMD_T::mask_i32gather_ps(
788 SIMD_T::setzero_ps(),
789 (const float*)(pSrcBase + sizeof(float)),
790 vGatherOffsets,
791 vMask);
792 auto attribGatherZ = SIMD_T::mask_i32gather_ps(
793 SIMD_T::setzero_ps(),
794 (const float*)(pSrcBase + sizeof(float) * 2),
795 vGatherOffsets,
796 vMask);
797 auto attribGatherW = SIMD_T::mask_i32gather_ps(
798 SIMD_T::setzero_ps(),
799 (const float*)(pSrcBase + sizeof(float) * 3),
800 vGatherOffsets,
801 vMask);
802
803 SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
804 SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>)), viMask, attribGatherY);
805 SIMD_T::maskstore_ps(
806 (float*)(pDstBase + sizeof(Float<SIMD_T>) * 2), viMask, attribGatherZ);
807 SIMD_T::maskstore_ps(
808 (float*)(pDstBase + sizeof(Float<SIMD_T>) * 3), viMask, attribGatherW);
809
810 pSrcBase += sizeof(float) * 4;
811 pDstBase += sizeof(Float<SIMD_T>) * 4;
812 }
813 remainingVerts -= SimdWidth;
814 }
815 }
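// Editor's note: per attribute, each gather pulls one component from
// SimdWidth consecutive source vertices (stride srcVertexStride), so the
// destination ends up laid out as
//   x0..x7 | y0..y7 | z0..z7 | w0..w7 | <next attribute> ...
// for an 8-wide SIMD. The mask keeps the final, partially filled batch from
// reading or writing past numVerts.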
816
817
818 //////////////////////////////////////////////////////////////////////////
819 /// @brief Implements GS stage.
820 /// @param pDC - pointer to draw context.
821 /// @param workerId - thread's worker id. Every thread has a unique id.
822 /// @param pa - The primitive assembly object.
823 /// @param pGsOut - output stream for GS
824 template <typename HasStreamOutT, typename HasRastT>
825 static void GeometryShaderStage(DRAW_CONTEXT* pDC,
826 uint32_t workerId,
827 PA_STATE& pa,
828 GsBuffers* pGsBuffers,
829 uint32_t* pSoPrimData,
830 #if USE_SIMD16_FRONTEND
831 uint32_t numPrims_simd8,
832 #endif
833 simdscalari const& primID)
834 {
835 RDTSC_BEGIN(FEGeometryShader, pDC->drawId);
836
837 void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
838
839 const API_STATE& state = GetApiState(pDC);
840 const SWR_GS_STATE* pState = &state.gsState;
841 SWR_GS_CONTEXT gsContext;
842
843 static uint8_t sNullBuffer[128] = {0};
844
845 for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
846 {
847 gsContext.pStreams[i] = pGsBuffers->pGsOut[i];
848 }
849 gsContext.pVerts = (simdvector*)pGsBuffers->pGsIn;
850 gsContext.PrimitiveID = primID;
851
852 uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
853 simdvector attrib[MAX_NUM_VERTS_PER_PRIM];
854
855 // assemble all attributes for the input primitive
856 gsContext.inputVertStride = pState->inputVertStride;
857 for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
858 {
859 uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot;
860 uint32_t attribSlot = pState->vertexAttribOffset + slot;
861 pa.Assemble(srcAttribSlot, attrib);
862
863 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
864 {
865 gsContext.pVerts[attribSlot + pState->inputVertStride * i] = attrib[i];
866 }
867 }
868
869 // assemble position
870 pa.Assemble(VERTEX_POSITION_SLOT, attrib);
871 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
872 {
873 gsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = attrib[i];
874 }
875
876 // record valid prims from the frontend to avoid over binning the newly generated
877 // prims from the GS
878 #if USE_SIMD16_FRONTEND
879 uint32_t numInputPrims = numPrims_simd8;
880 #else
881 uint32_t numInputPrims = pa.NumPrims();
882 #endif
883
884 for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
885 {
886 gsContext.InstanceID = instance;
887 gsContext.mask = GenerateMask(numInputPrims);
888
889 // execute the geometry shader
890 state.pfnGsFunc(GetPrivateState(pDC), pWorkerData, &gsContext);
891 AR_EVENT(GSStats((HANDLE)&gsContext.stats));
892
893 for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
894 {
895 gsContext.pStreams[i] += pState->allocationSize;
896 }
897 }
898
899 // set up new binner and state for the GS output topology
900 #if USE_SIMD16_FRONTEND
901 PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
902 if (HasRastT::value)
903 {
904 switch (pState->outputTopology)
905 {
906 case TOP_RECT_LIST:
907 pfnClipFunc = ClipRectangles_simd16;
908 break;
909 case TOP_TRIANGLE_STRIP:
910 pfnClipFunc = ClipTriangles_simd16;
911 break;
912 case TOP_LINE_STRIP:
913 pfnClipFunc = ClipLines_simd16;
914 break;
915 case TOP_POINT_LIST:
916 pfnClipFunc = ClipPoints_simd16;
917 break;
918 default:
919 SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
920 }
921 }
922
923 #else
924 PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
925 if (HasRastT::value)
926 {
927 switch (pState->outputTopology)
928 {
929 case TOP_RECT_LIST:
930 pfnClipFunc = ClipRectangles;
931 break;
932 case TOP_TRIANGLE_STRIP:
933 pfnClipFunc = ClipTriangles;
934 break;
935 case TOP_LINE_STRIP:
936 pfnClipFunc = ClipLines;
937 break;
938 case TOP_POINT_LIST:
939 pfnClipFunc = ClipPoints;
940 break;
941 default:
942 SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
943 }
944 }
945
946 #endif
947 // foreach input prim:
948 // - setup a new PA based on the emitted verts for that prim
949 // - loop over the new verts, calling PA to assemble each prim
950 uint32_t* pPrimitiveId = (uint32_t*)&primID;
951
952 uint32_t totalPrimsGenerated = 0;
953 for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
954 {
955 uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim];
956
957 // Vertex count is either emitted by shader or static
958 uint32_t vertexCount = 0;
959 if (pState->staticVertexCount)
960 {
961 vertexCount = pState->staticVertexCount;
962 }
963 else
964 {
965 // If emitted in shader, it is stored in the first dword of the output buffer
966 vertexCount = *(uint32_t*)pInstanceBase;
967 }
968
969 for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
970 {
971 uint32_t numEmittedVerts = vertexCount;
972 if (numEmittedVerts == 0)
973 {
974 continue;
975 }
976
977 uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;
978 uint8_t* pCutBase =
979 pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;
980 uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;
981
982 #if USE_SIMD16_FRONTEND
983 TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
984 pVertexBaseAOS,
985 vertexCount,
986 pState->outputVertexSize);
987 #else
988 TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
989 pVertexBaseAOS,
990 vertexCount,
991 pState->outputVertexSize);
992 #endif
993
994 uint32_t numAttribs = state.feNumAttributes;
995
996 for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
997 {
998 bool processCutVerts = false;
999 uint8_t* pCutBuffer = pCutBase;
1000
1001 // assign default stream ID, only relevant when GS is outputting a single stream
1002 uint32_t streamID = 0;
1003 if (pState->isSingleStream)
1004 {
1005 processCutVerts = true;
1006 streamID = pState->singleStreamID;
1007 if (streamID != stream)
1008 continue;
1009 }
1010 else
1011 {
1012 // early exit if this stream is not enabled for streamout
1013 if (HasStreamOutT::value && !state.soState.streamEnable[stream])
1014 {
1015 continue;
1016 }
1017
1018 // multi-stream output, need to translate StreamID buffer to a cut buffer
1019 ProcessStreamIdBuffer(
1020 stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);
1021 pCutBuffer = (uint8_t*)pGsBuffers->pStreamCutBuffer;
1022 processCutVerts = false;
1023 }
1024
1025 #if USE_SIMD16_FRONTEND
1026 PA_STATE_CUT gsPa(pDC,
1027 (uint8_t*)pGsBuffers->pGsTransposed,
1028 numEmittedVerts,
1029 pState->outputVertexSize,
1030 reinterpret_cast<simd16mask*>(pCutBuffer),
1031 numEmittedVerts,
1032 numAttribs,
1033 pState->outputTopology,
1034 processCutVerts,
1035 pa.numVertsPerPrim);
1036
1037 #else
1038 PA_STATE_CUT gsPa(pDC,
1039 (uint8_t*)pGsBuffers->pGsTransposed,
1040 numEmittedVerts,
1041 pState->outputVertexSize,
1042 pCutBuffer,
1043 numEmittedVerts,
1044 numAttribs,
1045 pState->outputTopology,
1046 processCutVerts,
1047 pa.numVertsPerPrim);
1048
1049 #endif
1050 while (gsPa.GetNextStreamOutput())
1051 {
1052 do
1053 {
1054 #if USE_SIMD16_FRONTEND
1055 simd16vector attrib_simd16[3];
1056
1057 bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16);
1058
1059 #else
1060 bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
1061
1062 #endif
1063 if (assemble)
1064 {
1065 totalPrimsGenerated += gsPa.NumPrims();
1066
1067 if (HasStreamOutT::value)
1068 {
1069 #if ENABLE_AVX512_SIMD16
1070 gsPa.useAlternateOffset = false;
1071 #endif
1072 StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
1073 }
1074
1075 if (HasRastT::value && state.soState.streamToRasterizer == stream)
1076 {
1077 #if USE_SIMD16_FRONTEND
1078 simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);
1079
1080 // Gather data from the SGV if provided.
1081 simd16scalari vViewportIdx = SIMD16::setzero_si();
1082 simd16scalari vRtIdx = SIMD16::setzero_si();
1083 SIMD16::Vec4 svgAttrib[4];
1084
1085 if (state.backendState.readViewportArrayIndex ||
1086 state.backendState.readRenderTargetArrayIndex)
1087 {
1088 gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
1089 }
1090
1091 if (state.backendState.readViewportArrayIndex)
1092 {
1093 vViewportIdx =
1094 SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
1095 gsPa.viewportArrayActive = true;
1096 }
1097 if (state.backendState.readRenderTargetArrayIndex)
1098 {
1099 vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
1100 gsPa.rtArrayActive = true;
1101 }
1102
1103 {
1104 // OOB VPAI indices => forced to zero.
1105 vViewportIdx =
1106 SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
1107 simd16scalari vNumViewports =
1108 SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1109 simd16scalari vClearMask =
1110 SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
1111 vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);
1112
1113 gsPa.useAlternateOffset = false;
1114 pfnClipFunc(pDC,
1115 gsPa,
1116 workerId,
1117 attrib_simd16,
1118 GenMask(gsPa.NumPrims()),
1119 vPrimId,
1120 vViewportIdx,
1121 vRtIdx);
1122 }
1123 #else
1124 simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
1125
1126 // Gather data from the SGV if provided.
1127 simdscalari vViewportIdx = SIMD::setzero_si();
1128 simdscalari vRtIdx = SIMD::setzero_si();
1129 SIMD::Vec4 svgAttrib[4];
1130
1131 if (state.backendState.readViewportArrayIndex ||
1132 state.backendState.readRenderTargetArrayIndex)
1133 {
1134 gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
1135 }
1136
1137 if (state.backendState.readViewportArrayIndex)
1138 {
1139 vViewportIdx =
1140 SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
1141
1142 // OOB VPAI indices => forced to zero.
1143 vViewportIdx =
1144 SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
1145 simdscalari vNumViewports =
1146 SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1147 simdscalari vClearMask =
1148 SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
1149 vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
1150 gsPa.viewportArrayActive = true;
1151 }
1152 if (state.backendState.readRenderTargetArrayIndex)
1153 {
1154 vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
1155 gsPa.rtArrayActive = true;
1156 }
1157
1158 pfnClipFunc(pDC,
1159 gsPa,
1160 workerId,
1161 attrib,
1162 GenMask(gsPa.NumPrims()),
1163 vPrimId,
1164 vViewportIdx,
1165 vRtIdx);
1166 #endif
1167 }
1168 }
1169 } while (gsPa.NextPrim());
1170 }
1171 }
1172 }
1173 }
1174
1175 // update GS pipeline stats
1176 UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
1177 UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
1178 AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim * numInputPrims));
1179 RDTSC_END(FEGeometryShader, 1);
1180 }
1181
1182 //////////////////////////////////////////////////////////////////////////
1183 /// @brief Allocate GS buffers
1184 /// @param pDC - pointer to draw context.
1185 /// @param state - API state
1186 /// @param ppGsOut - pointer to GS output buffer allocation
1187 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
1188 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1189 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC,
1190 const API_STATE& state,
1191 uint32_t vertsPerPrim,
1192 GsBuffers* pGsBuffers)
1193 {
1194 auto pArena = pDC->pArena;
1195 SWR_ASSERT(pArena != nullptr);
1196 SWR_ASSERT(state.gsState.gsEnable);
1197
1198 const SWR_GS_STATE& gsState = state.gsState;
1199
1200 // Allocate storage for vertex inputs
1201 uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim;
1202 pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);
1203
1204 // Allocate arena space to hold GS output verts
1205 const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize;
1206
1207 for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
1208 {
1209 pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32);
1210 }
1211
1212 // Allocate storage for transposed GS output
1213 uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;
1214 uint32_t transposedBufferSize =
1215 numSimdBatches * gsState.outputVertexSize * sizeof(Vec4<SIMD_T>);
1216 pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);
1217
1218 // Allocate storage to hold temporary stream->cut buffer, if necessary
1219 if (state.gsState.isSingleStream)
1220 {
1221 pGsBuffers->pStreamCutBuffer = nullptr;
1222 }
1223 else
1224 {
1225 pGsBuffers->pStreamCutBuffer =
1226 (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);
1227 }
1228 }
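// Sizing sketch (editor's note, illustrative numbers): with
// inputVertStride == 32 slots and vertsPerPrim == 6 (tri list w/ adjacency),
// pGsIn spans 32 * sizeof(simdvector) * 6 bytes of assembled input, while
// each of the KNOB_SIMD_WIDTH pGsOut lanes gets its own
// instanceCount * allocationSize bytes, since every SIMD lane's GS
// invocation emits an independent output stream.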
1229
1230 //////////////////////////////////////////////////////////////////////////
1231 /// @brief Contains all data generated by the HS and passed to the
1232 /// tessellator and DS.
1233 struct TessellationThreadLocalData
1234 {
1235 SWR_HS_CONTEXT hsContext;
1236 ScalarPatch patchData[KNOB_SIMD_WIDTH];
1237 void* pTxCtx;
1238 size_t tsCtxSize;
1239
1240 simdscalar* pDSOutput;
1241 size_t dsOutputAllocSize;
1242 };
1243
1244 THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
1245
1246 //////////////////////////////////////////////////////////////////////////
1247 /// @brief Allocate tessellation data for this worker thread.
1248 INLINE
1249 static void AllocateTessellationData(SWR_CONTEXT* pContext)
1250 {
1251 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
1252 if (gt_pTessellationThreadData == nullptr)
1253 {
1254 gt_pTessellationThreadData =
1255 (TessellationThreadLocalData*)AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
1256 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
1257 }
1258 }
1259
1260 //////////////////////////////////////////////////////////////////////////
1261 /// @brief Implements Tessellation Stages.
1262 /// @param pDC - pointer to draw context.
1263 /// @param workerId - thread's worker id. Every thread has a unique id.
1264 /// @param pa - The primitive assembly object.
1265 /// @param pGsOut - output stream for GS
1266 template <typename HasGeometryShaderT, typename HasStreamOutT, typename HasRastT>
1267 static void TessellationStages(DRAW_CONTEXT* pDC,
1268 uint32_t workerId,
1269 PA_STATE& pa,
1270 GsBuffers* pGsBuffers,
1271 uint32_t* pSoPrimData,
1272 #if USE_SIMD16_FRONTEND
1273 uint32_t numPrims_simd8,
1274 #endif
1275 simdscalari const& primID)
1276 {
1277 const API_STATE& state = GetApiState(pDC);
1278 const SWR_TS_STATE& tsState = state.tsState;
1279 void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
1280
1281 SWR_ASSERT(gt_pTessellationThreadData);
1282
1283 HANDLE tsCtx = TSInitCtx(tsState.domain,
1284 tsState.partitioning,
1285 tsState.tsOutputTopology,
1286 gt_pTessellationThreadData->pTxCtx,
1287 gt_pTessellationThreadData->tsCtxSize);
1288 if (tsCtx == nullptr)
1289 {
1290 gt_pTessellationThreadData->pTxCtx =
1291 AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
1292 tsCtx = TSInitCtx(tsState.domain,
1293 tsState.partitioning,
1294 tsState.tsOutputTopology,
1295 gt_pTessellationThreadData->pTxCtx,
1296 gt_pTessellationThreadData->tsCtxSize);
1297 }
1298 SWR_ASSERT(tsCtx);
1299
1300 #if USE_SIMD16_FRONTEND
1301 PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
1302 if (HasRastT::value)
1303 {
1304 switch (tsState.postDSTopology)
1305 {
1306 case TOP_TRIANGLE_LIST:
1307 pfnClipFunc = ClipTriangles_simd16;
1308 break;
1309 case TOP_LINE_LIST:
1310 pfnClipFunc = ClipLines_simd16;
1311 break;
1312 case TOP_POINT_LIST:
1313 pfnClipFunc = ClipPoints_simd16;
1314 break;
1315 default:
1316 SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
1317 }
1318 }
1319
1320 #else
1321 PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
1322 if (HasRastT::value)
1323 {
1324 switch (tsState.postDSTopology)
1325 {
1326 case TOP_TRIANGLE_LIST:
1327 pfnClipFunc = ClipTriangles;
1328 break;
1329 case TOP_LINE_LIST:
1330 pfnClipFunc = ClipLines;
1331 break;
1332 case TOP_POINT_LIST:
1333 pfnClipFunc = ClipPoints;
1334 break;
1335 default:
1336 SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
1337 }
1338 }
1339
1340 #endif
1341 SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
1342 hsContext.pCPout = gt_pTessellationThreadData->patchData;
1343 hsContext.PrimitiveID = primID;
1344
1345 uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
1346 // Max storage for one attribute for an entire simdprimitive
1347 simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];
1348
1349 // assemble all attributes for the input primitives
1350 for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
1351 {
1352 uint32_t attribSlot = tsState.vertexAttribOffset + slot;
1353 pa.Assemble(attribSlot, simdattrib);
1354
1355 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
1356 {
1357 hsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = simdattrib[i];
1358 }
1359 }
1360
1361 #if defined(_DEBUG)
1362 memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
1363 #endif
1364
1365 #if USE_SIMD16_FRONTEND
1366 uint32_t numPrims = numPrims_simd8;
1367 #else
1368 uint32_t numPrims = pa.NumPrims();
1369 #endif
1370 hsContext.mask = GenerateMask(numPrims);
1371
1372 // Run the HS
1373 RDTSC_BEGIN(FEHullShader, pDC->drawId);
1374 state.pfnHsFunc(GetPrivateState(pDC), pWorkerData, &hsContext);
1375 RDTSC_END(FEHullShader, 0);
1376
1377 UPDATE_STAT_FE(HsInvocations, numPrims);
1378 AR_EVENT(HSStats((HANDLE)&hsContext.stats));
1379
1380 const uint32_t* pPrimId = (const uint32_t*)&primID;
1381
1382 for (uint32_t p = 0; p < numPrims; ++p)
1383 {
1384 // Run Tessellator
1385 SWR_TS_TESSELLATED_DATA tsData = {0};
1386 RDTSC_BEGIN(FETessellation, pDC->drawId);
1387 TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
1388 AR_EVENT(TessPrimCount(1));
1389 RDTSC_END(FETessellation, 0);
1390
1391 if (tsData.NumPrimitives == 0)
1392 {
1393 continue;
1394 }
1395 SWR_ASSERT(tsData.NumDomainPoints);
1396
1397 // Allocate DS Output memory
1398 uint32_t requiredDSVectorInvocations =
1399 AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
1400 #if USE_SIMD16_FRONTEND
1401 size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) *
1402 tsState.dsAllocationSize; // simd8 -> simd16, padding
1403 #else
1404 size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize;
1405 size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
1406 #endif
1407 if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize)
1408 {
1409 AlignedFree(gt_pTessellationThreadData->pDSOutput);
1410 gt_pTessellationThreadData->pDSOutput =
1411 (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
1412 gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize;
1413 }
1414 SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
1415 SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= requiredAllocSize);
1416
1417 #if defined(_DEBUG)
1418 memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
1419 #endif
1420
1421 // Run Domain Shader
1422 SWR_DS_CONTEXT dsContext;
1423 dsContext.PrimitiveID = pPrimId[p];
1424 dsContext.pCpIn = &hsContext.pCPout[p];
1425 dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
1426 dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
1427 dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
1428 dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset;
1429 #if USE_SIMD16_FRONTEND
1430 dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16
1431 #else
1432 dsContext.vectorStride = requiredDSVectorInvocations;
1433 #endif
1434
1435 uint32_t dsInvocations = 0;
1436
1437 for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations;
1438 ++dsContext.vectorOffset)
1439 {
1440 dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
1441
1442 RDTSC_BEGIN(FEDomainShader, pDC->drawId);
1443 state.pfnDsFunc(GetPrivateState(pDC), pWorkerData, &dsContext);
1444 RDTSC_END(FEDomainShader, 0);
1445
1446 AR_EVENT(DSStats((HANDLE)&dsContext.stats));
1447
1448 dsInvocations += KNOB_SIMD_WIDTH;
1449 }
1450 UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
1451
1452 #if USE_SIMD16_FRONTEND
1453 SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16
1454
1455 #endif
1456 PA_TESS tessPa(
1457 pDC,
1458 #if USE_SIMD16_FRONTEND
1459 reinterpret_cast<const simd16scalar*>(dsContext.pOutputData), // simd8 -> simd16
1460 dsContext.vectorStride / 2, // simd8 -> simd16
1461 #else
1462 dsContext.pOutputData,
1463 dsContext.vectorStride,
1464 #endif
1465 SWR_VTX_NUM_SLOTS,
1466 tsState.numDsOutputAttribs + tsState.dsOutVtxAttribOffset,
1467 tsData.ppIndices,
1468 tsData.NumPrimitives,
1469 tsState.postDSTopology,
1470 NumVertsPerPrim(tsState.postDSTopology, false));
1471
1472 while (tessPa.HasWork())
1473 {
1474 #if USE_SIMD16_FRONTEND
1475 const uint32_t numPrims = tessPa.NumPrims();
1476 const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
1477 const uint32_t numPrims_hi =
1478 std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
1479
1480 const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
1481 const simdscalari primID_lo = _simd16_extract_si(primID, 0);
1482 const simdscalari primID_hi = _simd16_extract_si(primID, 1);
1483
1484 #endif
1485 if (HasGeometryShaderT::value)
1486 {
1487 #if USE_SIMD16_FRONTEND
1488 tessPa.useAlternateOffset = false;
1489 GeometryShaderStage<HasStreamOutT, HasRastT>(
1490 pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);
1491
1492 if (numPrims_hi)
1493 {
1494 tessPa.useAlternateOffset = true;
1495 GeometryShaderStage<HasStreamOutT, HasRastT>(
1496 pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);
1497 }
1498 #else
1499 GeometryShaderStage<HasStreamOutT, HasRastT>(
1500 pDC,
1501 workerId,
1502 tessPa,
1503 pGsBuffers,
1504 pSoPrimData,
1505 _simd_set1_epi32(dsContext.PrimitiveID));
1506 #endif
1507 }
1508 else
1509 {
1510 if (HasStreamOutT::value)
1511 {
1512 #if ENABLE_AVX512_SIMD16
1513 tessPa.useAlternateOffset = false;
1514 #endif
1515 StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
1516 }
1517
1518 if (HasRastT::value)
1519 {
1520 #if USE_SIMD16_FRONTEND
1521 simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points
1522 #else
1523 simdvector prim[3]; // Only deal with triangles, lines, or points
1524 #endif
1525 RDTSC_BEGIN(FEPAAssemble, pDC->drawId);
1526 bool assemble =
1527 #if USE_SIMD16_FRONTEND
1528 tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
1529 #else
1530 tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
1531 #endif
1532 RDTSC_END(FEPAAssemble, 1);
1533 SWR_ASSERT(assemble);
1534
1535 SWR_ASSERT(pfnClipFunc);
1536 #if USE_SIMD16_FRONTEND
1537 // Gather data from the SGV if provided.
1538 simd16scalari vViewportIdx = SIMD16::setzero_si();
1539 simd16scalari vRtIdx = SIMD16::setzero_si();
1540 SIMD16::Vec4 svgAttrib[4];
1541
1542 if (state.backendState.readViewportArrayIndex ||
1543 state.backendState.readRenderTargetArrayIndex)
1544 {
1545 tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
1546 }
1547
1548 if (state.backendState.readViewportArrayIndex)
1549 {
1550 vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
1551 tessPa.viewportArrayActive = true;
1552 }
1553 if (state.backendState.readRenderTargetArrayIndex)
1554 {
1555 vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
1556 tessPa.rtArrayActive = true;
1557 }
1558
1559
1560 {
1561 // OOB VPAI indices => forced to zero.
1562 vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
1563 simd16scalari vNumViewports =
1564 SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1565 simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
1566 vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);
1567
1568 tessPa.useAlternateOffset = false;
1569 pfnClipFunc(pDC,
1570 tessPa,
1571 workerId,
1572 prim_simd16,
1573 GenMask(numPrims),
1574 primID,
1575 vViewportIdx,
1576 vRtIdx);
1577 }
1578 #else
1579 // Gather data from the SGV if provided.
1580 simdscalari vViewportIdx = SIMD::setzero_si();
1581 simdscalari vRtIdx = SIMD::setzero_si();
1582 SIMD::Vec4 svgAttrib[4];
1583
1584 if (state.backendState.readViewportArrayIndex ||
1585 state.backendState.readRenderTargetArrayIndex)
1586 {
1587 tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
1588 }
1589
1590 if (state.backendState.readViewportArrayIndex)
1591 {
1592 vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
1593
1594 // OOB VPAI indices => forced to zero.
1595 vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
1596 simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1597 simdscalari vClearMask = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
1598 vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
1599 tessPa.viewportArrayActive = true;
1600 }
1601 if (state.backendState.readRenderTargetArrayIndex)
1602 {
1603 vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
1604 tessPa.rtArrayActive = true;
1605 }
1606 pfnClipFunc(pDC,
1607 tessPa,
1608 workerId,
1609 prim,
1610 GenMask(tessPa.NumPrims()),
1611 _simd_set1_epi32(dsContext.PrimitiveID),
1612 vViewportIdx,
1613 vRtIdx);
1614 #endif
1615 }
1616 }
1617
1618 tessPa.NextPrim();
1619
1620 } // while (tessPa.HasWork())
1621 } // for (uint32_t p = 0; p < numPrims; ++p)
1622
1623 #if USE_SIMD16_FRONTEND
1624 if (gt_pTessellationThreadData->pDSOutput != nullptr)
1625 {
1626 AlignedFree(gt_pTessellationThreadData->pDSOutput);
1627 gt_pTessellationThreadData->pDSOutput = nullptr;
1628 }
1629 gt_pTessellationThreadData->dsOutputAllocSize = 0;
1630
1631 #endif
1632 TSDestroyCtx(tsCtx);
1633 }
1634
1635 THREAD PA_STATE::SIMDVERTEX* gpVertexStore = nullptr;
1636 THREAD uint32_t gVertexStoreSize = 0;
1637
1638 //////////////////////////////////////////////////////////////////////////
1639 /// @brief FE handler for SwrDraw.
1640 /// @tparam IsIndexedT - Is indexed drawing enabled
1641 /// @tparam HasTessellationT - Is tessellation enabled
1642 /// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
1643 /// @tparam HasStreamOutT - Is stream-out enabled
1644 /// @tparam HasRastT - Is rasterization enabled
1645 /// @param pContext - pointer to SWR context.
1646 /// @param pDC - pointer to draw context.
1647 /// @param workerId - thread's worker id.
1648 /// @param pUserData - Pointer to DRAW_WORK
1649 template <typename IsIndexedT,
1650 typename IsCutIndexEnabledT,
1651 typename HasTessellationT,
1652 typename HasGeometryShaderT,
1653 typename HasStreamOutT,
1654 typename HasRastT>
1655 void ProcessDraw(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
1656 {
1657 #if KNOB_ENABLE_TOSS_POINTS
1658 if (KNOB_TOSS_QUEUE_FE)
1659 {
1660 return;
1661 }
1662 #endif
1663
1664 RDTSC_BEGIN(FEProcessDraw, pDC->drawId);
1665
1666 void* pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
1667
1668 DRAW_WORK& work = *(DRAW_WORK*)pUserData;
1669 const API_STATE& state = GetApiState(pDC);
1670
1671 uint32_t indexSize = 0;
1672 uint32_t endVertex = work.numVerts;
1673
1674 gfxptr_t xpLastRequestedIndex = 0;
1675 if (IsIndexedT::value)
1676 {
1677 switch (work.type)
1678 {
1679 case R32_UINT:
1680 indexSize = sizeof(uint32_t);
1681 break;
1682 case R16_UINT:
1683 indexSize = sizeof(uint16_t);
1684 break;
1685 case R8_UINT:
1686 indexSize = sizeof(uint8_t);
1687 break;
1688 default:
1689 SWR_INVALID("Invalid work.type: %d", work.type);
1690 }
1691 xpLastRequestedIndex = work.xpIB + endVertex * indexSize;
1692 }
1693 else
1694 {
1695 // No cuts, prune partial primitives.
1696 endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
1697 }
1698
1699 #if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
1700 uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
1701 #endif
1702
1703 GsBuffers gsBuffers;
1704 if (HasGeometryShaderT::value)
1705 {
1706 #if USE_SIMD16_FRONTEND
1707 AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(
1708 pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
1709 #else
1710 AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(
1711 pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
1712 #endif
1713 }
1714
1715 if (HasTessellationT::value)
1716 {
1717 SWR_ASSERT(state.tsState.tsEnable == true);
1718 SWR_ASSERT(state.pfnHsFunc != nullptr);
1719 SWR_ASSERT(state.pfnDsFunc != nullptr);
1720
1721 AllocateTessellationData(pContext);
1722 }
1723 else
1724 {
1725 SWR_ASSERT(state.tsState.tsEnable == false);
1726 SWR_ASSERT(state.pfnHsFunc == nullptr);
1727 SWR_ASSERT(state.pfnDsFunc == nullptr);
1728 }
1729
1730 // allocate space for streamout input prim data
1731 uint32_t* pSoPrimData = nullptr;
1732 if (HasStreamOutT::value)
1733 {
1734 pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
1735 }
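    // 4KB of per-draw arena scratch, presumably sized for the worst case; the
    // StreamOut() calls below use it to stage per-primitive vertex data before
    // writing to the bound SO buffers.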
1736
1737 const uint32_t vertexCount = NumVertsPerPrim(state.topology, true);
1738 #if USE_SIMD16_FRONTEND
1739 uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector);
1740 #else
1741 uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector);
1742 #endif
1743
1744 SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);
1745
1746 // Compute storage requirements for vertex store
1747 // TODO: allocation needs to be rethought for better cut support
1748 uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine
1749 uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes;
1750
1751 // grow the vertex store for the PA as necessary
1752 if (gVertexStoreSize < vertexStoreSize)
1753 {
1754 if (gpVertexStore != nullptr)
1755 {
1756 AlignedFree(gpVertexStore);
1757 gpVertexStore = nullptr;
1758 }
1759
1760 SWR_ASSERT(gpVertexStore == nullptr);
1761
1762 gpVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX*>(AlignedMalloc(vertexStoreSize, 64));
1763 gVertexStoreSize = vertexStoreSize;
1764
1765 SWR_ASSERT(gpVertexStore != nullptr);
1766 }
1767
1768 // choose primitive assembler
1769
1770 PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC,
1771 state.topology,
1772 work.numVerts,
1773 gpVertexStore,
1774 numVerts,
1775 state.frontendState.vsVertexSize,
1776 GetNumVerts(state.topology, 1));
1777 PA_STATE& pa = paFactory.GetPA();
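    // PA_FACTORY picks a primitive-assembler implementation from the topology and
    // the IsIndexedT/IsCutIndexEnabledT traits (e.g. a cut-aware assembler when
    // primitive restart is enabled); GetPA returns the chosen state machine by
    // reference, backed by the thread-local vertex store above.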
1778
1779 #if USE_SIMD16_FRONTEND
1780 #if USE_SIMD16_SHADERS
1781 simd16vertex vin;
1782 #else
1783 simdvertex vin_lo;
1784 simdvertex vin_hi;
1785 #endif
1786 SWR_VS_CONTEXT vsContext_lo;
1787 SWR_VS_CONTEXT vsContext_hi;
1788
1789 #if USE_SIMD16_SHADERS
1790 vsContext_lo.pVin = reinterpret_cast<simdvertex*>(&vin);
1791 vsContext_hi.pVin = reinterpret_cast<simdvertex*>(&vin);
1792 #else
1793 vsContext_lo.pVin = &vin_lo;
1794 vsContext_hi.pVin = &vin_hi;
1795 #endif
1796 vsContext_lo.AlternateOffset = 0;
1797 vsContext_hi.AlternateOffset = 1;
1798
1799 SWR_FETCH_CONTEXT fetchInfo_lo = {0};
1800
1801 fetchInfo_lo.pStreams = &state.vertexBuffers[0];
1802 fetchInfo_lo.StartInstance = work.startInstance;
1803 fetchInfo_lo.StartVertex = 0;
1804
1805 if (IsIndexedT::value)
1806 {
1807 fetchInfo_lo.BaseVertex = work.baseVertex;
1808
        // If the entire index buffer isn't being consumed, set the last index
        // so that fetches of less than a full SIMD width are masked off.
1811 fetchInfo_lo.xpLastIndex = state.indexBuffer.xpIndices + state.indexBuffer.size;
1812 if (xpLastRequestedIndex < fetchInfo_lo.xpLastIndex)
1813 {
1814 fetchInfo_lo.xpLastIndex = xpLastRequestedIndex;
1815 }
1816 }
1817 else
1818 {
1819 fetchInfo_lo.StartVertex = work.startVertex;
1820 }
1821
1822 SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;
1823
1824 const simd16scalari vScale =
1825 _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
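    // vScale holds the lane indices 0..15 (set_epi32 arguments run high lane to
    // low); adding it to a broadcast of startVertexID below synthesizes sequential
    // vertex IDs for non-indexed draws.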
1826
1827 for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1828 {
1829 uint32_t i = 0;
1830
1831 simd16scalari vIndex;
1832
1833 if (IsIndexedT::value)
1834 {
1835 fetchInfo_lo.xpIndices = work.xpIB;
1836 fetchInfo_hi.xpIndices =
1837 fetchInfo_lo.xpIndices + KNOB_SIMD_WIDTH * indexSize; // 1/2 of KNOB_SIMD16_WIDTH
1838 }
1839 else
1840 {
1841 vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);
1842
1843 fetchInfo_lo.xpIndices = pDC->pContext->pfnMakeGfxPtr(GetPrivateState(pDC), &vIndex);
            // The half-width (1/2 of KNOB_SIMD16_WIDTH) offset must be applied in
            // bytes; stepping &vIndex as a simd16scalari* would advance by whole
            // vectors and point far past vIndex.
            fetchInfo_hi.xpIndices = pDC->pContext->pfnMakeGfxPtr(
                GetPrivateState(pDC),
                reinterpret_cast<uint8_t*>(&vIndex) + KNOB_SIMD_WIDTH * sizeof(int32_t));
1847 }
1848
1849 fetchInfo_lo.CurInstance = instanceNum;
1850 fetchInfo_hi.CurInstance = instanceNum;
1851
1852 vsContext_lo.InstanceID = instanceNum;
1853 vsContext_hi.InstanceID = instanceNum;
1854
1855 while (pa.HasWork())
1856 {
            // GetNextVsOutput currently has the side effect of updating some PA state
            // machine state, so it must stay outside of the (i < endVertex) check.
1859
1860 simdmask* pvCutIndices_lo = nullptr;
1861 simdmask* pvCutIndices_hi = nullptr;
1862
1863 if (IsIndexedT::value)
1864 {
1865 // simd16mask <=> simdmask[2]
1866
1867 pvCutIndices_lo = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[0];
1868 pvCutIndices_hi = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[1];
1869 }
1870
1871 simd16vertex& vout = pa.GetNextVsOutput();
1872
1873 vsContext_lo.pVout = reinterpret_cast<simdvertex*>(&vout);
1874 vsContext_hi.pVout = reinterpret_cast<simdvertex*>(&vout);
1875
1876 if (i < endVertex)
1877 {
1878 if (!IsIndexedT::value)
1879 {
1880 fetchInfo_lo.xpLastIndex = fetchInfo_lo.xpIndices;
                    uint32_t offset = std::min(endVertex - i, (uint32_t)KNOB_SIMD16_WIDTH);
                    offset *= sizeof(int32_t); // convert from index count to byte offset
#if USE_SIMD16_SHADERS
                    fetchInfo_lo.xpLastIndex += offset;
#else
                    // Clamp in byte units (offset is already scaled by sizeof(int32_t));
                    // the paired min() also keeps offset2 from underflowing when fewer
                    // than KNOB_SIMD_WIDTH vertices remain.
                    const uint32_t loBytes =
                        uint32_t(KNOB_SIMD_WIDTH * sizeof(int32_t)); // 1/2 of KNOB_SIMD16_WIDTH
                    fetchInfo_lo.xpLastIndex += std::min(offset, loBytes);
                    uint32_t offset2 = std::min(offset, 2 * loBytes) - std::min(offset, loBytes);
                    fetchInfo_hi.xpLastIndex = fetchInfo_hi.xpIndices;
                    fetchInfo_hi.xpLastIndex += offset2;
#endif
1893 #endif
1894 }
                // 1. Execute the fetch shader (FS) and vertex shader (VS) for a single SIMD16.
1896 RDTSC_BEGIN(FEFetchShader, pDC->drawId);
1897 #if USE_SIMD16_SHADERS
1898 state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin);
1899 #else
1900 state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin_lo);
1901
1902 if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
1903 {
1904 state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_hi, vin_hi);
1905 }
1906 #endif
1907 RDTSC_END(FEFetchShader, 0);
1908
1909 // forward fetch generated vertex IDs to the vertex shader
1910 #if USE_SIMD16_SHADERS
1911 #if USE_SIMD16_VS
1912 vsContext_lo.VertexID16 =
1913 _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0);
1914 vsContext_lo.VertexID16 =
1915 _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1);
1916 #else
1917 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
1918 vsContext_hi.VertexID = fetchInfo_lo.VertexID2;
1919 #endif
1920 #else
1921 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
1922 vsContext_hi.VertexID = fetchInfo_hi.VertexID;
1923 #endif
1924
1925 // Setup active mask for vertex shader.
1926 #if USE_SIMD16_VS
1927 vsContext_lo.mask16 = GenerateMask16(endVertex - i);
1928 #else
1929 vsContext_lo.mask = GenerateMask(endVertex - i);
1930 vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
1931 #endif
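                // GenerateMask/GenerateMask16 enable the low min(n, SIMD width) lanes, so
                // the final, partially filled SIMD runs with its trailing lanes masked off
                // (e.g. 3 vertices remaining on an 8-wide SIMD gives mask 0b00000111).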
1932
1933 // forward cut mask to the PA
1934 if (IsIndexedT::value)
1935 {
1936 #if USE_SIMD16_SHADERS
1937 *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
1938 *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask2));
1939 #else
1940 *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
1941 *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
1942 #endif
1943 }
1944
1945 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
1946
1947 #if KNOB_ENABLE_TOSS_POINTS
1948 if (!KNOB_TOSS_FETCH)
1949 #endif
1950 {
1951 RDTSC_BEGIN(FEVertexShader, pDC->drawId);
1952 #if USE_SIMD16_VS
1953 state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
1954 AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));
1955 #else
1956 state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
1957 AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));
1958
1959 if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
1960 {
1961 state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_hi);
1962 AR_EVENT(VSStats((HANDLE)&vsContext_hi.stats));
1963 }
1964 #endif
1965 RDTSC_END(FEVertexShader, 0);
1966
1967 UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
1968 }
1969 }
1970
            // 2. Assemble primitives from the last two SIMDs of shaded vertices.
1972 do
1973 {
1974 simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];
1975
                RDTSC_BEGIN(FEPAAssemble, pDC->drawId);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
                RDTSC_END(FEPAAssemble, 1);
1979
1980 #if KNOB_ENABLE_TOSS_POINTS
1981 if (!KNOB_TOSS_FETCH)
1982 #endif
1983 {
1984 #if KNOB_ENABLE_TOSS_POINTS
1985 if (!KNOB_TOSS_VS)
1986 #endif
1987 {
1988 if (assemble)
1989 {
1990 UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
1991
1992 const uint32_t numPrims = pa.NumPrims();
1993 const uint32_t numPrims_lo =
1994 std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
1995 const uint32_t numPrims_hi =
1996 std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
1997
1998 const simd16scalari primID = pa.GetPrimID(work.startPrimID);
1999 const simdscalari primID_lo = _simd16_extract_si(primID, 0);
2000 const simdscalari primID_hi = _simd16_extract_si(primID, 1);
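                            // Split the SIMD16 primitive set into two SIMD8 halves for the
                            // 8-wide downstream stages: e.g. numPrims == 11 gives
                            // numPrims_lo == 8 and numPrims_hi == 3. The max() form keeps
                            // the subtraction from underflowing when numPrims < KNOB_SIMD_WIDTH.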
2001
2002 if (HasTessellationT::value)
2003 {
2004 pa.useAlternateOffset = false;
2005 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
2006 pDC,
2007 workerId,
2008 pa,
2009 &gsBuffers,
2010 pSoPrimData,
2011 numPrims_lo,
2012 primID_lo);
2013
2014 if (numPrims_hi)
2015 {
2016 pa.useAlternateOffset = true;
2017 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
2018 pDC,
2019 workerId,
2020 pa,
2021 &gsBuffers,
2022 pSoPrimData,
2023 numPrims_hi,
2024 primID_hi);
2025 }
2026 }
2027 else if (HasGeometryShaderT::value)
2028 {
2029 pa.useAlternateOffset = false;
2030 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
2031 workerId,
2032 pa,
2033 &gsBuffers,
2034 pSoPrimData,
2035 numPrims_lo,
2036 primID_lo);
2037
2038 if (numPrims_hi)
2039 {
2040 pa.useAlternateOffset = true;
2041 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
2042 workerId,
2043 pa,
2044 &gsBuffers,
2045 pSoPrimData,
2046 numPrims_hi,
2047 primID_hi);
2048 }
2049 }
2050 else
2051 {
2052 // If streamout is enabled then stream vertices out to memory.
2053 if (HasStreamOutT::value)
2054 {
2055 pa.useAlternateOffset = false;
2056 StreamOut(pDC, pa, workerId, pSoPrimData, 0);
2057 }
2058
2059 if (HasRastT::value)
2060 {
2061 SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
                            // Gather system-generated values (SGV) if provided.
2063 simd16scalari vpai = SIMD16::setzero_si();
2064 simd16scalari rtai = SIMD16::setzero_si();
2065 SIMD16::Vec4 svgAttrib[4];
2066
2067 if (state.backendState.readViewportArrayIndex ||
2068 state.backendState.readRenderTargetArrayIndex)
2069 {
2070 pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
2071 }
2072
2073 if (state.backendState.readViewportArrayIndex)
2074 {
2075 vpai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
2076 pa.viewportArrayActive = true;
2077 }
2078 if (state.backendState.readRenderTargetArrayIndex)
2079 {
2080 rtai =
2081 SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
2082 pa.rtArrayActive = true;
2083 }
2084
2085 {
2086 // OOB VPAI indices => forced to zero.
2087 vpai = SIMD16::max_epi32(vpai, SIMD16::setzero_si());
2088 simd16scalari vNumViewports =
2089 SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
2090 simd16scalari vClearMask =
2091 SIMD16::cmplt_epi32(vpai, vNumViewports);
2092 vpai = SIMD16::and_si(vClearMask, vpai);
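                                // e.g. a lane with vpai == -2 clamps to 0 via max_epi32,
                                // and a lane with vpai >= KNOB_NUM_VIEWPORTS_SCISSORS fails
                                // cmplt and is and-ed down to 0.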
2093
2094 pa.useAlternateOffset = false;
2095 pDC->pState->pfnProcessPrims_simd16(pDC,
2096 pa,
2097 workerId,
2098 prim_simd16,
2099 GenMask(numPrims),
2100 primID,
2101 vpai,
2102 rtai);
2103 }
2104 }
2105 }
2106 }
2107 }
2108 }
2109 } while (pa.NextPrim());
2110
2111 if (IsIndexedT::value)
2112 {
2113 fetchInfo_lo.xpIndices = fetchInfo_lo.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
2114 fetchInfo_hi.xpIndices = fetchInfo_hi.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
2115 }
2116 else
2117 {
2118 vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
2119 }
2120
2121 i += KNOB_SIMD16_WIDTH;
2122 }
2123
2124 pa.Reset();
2125 }
2126
2127 #else
2128 SWR_VS_CONTEXT vsContext;
2129 SWR_FETCH_CONTEXT fetchInfo = {0};
2130
2131 fetchInfo.pStreams = &state.vertexBuffers[0];
2132 fetchInfo.StartInstance = work.startInstance;
2133 fetchInfo.StartVertex = 0;
2134
2135 if (IsIndexedT::value)
2136 {
2137 fetchInfo.BaseVertex = work.baseVertex;
2138
        // If the entire index buffer isn't being consumed, set the last index
        // so that fetches of less than a full SIMD width are masked off.
        fetchInfo.pLastIndex =
            (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        // xpLastRequestedIndex is a gfxptr_t, so compare and assign through explicit casts.
        if (xpLastRequestedIndex < (gfxptr_t)fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = (const int32_t*)xpLastRequestedIndex;
        }
2147 }
2148 else
2149 {
2150 fetchInfo.StartVertex = work.startVertex;
2151 }
2152
2153 const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
2154
2155 /// @todo: temporarily move instance loop in the FE to ensure SO ordering
2156 for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
2157 {
2158 simdscalari vIndex;
2159 uint32_t i = 0;
2160
2161 if (IsIndexedT::value)
2162 {
2163 fetchInfo.pIndices = work.pIB;
2164 }
2165 else
2166 {
2167 vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
2168 fetchInfo.pIndices = (const int32_t*)&vIndex;
2169 }
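        // For non-indexed draws the fetch shader still walks an "index" stream, so
        // vIndex (startVertexID plus the lane offsets 0..7 in vScale) serves as an
        // in-memory pseudo index buffer via the pIndices pointer.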
2170
2171 fetchInfo.CurInstance = instanceNum;
2172 vsContext.InstanceID = instanceNum;
2173
2174 while (pa.HasWork())
2175 {
            // GetNextVsOutput currently has the side effect of updating some PA state
            // machine state, so it must stay outside of the (i < endVertex) check.
2178 simdmask* pvCutIndices = nullptr;
2179 if (IsIndexedT::value)
2180 {
2181 pvCutIndices = &pa.GetNextVsIndices();
2182 }
2183
2184 simdvertex& vout = pa.GetNextVsOutput();
2185 vsContext.pVin = &vout;
2186 vsContext.pVout = &vout;
2187
2188 if (i < endVertex)
2189 {
                // 1. Execute the fetch shader (FS) and vertex shader (VS) for a single SIMD.
2191 RDTSC_BEGIN(FEFetchShader, pDC->drawId);
2192 state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo, vout);
2193 RDTSC_END(FEFetchShader, 0);
2194
2195 // forward fetch generated vertex IDs to the vertex shader
2196 vsContext.VertexID = fetchInfo.VertexID;
2197
2198 // Setup active mask for vertex shader.
2199 vsContext.mask = GenerateMask(endVertex - i);
2200
2201 // forward cut mask to the PA
2202 if (IsIndexedT::value)
2203 {
2204 *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
2205 }
2206
2207 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
2208
2209 #if KNOB_ENABLE_TOSS_POINTS
2210 if (!KNOB_TOSS_FETCH)
2211 #endif
2212 {
2213 RDTSC_BEGIN(FEVertexShader, pDC->drawId);
2214 state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext);
2215 RDTSC_END(FEVertexShader, 0);
2216
2217 UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
2218 AR_EVENT(VSStats((HANDLE)&vsContext.stats));
2219 }
2220 }
2221
            // 2. Assemble primitives from the last two SIMDs of shaded vertices.
2223 do
2224 {
2225 simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // pa.Assemble returns false if there are not enough vertices to assemble.
2227 RDTSC_BEGIN(FEPAAssemble, pDC->drawId);
2228 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
2229 RDTSC_END(FEPAAssemble, 1);
2230
2231 #if KNOB_ENABLE_TOSS_POINTS
2232 if (!KNOB_TOSS_FETCH)
2233 #endif
2234 {
2235 #if KNOB_ENABLE_TOSS_POINTS
2236 if (!KNOB_TOSS_VS)
2237 #endif
2238 {
2239 if (assemble)
2240 {
2241 UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
2242
2243 if (HasTessellationT::value)
2244 {
2245 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
2246 pDC,
2247 workerId,
2248 pa,
2249 &gsBuffers,
2250 pSoPrimData,
2251 pa.GetPrimID(work.startPrimID));
2252 }
2253 else if (HasGeometryShaderT::value)
2254 {
2255 GeometryShaderStage<HasStreamOutT, HasRastT>(
2256 pDC,
2257 workerId,
2258 pa,
2259 &gsBuffers,
2260 pSoPrimData,
2261 pa.GetPrimID(work.startPrimID));
2262 }
2263 else
2264 {
2265 // If streamout is enabled then stream vertices out to memory.
2266 if (HasStreamOutT::value)
2267 {
2268 StreamOut(pDC, pa, workerId, pSoPrimData, 0);
2269 }
2270
2271 if (HasRastT::value)
2272 {
2273 SWR_ASSERT(pDC->pState->pfnProcessPrims);
2274
                            // Gather system-generated values (SGV) if provided.
2276 simdscalari vViewportIdx = SIMD::setzero_si();
2277 simdscalari vRtIdx = SIMD::setzero_si();
2278 SIMD::Vec4 svgAttrib[4];
2279
2280 if (state.backendState.readViewportArrayIndex ||
2281 state.backendState.readRenderTargetArrayIndex)
2282 {
2283 pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
2284 }
2285
2286 if (state.backendState.readViewportArrayIndex)
2287 {
2288 vViewportIdx =
2289 SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
2290
2291 // OOB VPAI indices => forced to zero.
2292 vViewportIdx =
2293 SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
2294 simdscalari vNumViewports =
2295 SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
2296 simdscalari vClearMask =
2297 SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
2298 vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
2299 pa.viewportArrayActive = true;
2300 }
2301 if (state.backendState.readRenderTargetArrayIndex)
2302 {
2303 vRtIdx =
2304 SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
2305 pa.rtArrayActive = true;
2306 }
2307
2308 pDC->pState->pfnProcessPrims(pDC,
2309 pa,
2310 workerId,
2311 prim,
2312 GenMask(pa.NumPrims()),
2313 pa.GetPrimID(work.startPrimID),
2314 vViewportIdx,
2315 vRtIdx);
2316 }
2317 }
2318 }
2319 }
2320 }
2321 } while (pa.NextPrim());
2322
2323 if (IsIndexedT::value)
2324 {
2325 fetchInfo.pIndices =
2326 (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
2327 }
2328 else
2329 {
2330 vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
2331 }
2332
2333 i += KNOB_SIMD_WIDTH;
2334 }
2335 pa.Reset();
2336 }
2337
2338 #endif
2339
2340 RDTSC_END(FEProcessDraw, numPrims * work.numInstances);
2341 }
2342
2343 struct FEDrawChooser
2344 {
2345 typedef PFN_FE_WORK_FUNC FuncType;
2346
2347 template <typename... ArgsB>
2348 static FuncType GetFunc()
2349 {
2350 return ProcessDraw<ArgsB...>;
2351 }
2352 };
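// TemplateArgUnroller expands the runtime bools passed to GetProcessDrawFunc into
// compile-time boolean types (effectively std::true_type / std::false_type),
// instantiating the one ProcessDraw specialization that matches the draw state.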
2353
2354 // Selector for correct templated Draw front-end function
2355 PFN_FE_WORK_FUNC GetProcessDrawFunc(bool IsIndexed,
2356 bool IsCutIndexEnabled,
2357 bool HasTessellation,
2358 bool HasGeometryShader,
2359 bool HasStreamOut,
2360 bool HasRasterization)
2361 {
2362 return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed,
2363 IsCutIndexEnabled,
2364 HasTessellation,
2365 HasGeometryShader,
2366 HasStreamOut,
2367 HasRasterization);
2368 }
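
// Illustrative use only (a sketch, not part of the driver: pContext, pDC,
// worker, and drawWork stand in for the caller's state and are not defined
// here). Select and invoke the FE handler for an indexed draw with
// rasterization and no optional stages enabled:
//
//     PFN_FE_WORK_FUNC pfnFE = GetProcessDrawFunc(true,   // IsIndexed
//                                                 false,  // IsCutIndexEnabled
//                                                 false,  // HasTessellation
//                                                 false,  // HasGeometryShader
//                                                 false,  // HasStreamOut
//                                                 true);  // HasRasterization
//     pfnFE(pContext, pDC, worker, &drawWork);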