/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file frontend.cpp
 *
 * @brief Implementation for Frontend which handles vertex processing,
 *        primitive assembly, clipping, binning, etc.
 *
 ******************************************************************************/

#include "api.h"
#include "frontend.h"
#include "backend.h"
#include "context.h"
#include "rdtsc_core.h"
#include "utils.h"
#include "threads.h"
#include "pa.h"
#include "clip.h"
#include "tilemgr.h"
#include "tessellator.h"
#include <limits>
#include <iostream>

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrSync.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to sync callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessSync(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
    BE_WORK work;
    work.type    = SYNC;
    work.pfnWork = ProcessSyncBE;

    MacroTileMgr* pTileMgr = pDC->pTileMgr;
    pTileMgr->enqueue(0, 0, &work);
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDestroyContext.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to sync callback.
void ProcessShutdown(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
    BE_WORK work;
    work.type    = SHUTDOWN;
    work.pfnWork = ProcessShutdownBE;

    MacroTileMgr* pTileMgr = pDC->pTileMgr;
    // Enqueue at least 1 work item for each worker thread
    // account for number of numa nodes
    uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;

    for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
    {
        for (uint32_t n = 0; n < numNumaNodes; ++n)
        {
            pTileMgr->enqueue(i, n, &work);
        }
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrClearRenderTarget.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to clear callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessClear(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
    CLEAR_DESC*   pDesc    = (CLEAR_DESC*)pUserData;
    MacroTileMgr* pTileMgr = pDC->pTileMgr;

    // queue a clear to each macro tile
    // compute macro tile bounds for the specified rect
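    // (rect.xmax/ymax are exclusive bounds, so subtract 1 to get the inclusive
    // index of the last macro tile touched by the rect)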
    uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;

    BE_WORK work;
    work.type       = CLEAR;
    work.pfnWork    = ProcessClearBE;
    work.desc.clear = *pDesc;

    for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
    {
        for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrStoreTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessStoreTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
    RDTSC_BEGIN(pContext->pBucketMgr, FEProcessStoreTiles, pDC->drawId);
    MacroTileMgr*     pTileMgr = pDC->pTileMgr;
    STORE_TILES_DESC* pDesc    = (STORE_TILES_DESC*)pUserData;

    // queue a store to each macro tile
    // compute macro tile bounds for the specified rect
    uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;

    // store tiles
    BE_WORK work;
    work.type            = STORETILES;
    work.pfnWork         = ProcessStoreTilesBE;
    work.desc.storeTiles = *pDesc;

    for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
    {
        for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_END(pContext->pBucketMgr, FEProcessStoreTiles, 0);
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrInvalidateTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(SWR_CONTEXT*  pContext,
                                   DRAW_CONTEXT* pDC,
                                   uint32_t      workerId,
                                   void*         pUserData)
{
    RDTSC_BEGIN(pContext->pBucketMgr, FEProcessInvalidateTiles, pDC->drawId);
    DISCARD_INVALIDATE_TILES_DESC* pDesc    = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr*                  pTileMgr = pDC->pTileMgr;

    // compute macro tile bounds for the specified rect
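    // full-tiles-only path: round the min edges up and the max edges down so
    // only macro tiles completely covered by the rect are enqueued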
    uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
    uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;

    if (pDesc->fullTilesOnly == false)
    {
        // include partial tiles
        macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
        macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
        macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
        macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    }

    SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);

    macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
    macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);

    // load tiles
    BE_WORK work;
    work.type                        = DISCARDINVALIDATETILES;
    work.pfnWork                     = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pDesc;

    for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    {
        for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_END(pContext->pBucketMgr, FEProcessInvalidateTiles, 0);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of primitives given the number of verts.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of vertices or indices for draw.
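///        e.g., TOP_TRIANGLE_STRIP with 5 vertices yields 3 primitives.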
/// @todo Frontend needs to be refactored. This will go in appropriate place then.
uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
{
    switch (mode)
    {
    case TOP_POINT_LIST:
        return numPrims;
    case TOP_TRIANGLE_LIST:
        return numPrims / 3;
    case TOP_TRIANGLE_STRIP:
        return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_FAN:
        return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_DISC:
        return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_QUAD_LIST:
        return numPrims / 4;
    case TOP_QUAD_STRIP:
        return numPrims < 4 ? 0 : (numPrims - 2) / 2;
    case TOP_LINE_STRIP:
        return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_LINE_LIST:
        return numPrims / 2;
    case TOP_LINE_LOOP:
        return numPrims;
    case TOP_RECT_LIST:
        return numPrims / 3;
    case TOP_LINE_LIST_ADJ:
        return numPrims / 4;
    case TOP_LISTSTRIP_ADJ:
        return numPrims < 3 ? 0 : numPrims - 3;
    case TOP_TRI_LIST_ADJ:
        return numPrims / 6;
    case TOP_TRI_STRIP_ADJ:
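        // strip adjacency consumes verts in pairs: e.g., 10 verts -> (10 / 2) - 2 = 3 prims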
        return numPrims < 4 ? 0 : (numPrims / 2) - 2;

    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims / (mode - TOP_PATCHLIST_BASE);

    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_INVALID("Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of verts given the number of primitives.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of primitives for draw.
uint32_t GetNumVerts(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
{
    switch (mode)
    {
    case TOP_POINT_LIST:
        return numPrims;
    case TOP_TRIANGLE_LIST:
        return numPrims * 3;
    case TOP_TRIANGLE_STRIP:
        return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_FAN:
        return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_DISC:
        return numPrims ? numPrims + 1 : 0;
    case TOP_QUAD_LIST:
        return numPrims * 4;
    case TOP_QUAD_STRIP:
        return numPrims ? numPrims * 2 + 2 : 0;
    case TOP_LINE_STRIP:
        return numPrims ? numPrims + 1 : 0;
    case TOP_LINE_LIST:
        return numPrims * 2;
    case TOP_LINE_LOOP:
        return numPrims;
    case TOP_RECT_LIST:
        return numPrims * 3;
    case TOP_LINE_LIST_ADJ:
        return numPrims * 4;
    case TOP_LISTSTRIP_ADJ:
        return numPrims ? numPrims + 3 : 0;
    case TOP_TRI_LIST_ADJ:
        return numPrims * 6;
    case TOP_TRI_STRIP_ADJ:
        return numPrims ? (numPrims + 2) * 2 : 0;

    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims * (mode - TOP_PATCHLIST_BASE);

    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_INVALID("Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Return number of verts per primitive.
/// @param topology - topology
/// @param includeAdjVerts - include adjacent verts in primitive vertices
uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
{
    uint32_t numVerts = 0;
    switch (topology)
    {
    case TOP_POINT_LIST:
    case TOP_POINT_LIST_BF:
        numVerts = 1;
        break;
    case TOP_LINE_LIST:
    case TOP_LINE_STRIP:
    case TOP_LINE_LIST_ADJ:
    case TOP_LINE_LOOP:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LISTSTRIP_ADJ:
        numVerts = 2;
        break;
    case TOP_TRIANGLE_LIST:
    case TOP_TRIANGLE_STRIP:
    case TOP_TRIANGLE_FAN:
    case TOP_TRI_LIST_ADJ:
    case TOP_TRI_STRIP_ADJ:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_RECT_LIST:
        numVerts = 3;
        break;
    case TOP_QUAD_LIST:
    case TOP_QUAD_STRIP:
        numVerts = 4;
        break;
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        numVerts = topology - TOP_PATCHLIST_BASE;
        break;
    default:
        SWR_INVALID("Unsupported topology: %d", topology);
        break;
    }

    if (includeAdjVerts)
    {
        switch (topology)
        {
        case TOP_LISTSTRIP_ADJ:
        case TOP_LINE_LIST_ADJ:
            numVerts = 4;
            break;
        case TOP_TRI_STRIP_ADJ:
        case TOP_TRI_LIST_ADJ:
            numVerts = 6;
            break;
        default:
            break;
        }
    }

    return numVerts;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Generate mask from remaining work.
/// @param numItemsRemaining - Number of items being worked on by a SIMD.
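///        e.g., with 3 items remaining and SIMD8, the lane mask is 0b00000111.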
static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
{
    uint32_t numActive =
        (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
    return _simd_castps_si(_simd_vmask_ps(mask));
}

static INLINE simd16scalari GenerateMask16(uint32_t numItemsRemaining)
{
    uint32_t numActive =
        (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining;
    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
    return _simd16_castps_si(_simd16_vmask_ps(mask));
}

//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMD's worth of triangles.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
static void StreamOut(
    DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t workerId, uint32_t* pPrimData, uint32_t streamIndex)
{
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEStreamout, pDC->drawId);

    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    const API_STATE&           state   = GetApiState(pDC);
    const SWR_STREAMOUT_STATE& soState = state.soState;

    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each
    // vertex.
    uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);
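    // (e.g., with SWR_VTX_NUM_SLOTS == 32 this is 32 slots * 4 floats = 128 dwords per vertex)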

    SWR_STREAMOUT_CONTEXT soContext = {0};

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    uint32_t numPrims = pa.NumPrims();

    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD    slot   = 0;
        uint64_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        while (_BitScanForward64(&slot, soMask))
        {
            simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t    paSlot = slot + soState.vertexAttribOffset[streamIndex];
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is relative offset from start of vertex.
            // Note that attributes start at slot 1 in the PA buffer. We need to write this
            // to prim data starting at slot 0. Which is why we do (slot - 1).
            // Also note: GL works slightly differently, and needs slot 0
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib =
                    pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }

            soMask &= ~(uint64_t(1) << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr,
                   "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](GetPrivateState(pDC), pWorkerData, soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            bool  nullTileAccessed = false;
            void* pWriteOffset     = pDC->pContext->pfnTranslateGfxptrForWrite(
                GetPrivateState(pDC), soContext.pBuffer[i]->pWriteOffset, &nullTileAccessed, pWorkerData);
            *((uint32_t*)pWriteOffset) = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            pDC->dynState.SoWriteOffset[i]      = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    RDTSC_END(pDC->pContext->pBucketMgr, FEStreamout, 1);
}

#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// Is value an even number (a multiple of two)
///
template <typename T>
INLINE static bool IsEven(T value)
{
    return (value & 1) == 0;
}

//////////////////////////////////////////////////////////////////////////
/// Round up value to an even number (a multiple of two)
///
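/// e.g., RoundUpEven(5) == 6, RoundUpEven(6) == 6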
template <typename T>
INLINE static T RoundUpEven(T value)
{
    return (value + 1) & ~1;
}

//////////////////////////////////////////////////////////////////////////
/// Round down value to an even number (a multiple of two)
///
template <typename T>
INLINE static T RoundDownEven(T value)
{
    return value & ~1;
}

//////////////////////////////////////////////////////////////////////////
/// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
///
/// vertexCount is in terms of the source simdvertexes and must be even
///
/// attribCount will limit the vector copies to those attribs specified
///
/// note: the stride between vertexes is determined by SWR_VTX_NUM_SLOTS
///
void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex*     vertex_simd16,
                                           const simdvertex* vertex,
                                           uint32_t          vertexCount,
                                           uint32_t          attribCount)
{
    SWR_ASSERT(vertex);
    SWR_ASSERT(vertex_simd16);
    SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);

    simd16vertex temp;

    for (uint32_t i = 0; i < vertexCount; i += 2)
    {
        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            for (uint32_t k = 0; k < 4; k += 1)
            {
                temp.attrib[j][k] =
                    _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);

                if ((i + 1) < vertexCount)
                {
                    temp.attrib[j][k] =
                        _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
                }
            }
        }

        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
        }
    }
}

#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Computes number of invocations. The current index represents
///        the start of the SIMD. The max index represents how many work
///        items remain. If there is less than a SIMD's worth of work,
///        return the remaining amount of work.
/// @param curIndex - The start index for the SIMD.
/// @param maxIndex - The last index for all work items.
static INLINE uint32_t GetNumInvocations(uint32_t curIndex, uint32_t maxIndex)
{
    uint32_t remainder = (maxIndex - curIndex);
#if USE_SIMD16_FRONTEND
    return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
#else
    return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
#endif
}

//////////////////////////////////////////////////////////////////////////
/// @brief Converts a streamId buffer to a cut buffer for the given stream id.
///        The geometry shader will loop over each active streamout buffer, assembling
///        primitives for the downstream stages. When multistream output is enabled,
///        the generated stream ID buffer from the GS needs to be converted to a cut
///        buffer for the primitive assembler.
/// @param stream - stream id to generate the cut buffer for
/// @param pStreamIdBase - pointer to the stream ID buffer
/// @param numEmittedVerts - Number of total verts emitted by the GS
/// @param pCutBuffer - output buffer to write cuts to
void ProcessStreamIdBuffer(uint32_t stream,
                           uint8_t* pStreamIdBase,
                           uint32_t numEmittedVerts,
                           uint8_t* pCutBuffer)
{
    SWR_ASSERT(stream < MAX_SO_STREAMS);

    uint32_t numInputBytes  = (numEmittedVerts * 2 + 7) / 8;
    uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
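    // Each emitted vertex carries a 2-bit stream ID, so one input byte holds 4
    // vertices and one output cut byte covers 8 vertices (2 input bytes). A set
    // bit marks a vertex that does NOT belong to the requested stream, which
    // the primitive assembler treats as a cut.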

    for (uint32_t b = 0; b < numOutputBytes; ++b)
    {
        uint8_t curInputByte = pStreamIdBase[2 * b];
        uint8_t outByte      = 0;
        for (uint32_t i = 0; i < 4; ++i)
        {
            if ((curInputByte & 0x3) != stream)
            {
                outByte |= (1 << i);
            }
            curInputByte >>= 2;
        }

        curInputByte = pStreamIdBase[2 * b + 1];
        for (uint32_t i = 0; i < 4; ++i)
        {
            if ((curInputByte & 0x3) != stream)
            {
                outByte |= (1 << (i + 4));
            }
            curInputByte >>= 2;
        }

        *pCutBuffer++ = outByte;
    }
}

// Buffers that are allocated if GS is enabled
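// pGsIn: assembled input vertices for the GS. pGsOut: one output stream per
// SIMD lane, each holding every instance's emitted verts. pGsTransposed:
// emitted verts transposed to SIMD form for the primitive assembler.
// pStreamCutBuffer: scratch cut buffer used for multistream output.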
struct GsBuffers
{
    uint8_t* pGsIn;
    uint8_t* pGsOut[KNOB_SIMD_WIDTH];
    uint8_t* pGsTransposed;
    void*    pStreamCutBuffer;
};

//////////////////////////////////////////////////////////////////////////
/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive
///        assembler
/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
/// @param numVerts - Number of vertices output by the GS
/// @param numAttribs - Number of attributes per vertex
template <typename SIMD_T, uint32_t SimdWidth>
void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
{
    uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
    uint32_t dstVertexStride = numAttribs * sizeof(Float<SIMD_T>) * 4;
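    // Each source vertex is a tightly packed run of numAttribs float4s. The
    // gather offsets below address the same attribute in SimdWidth consecutive
    // source vertices, so one masked gather fills a SIMD register per channel.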

    OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];

    for (uint32_t i = 0; i < SimdWidth; ++i)
    {
        gatherOffsets[i] = srcVertexStride * i;
    }
    auto vGatherOffsets = SIMD_T::load_si((Integer<SIMD_T>*)&gatherOffsets[0]);

    uint32_t numSimd        = AlignUp(numVerts, SimdWidth) / SimdWidth;
    uint32_t remainingVerts = numVerts;

    for (uint32_t s = 0; s < numSimd; ++s)
    {
        uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
        uint8_t* pDstBase = pDst + s * dstVertexStride;

        // Compute mask to prevent src overflow
        uint32_t mask = std::min(remainingVerts, SimdWidth);
        mask          = GenMask(mask);
        auto vMask    = SIMD_T::vmask_ps(mask);
        auto viMask   = SIMD_T::castps_si(vMask);

        for (uint32_t a = 0; a < numAttribs; ++a)
        {
            auto attribGatherX = SIMD_T::mask_i32gather_ps(
                SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
            auto attribGatherY = SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
                                                           (const float*)(pSrcBase + sizeof(float)),
                                                           vGatherOffsets,
                                                           vMask);
            auto attribGatherZ =
                SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
                                          (const float*)(pSrcBase + sizeof(float) * 2),
                                          vGatherOffsets,
                                          vMask);
            auto attribGatherW =
                SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
                                          (const float*)(pSrcBase + sizeof(float) * 3),
                                          vGatherOffsets,
                                          vMask);

            SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>)), viMask, attribGatherY);
            SIMD_T::maskstore_ps(
                (float*)(pDstBase + sizeof(Float<SIMD_T>) * 2), viMask, attribGatherZ);
            SIMD_T::maskstore_ps(
                (float*)(pDstBase + sizeof(Float<SIMD_T>) * 3), viMask, attribGatherW);

            pSrcBase += sizeof(float) * 4;
            pDstBase += sizeof(Float<SIMD_T>) * 4;
        }
        remainingVerts -= SimdWidth;
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
template <typename HasStreamOutT, typename HasRastT>
static void GeometryShaderStage(DRAW_CONTEXT* pDC,
                                uint32_t      workerId,
                                PA_STATE&     pa,
                                GsBuffers*    pGsBuffers,
                                uint32_t*     pSoPrimData,
#if USE_SIMD16_FRONTEND
                                uint32_t      numPrims_simd8,
#endif
                                simdscalari const& primID)
{
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEGeometryShader, pDC->drawId);

    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    const API_STATE&    state  = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;
    SWR_GS_CONTEXT      gsContext;

    static uint8_t sNullBuffer[128] = {0};

    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    {
        gsContext.pStreams[i] = pGsBuffers->pGsOut[i];
    }
    gsContext.pVerts      = (simdvector*)pGsBuffers->pGsIn;
    gsContext.PrimitiveID = primID;

    uint32_t   numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitive
    gsContext.inputVertStride = pState->inputVertStride;
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot;
        uint32_t attribSlot    = pState->vertexAttribOffset + slot;
        pa.Assemble(srcAttribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            gsContext.pVerts[attribSlot + pState->inputVertStride * i] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        gsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = attrib[i];
    }

    // record valid prims from the frontend to avoid over-binning the newly generated
    // prims from the GS
#if USE_SIMD16_FRONTEND
    uint32_t numInputPrims = numPrims_simd8;
#else
    uint32_t numInputPrims = pa.NumPrims();
#endif

    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        gsContext.InstanceID = instance;
        gsContext.mask       = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), pWorkerData, &gsContext);
        AR_EVENT(GSStats((HANDLE)&gsContext.stats));

        for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
        {
            gsContext.pStreams[i] += pState->allocationSize;
        }
    }

    // set up new binner and state for the GS output topology
#if USE_SIMD16_FRONTEND
    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_RECT_LIST:
            pfnClipFunc = ClipRectangles_simd16;
            break;
        case TOP_TRIANGLE_STRIP:
            pfnClipFunc = ClipTriangles_simd16;
            break;
        case TOP_LINE_STRIP:
            pfnClipFunc = ClipLines_simd16;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints_simd16;
            break;
        default:
            SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

#else
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_RECT_LIST:
            pfnClipFunc = ClipRectangles;
            break;
        case TOP_TRIANGLE_STRIP:
            pfnClipFunc = ClipTriangles;
            break;
        case TOP_LINE_STRIP:
            pfnClipFunc = ClipLines;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints;
            break;
        default:
            SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

#endif
    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim];

        // Vertex count is either emitted by shader or static
        uint32_t vertexCount = 0;
        if (pState->staticVertexCount)
        {
            vertexCount = pState->staticVertexCount;
        }
        else
        {
            // If emitted in shader, it should be stored in the first dword of the output buffer
            vertexCount = *(uint32_t*)pInstanceBase;
        }

        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            uint32_t numEmittedVerts = vertexCount;
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;
            uint8_t* pCutBase =
                pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;
            uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;

#if USE_SIMD16_FRONTEND
            TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
                                                          pVertexBaseAOS,
                                                          vertexCount,
                                                          pState->outputVertexSize);
#else
            TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
                                                        pVertexBaseAOS,
                                                        vertexCount,
                                                        pState->outputVertexSize);
#endif

            uint32_t numAttribs = state.feNumAttributes;

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool     processCutVerts = false;
                uint8_t* pCutBuffer      = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID        = pState->singleStreamID;
                    if (streamID != stream)
                        continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(
                        stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);
                    pCutBuffer      = (uint8_t*)pGsBuffers->pStreamCutBuffer;
                    processCutVerts = false;
                }

#if USE_SIMD16_FRONTEND
                PA_STATE_CUT gsPa(pDC,
                                  (uint8_t*)pGsBuffers->pGsTransposed,
                                  numEmittedVerts,
                                  pState->outputVertexSize,
                                  reinterpret_cast<simd16mask*>(pCutBuffer),
                                  numEmittedVerts,
                                  numAttribs,
                                  pState->outputTopology,
                                  processCutVerts,
                                  pa.numVertsPerPrim);

#else
                PA_STATE_CUT gsPa(pDC,
                                  (uint8_t*)pGsBuffers->pGsTransposed,
                                  numEmittedVerts,
                                  pState->outputVertexSize,
                                  pCutBuffer,
                                  numEmittedVerts,
                                  numAttribs,
                                  pState->outputTopology,
                                  processCutVerts,
                                  pa.numVertsPerPrim);

#endif
                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
#if USE_SIMD16_FRONTEND
                        simd16vector attrib_simd16[3];

                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16);

#else
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

#endif
                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
#if ENABLE_AVX512_SIMD16
                                gsPa.useAlternateOffset = false;
#endif
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
#if USE_SIMD16_FRONTEND
                                simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);

                                // Gather data from the SGV if provided.
                                simd16scalari vViewportIdx = SIMD16::setzero_si();
                                simd16scalari vRtIdx       = SIMD16::setzero_si();
                                SIMD16::Vec4  svgAttrib[4];

                                if (state.backendState.readViewportArrayIndex ||
                                    state.backendState.readRenderTargetArrayIndex)
                                {
                                    gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                }

                                if (state.backendState.readViewportArrayIndex)
                                {
                                    vViewportIdx =
                                        SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                                    gsPa.viewportArrayActive = true;
                                }
                                if (state.backendState.readRenderTargetArrayIndex)
                                {
                                    vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                    gsPa.rtArrayActive = true;
                                }

                                {
                                    // OOB VPAI indices => forced to zero.
                                    vViewportIdx =
                                        SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
                                    simd16scalari vNumViewports =
                                        SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simd16scalari vClearMask =
                                        SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
                                    vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);

                                    gsPa.useAlternateOffset = false;
                                    pfnClipFunc(pDC,
                                                gsPa,
                                                workerId,
                                                attrib_simd16,
                                                GenMask(gsPa.NumPrims()),
                                                vPrimId,
                                                vViewportIdx,
                                                vRtIdx);
                                }
#else
                                simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);

                                // Gather data from the SGV if provided.
                                simdscalari vViewportIdx = SIMD::setzero_si();
                                simdscalari vRtIdx       = SIMD::setzero_si();
                                SIMD::Vec4  svgAttrib[4];

                                if (state.backendState.readViewportArrayIndex ||
                                    state.backendState.readRenderTargetArrayIndex)
                                {
                                    gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                }

                                if (state.backendState.readViewportArrayIndex)
                                {
                                    vViewportIdx =
                                        SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);

                                    // OOB VPAI indices => forced to zero.
                                    vViewportIdx =
                                        SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
                                    simdscalari vNumViewports =
                                        SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simdscalari vClearMask =
                                        SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
                                    vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
                                    gsPa.viewportArrayActive = true;
                                }
                                if (state.backendState.readRenderTargetArrayIndex)
                                {
                                    vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                    gsPa.rtArrayActive = true;
                                }

                                pfnClipFunc(pDC,
                                            gsPa,
                                            workerId,
                                            attrib,
                                            GenMask(gsPa.NumPrims()),
                                            vPrimId,
                                            vViewportIdx,
                                            vRtIdx);
#endif
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
    AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim * numInputPrims));
    RDTSC_END(pDC->pContext->pBucketMgr, FEGeometryShader, 1);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Allocate GS buffers
/// @param pDC - pointer to draw context.
/// @param state - API state
/// @param ppGsOut - pointer to GS output buffer allocation
/// @param ppCutBuffer - pointer to GS output cut buffer allocation
template <typename SIMD_T, uint32_t SIMD_WIDTH>
static INLINE void AllocateGsBuffers(DRAW_CONTEXT*    pDC,
                                     const API_STATE& state,
                                     uint32_t         vertsPerPrim,
                                     GsBuffers*       pGsBuffers)
{
    auto pArena = pDC->pArena;
    SWR_ASSERT(pArena != nullptr);
    SWR_ASSERT(state.gsState.gsEnable);

    const SWR_GS_STATE& gsState = state.gsState;

    // Allocate storage for vertex inputs
    uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim;
    pGsBuffers->pGsIn           = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);

    // Allocate arena space to hold GS output verts
    const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize;

    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    {
        pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32);
    }

    // Allocate storage for transposed GS output
    uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;
    uint32_t transposedBufferSize =
        numSimdBatches * gsState.outputVertexSize * sizeof(Vec4<SIMD_T>);
    pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);

    // Allocate storage to hold temporary stream->cut buffer, if necessary
    if (state.gsState.isSingleStream)
    {
        pGsBuffers->pStreamCutBuffer = nullptr;
    }
    else
    {
        pGsBuffers->pStreamCutBuffer =
            (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Contains all data generated by the HS and passed to the
///        tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;
    void*          pTxCtx;
    size_t         tsCtxSize;

    uint8_t* pHSOutput;
    size_t   hsOutputAllocSize;

    simdscalar* pDSOutput;
    size_t      dsOutputAllocSize;
};

THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;

//////////////////////////////////////////////////////////////////////////
/// @brief Allocate tessellation data for this worker thread.
INLINE
static void AllocateTessellationData(SWR_CONTEXT* pContext)
{
    /// @TODO - Don't use thread local storage. Use Worker local storage instead.
    if (gt_pTessellationThreadData == nullptr)
    {
        gt_pTessellationThreadData =
            (TessellationThreadLocalData*)AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
        memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Implements Tessellation Stages.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
template <typename HasGeometryShaderT, typename HasStreamOutT, typename HasRastT>
static void TessellationStages(DRAW_CONTEXT* pDC,
                               uint32_t      workerId,
                               PA_STATE&     pa,
                               GsBuffers*    pGsBuffers,
                               uint32_t*     pSoPrimData,
#if USE_SIMD16_FRONTEND
                               uint32_t      numPrims_simd8,
#endif
                               simdscalari const& primID)
{
    const API_STATE&    state   = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;
    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    SWR_ASSERT(gt_pTessellationThreadData);

    HANDLE tsCtx = TSInitCtx(tsState.domain,
                             tsState.partitioning,
                             tsState.tsOutputTopology,
                             gt_pTessellationThreadData->pTxCtx,
                             gt_pTessellationThreadData->tsCtxSize);
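    // TSInitCtx returns nullptr when the provided buffer is too small and
    // updates tsCtxSize with the required size, so grow the buffer and retry.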
    if (tsCtx == nullptr)
    {
        gt_pTessellationThreadData->pTxCtx =
            AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(tsState.domain,
                          tsState.partitioning,
                          tsState.tsOutputTopology,
                          gt_pTessellationThreadData->pTxCtx,
                          gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

#if USE_SIMD16_FRONTEND
    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST:
            pfnClipFunc = ClipTriangles_simd16;
            break;
        case TOP_LINE_LIST:
            pfnClipFunc = ClipLines_simd16;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints_simd16;
            break;
        default:
            SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

#else
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST:
            pfnClipFunc = ClipTriangles;
            break;
        case TOP_LINE_LIST:
            pfnClipFunc = ClipLines;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints;
            break;
        default:
            SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

#endif
    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.PrimitiveID     = primID;
    hsContext.outputSize      = tsState.hsAllocationSize;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = tsState.srcVertexAttribOffset + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[tsState.vertexAttribOffset + slot] = simdattrib[i];
        }
    }

    // Allocate HS output storage
    uint32_t requiredAllocSize = KNOB_SIMD_WIDTH * tsState.hsAllocationSize;

    if (requiredAllocSize > gt_pTessellationThreadData->hsOutputAllocSize)
    {
        AlignedFree(gt_pTessellationThreadData->pHSOutput);
        gt_pTessellationThreadData->pHSOutput = (uint8_t*)AlignedMalloc(requiredAllocSize, 64);
        gt_pTessellationThreadData->hsOutputAllocSize = requiredAllocSize;
    }

    hsContext.pCPout = (ScalarPatch*)gt_pTessellationThreadData->pHSOutput;

#if defined(_DEBUG)
    //memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

#if USE_SIMD16_FRONTEND
    uint32_t numPrims = numPrims_simd8;
#else
    uint32_t numPrims = pa.NumPrims();
#endif
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEHullShader, pDC->drawId);
    state.pfnHsFunc(GetPrivateState(pDC), pWorkerData, &hsContext);
    RDTSC_END(pDC->pContext->pBucketMgr, FEHullShader, 0);

    UPDATE_STAT_FE(HsInvocations, numPrims);
    AR_EVENT(HSStats((HANDLE)&hsContext.stats));

    const uint32_t* pPrimId = (const uint32_t*)&primID;

    for (uint32_t p = 0; p < numPrims; ++p)
    {
        ScalarPatch* pCPout =
            (ScalarPatch*)(gt_pTessellationThreadData->pHSOutput + tsState.hsAllocationSize * p);

        SWR_TESSELLATION_FACTORS tessFactors;
        tessFactors = hsContext.pCPout[p].tessFactors;

        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = {0};
        RDTSC_BEGIN(pDC->pContext->pBucketMgr, FETessellation, pDC->drawId);
        TSTessellate(tsCtx, tessFactors, tsData);
        AR_EVENT(TessPrimCount(1));
        RDTSC_END(pDC->pContext->pBucketMgr, FETessellation, 0);

        if (tsData.NumPrimitives == 0)
        {
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory
        uint32_t requiredDSVectorInvocations =
            AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
#if USE_SIMD16_FRONTEND
        size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) *
                                   tsState.dsAllocationSize; // simd8 -> simd16, padding
#else
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize;
        size_t requiredAllocSize       = sizeof(simdvector) * requiredDSOutputVectors;
#endif
        if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize)
        {
            AlignedFree(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput =
                (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= requiredAllocSize);

#if defined(_DEBUG)
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID           = pPrimId[p];
        dsContext.pCpIn                 = pCPout;
        dsContext.pDomainU              = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV              = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData           = gt_pTessellationThreadData->pDSOutput;
        dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset;
#if USE_SIMD16_FRONTEND
        dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16
#else
        dsContext.vectorStride = requiredDSVectorInvocations;
#endif

        uint32_t dsInvocations = 0;

        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations;
             ++dsContext.vectorOffset)
        {
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEDomainShader, pDC->drawId);
            state.pfnDsFunc(GetPrivateState(pDC), pWorkerData, &dsContext);
            RDTSC_END(pDC->pContext->pBucketMgr, FEDomainShader, 0);

            AR_EVENT(DSStats((HANDLE)&dsContext.stats));

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);

#if USE_SIMD16_FRONTEND
        SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16

#endif
        PA_TESS tessPa(
            pDC,
#if USE_SIMD16_FRONTEND
            reinterpret_cast<const simd16scalar*>(dsContext.pOutputData), // simd8 -> simd16
            dsContext.vectorStride / 2,                                   // simd8 -> simd16
#else
            dsContext.pOutputData,
            dsContext.vectorStride,
#endif
            SWR_VTX_NUM_SLOTS,
            tsState.numDsOutputAttribs + tsState.dsOutVtxAttribOffset,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology,
            NumVertsPerPrim(tsState.postDSTopology, false));

        while (tessPa.HasWork())
        {
#if USE_SIMD16_FRONTEND
            const uint32_t numPrims    = tessPa.NumPrims();
            const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
            const uint32_t numPrims_hi =
                std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;

            const simd16scalari primID    = _simd16_set1_epi32(dsContext.PrimitiveID);
            const simdscalari   primID_lo = _simd16_extract_si(primID, 0);
            const simdscalari   primID_hi = _simd16_extract_si(primID, 1);

#endif
            if (HasGeometryShaderT::value)
            {
#if USE_SIMD16_FRONTEND
                tessPa.useAlternateOffset = false;
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);

                if (numPrims_hi)
                {
                    tessPa.useAlternateOffset = true;
                    GeometryShaderStage<HasStreamOutT, HasRastT>(
                        pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);
                }
#else
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC,
                    workerId,
                    tessPa,
                    pGsBuffers,
                    pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
#endif
            }
            else
            {
                if (HasStreamOutT::value)
                {
#if ENABLE_AVX512_SIMD16
                    tessPa.useAlternateOffset = false;
#endif
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
#if USE_SIMD16_FRONTEND
                    simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points
#else
                    simdvector prim[3]; // Only deal with triangles, lines, or points
#endif
                    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEPAAssemble, pDC->drawId);
                    bool assemble =
#if USE_SIMD16_FRONTEND
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
#else
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
#endif
                    RDTSC_END(pDC->pContext->pBucketMgr, FEPAAssemble, 1);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
#if USE_SIMD16_FRONTEND
                    // Gather data from the SGV if provided.
                    simd16scalari vViewportIdx = SIMD16::setzero_si();
                    simd16scalari vRtIdx       = SIMD16::setzero_si();
                    SIMD16::Vec4  svgAttrib[4];

                    if (state.backendState.readViewportArrayIndex ||
                        state.backendState.readRenderTargetArrayIndex)
                    {
                        tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                    }

                    if (state.backendState.readViewportArrayIndex)
                    {
                        vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                        tessPa.viewportArrayActive = true;
                    }
                    if (state.backendState.readRenderTargetArrayIndex)
                    {
                        vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                        tessPa.rtArrayActive = true;
                    }

                    {
                        // OOB VPAI indices => forced to zero.
                        vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
                        simd16scalari vNumViewports =
                            SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                        simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
                        vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);

                        tessPa.useAlternateOffset = false;
                        pfnClipFunc(pDC,
                                    tessPa,
                                    workerId,
                                    prim_simd16,
                                    GenMask(numPrims),
                                    primID,
                                    vViewportIdx,
                                    vRtIdx);
                    }
#else
                    // Gather data from the SGV if provided.
                    simdscalari vViewportIdx = SIMD::setzero_si();
                    simdscalari vRtIdx       = SIMD::setzero_si();
                    SIMD::Vec4  svgAttrib[4];

                    if (state.backendState.readViewportArrayIndex ||
                        state.backendState.readRenderTargetArrayIndex)
                    {
                        tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                    }

                    if (state.backendState.readViewportArrayIndex)
                    {
                        vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);

                        // OOB VPAI indices => forced to zero.
                        vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
                        simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                        simdscalari vClearMask    = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
                        vViewportIdx              = SIMD::and_si(vClearMask, vViewportIdx);
                        tessPa.viewportArrayActive = true;
                    }
                    if (state.backendState.readRenderTargetArrayIndex)
                    {
                        vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                        tessPa.rtArrayActive = true;
                    }
                    pfnClipFunc(pDC,
                                tessPa,
                                workerId,
                                prim,
                                GenMask(tessPa.NumPrims()),
                                _simd_set1_epi32(dsContext.PrimitiveID),
                                vViewportIdx,
                                vRtIdx);
#endif
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    }     // for (uint32_t p = 0; p < numPrims; ++p)

#if USE_SIMD16_FRONTEND
    if (gt_pTessellationThreadData->pDSOutput != nullptr)
    {
        AlignedFree(gt_pTessellationThreadData->pDSOutput);
        gt_pTessellationThreadData->pDSOutput = nullptr;
    }
    gt_pTessellationThreadData->dsOutputAllocSize = 0;

#endif
    TSDestroyCtx(tsCtx);
}

1649 THREAD PA_STATE::SIMDVERTEX* gpVertexStore = nullptr;
1650 THREAD uint32_t gVertexStoreSize = 0;
1651
1652 //////////////////////////////////////////////////////////////////////////
1653 /// @brief FE handler for SwrDraw.
1654 /// @tparam IsIndexedT - Is indexed drawing enabled
1655 /// @tparam HasTessellationT - Is tessellation enabled
1656 /// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
1657 /// @tparam HasStreamOutT - Is stream-out enabled
1658 /// @tparam HasRastT - Is rasterization enabled
1659 /// @param pContext - pointer to SWR context.
1660 /// @param pDC - pointer to draw context.
1661 /// @param workerId - thread's worker id.
1662 /// @param pUserData - Pointer to DRAW_WORK
1663 template <typename IsIndexedT,
1664 typename IsCutIndexEnabledT,
1665 typename HasTessellationT,
1666 typename HasGeometryShaderT,
1667 typename HasStreamOutT,
1668 typename HasRastT>
1669 void ProcessDraw(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
1670 {
1671 #if KNOB_ENABLE_TOSS_POINTS
1672 if (KNOB_TOSS_QUEUE_FE)
1673 {
1674 return;
1675 }
1676 #endif
1677
1678 RDTSC_BEGIN(pContext->pBucketMgr, FEProcessDraw, pDC->drawId);
1679
1680 void* pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
1681
1682 DRAW_WORK& work = *(DRAW_WORK*)pUserData;
1683 const API_STATE& state = GetApiState(pDC);
1684
1685 uint32_t indexSize = 0;
1686 uint32_t endVertex = work.numVerts;
1687
1688 gfxptr_t xpLastRequestedIndex = 0;
1689 if (IsIndexedT::value)
1690 {
1691 switch (work.type)
1692 {
1693 case R32_UINT:
1694 indexSize = sizeof(uint32_t);
1695 break;
1696 case R16_UINT:
1697 indexSize = sizeof(uint16_t);
1698 break;
1699 case R8_UINT:
1700 indexSize = sizeof(uint8_t);
1701 break;
1702 default:
1703 SWR_INVALID("Invalid work.type: %d", work.type);
1704 }
1705 xpLastRequestedIndex = work.xpIB + endVertex * indexSize;
1706 }
1707 else
1708 {
1709 // No cuts, prune partial primitives.
1710 endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
1711 }
1712
1713 #if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
1714 uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
1715 #endif
1716
1717 GsBuffers gsBuffers;
1718 if (HasGeometryShaderT::value)
1719 {
1720 #if USE_SIMD16_FRONTEND
1721 AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(
1722 pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
1723 #else
1724 AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(
1725 pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
1726 #endif
1727 }
1728
1729 if (HasTessellationT::value)
1730 {
1731 SWR_ASSERT(state.tsState.tsEnable == true);
1732 SWR_ASSERT(state.pfnHsFunc != nullptr);
1733 SWR_ASSERT(state.pfnDsFunc != nullptr);
1734
1735 AllocateTessellationData(pContext);
1736 }
1737 else
1738 {
1739 SWR_ASSERT(state.tsState.tsEnable == false);
1740 SWR_ASSERT(state.pfnHsFunc == nullptr);
1741 SWR_ASSERT(state.pfnDsFunc == nullptr);
1742 }
1743
1744 // allocate space for streamout input prim data
1745 uint32_t* pSoPrimData = nullptr;
1746 if (HasStreamOutT::value)
1747 {
1748 pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
1749 }
1750
1751 const uint32_t vertexCount = NumVertsPerPrim(state.topology, true);
1752 #if USE_SIMD16_FRONTEND
1753 uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector);
1754 #else
1755 uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector);
1756 #endif
1757
1758 SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);
1759
1760 // Compute storage requirements for vertex store
1761 // TODO: allocation needs to be rethought for better cut support
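// The PA state machine keeps extra SIMD-wide slots so that primitives which
// straddle a SIMD boundary can still reference vertices from the previous SIMD.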
1762 uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine
1763 uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes;
1764
1765 // grow the vertex store for the PA as necessary
1766 if (gVertexStoreSize < vertexStoreSize)
1767 {
1768 if (gpVertexStore != nullptr)
1769 {
1770 AlignedFree(gpVertexStore);
1771 gpVertexStore = nullptr;
1772 }
1773
1774 SWR_ASSERT(gpVertexStore == nullptr);
1775
1776 gpVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX*>(AlignedMalloc(vertexStoreSize, 64));
1777 gVertexStoreSize = vertexStoreSize;
1778
1779 SWR_ASSERT(gpVertexStore != nullptr);
1780 }
1781
1782 // choose primitive assembler
1783
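    // The factory returns an optimized fixed-topology PA when the topology and
    // index settings allow it, and the slower cut-aware PA otherwise.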
1784 PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC,
1785 state.topology,
1786 work.numVerts,
1787 gpVertexStore,
1788 numVerts,
1789 state.frontendState.vsVertexSize,
1790 GetNumVerts(state.topology, 1));
1791 PA_STATE& pa = paFactory.GetPA();
1792
1793 #if USE_SIMD16_FRONTEND
1794 #if USE_SIMD16_SHADERS
1795 simd16vertex vin;
1796 #else
1797 simdvertex vin_lo;
1798 simdvertex vin_hi;
1799 #endif
1800 SWR_VS_CONTEXT vsContext_lo;
1801 SWR_VS_CONTEXT vsContext_hi;
1802
1803 #if USE_SIMD16_SHADERS
1804 vsContext_lo.pVin = reinterpret_cast<simdvertex*>(&vin);
1805 vsContext_hi.pVin = reinterpret_cast<simdvertex*>(&vin);
1806 #else
1807 vsContext_lo.pVin = &vin_lo;
1808 vsContext_hi.pVin = &vin_hi;
1809 #endif
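    // The hi context consumes the upper half of the shared simd16 vertex data
    // via the alternate offset.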
1810 vsContext_lo.AlternateOffset = 0;
1811 vsContext_hi.AlternateOffset = 1;
1812
1813 SWR_FETCH_CONTEXT fetchInfo_lo = {0};
1814
1815 fetchInfo_lo.pStreams = &state.vertexBuffers[0];
1816 fetchInfo_lo.StartInstance = work.startInstance;
1817 fetchInfo_lo.StartVertex = 0;
1818
1819 if (IsIndexedT::value)
1820 {
1821 fetchInfo_lo.BaseVertex = work.baseVertex;
1822
        // If the entire index buffer isn't being consumed, set the last index
        // so that fetches of less than a full SIMD width are masked off.
1825 fetchInfo_lo.xpLastIndex = state.indexBuffer.xpIndices + state.indexBuffer.size;
1826 if (xpLastRequestedIndex < fetchInfo_lo.xpLastIndex)
1827 {
1828 fetchInfo_lo.xpLastIndex = xpLastRequestedIndex;
1829 }
1830 }
1831 else
1832 {
1833 fetchInfo_lo.StartVertex = work.startVertex;
1834 }
1835
1836 SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;
1837
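    // Per-lane offsets 0..15, used to synthesize SIMD16 vertex ids for
    // non-indexed draws.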
1838 const simd16scalari vScale =
1839 _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1840
1841 for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1842 {
1843 uint32_t i = 0;
1844
1845 simd16scalari vIndex;
1846
1847 if (IsIndexedT::value)
1848 {
1849 fetchInfo_lo.xpIndices = work.xpIB;
1850 fetchInfo_hi.xpIndices =
1851 fetchInfo_lo.xpIndices + KNOB_SIMD_WIDTH * indexSize; // 1/2 of KNOB_SIMD16_WIDTH
1852 }
1853 else
1854 {
1855 vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);
1856
1857 fetchInfo_lo.xpIndices = pDC->pContext->pfnMakeGfxPtr(GetPrivateState(pDC), &vIndex);
            fetchInfo_hi.xpIndices = pDC->pContext->pfnMakeGfxPtr(
                GetPrivateState(pDC),
                reinterpret_cast<uint8_t*>(&vIndex) +
                    KNOB_SIMD_WIDTH * sizeof(int32_t)); // byte offset to the upper 1/2 of KNOB_SIMD16_WIDTH
1861 }
1862
1863 fetchInfo_lo.CurInstance = instanceNum;
1864 fetchInfo_hi.CurInstance = instanceNum;
1865
1866 vsContext_lo.InstanceID = instanceNum;
1867 vsContext_hi.InstanceID = instanceNum;
1868
1869 while (pa.HasWork())
1870 {
            // GetNextVsOutput currently has the side effect of updating some PA state
            // machine state, so it must stay outside of the (i < endVertex) check.
1873
1874 simdmask* pvCutIndices_lo = nullptr;
1875 simdmask* pvCutIndices_hi = nullptr;
1876
1877 if (IsIndexedT::value)
1878 {
1879 // simd16mask <=> simdmask[2]
1880
1881 pvCutIndices_lo = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[0];
1882 pvCutIndices_hi = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[1];
1883 }
1884
1885 simd16vertex& vout = pa.GetNextVsOutput();
1886
1887 vsContext_lo.pVout = reinterpret_cast<simdvertex*>(&vout);
1888 vsContext_hi.pVout = reinterpret_cast<simdvertex*>(&vout);
1889
1890 if (i < endVertex)
1891 {
1892 if (!IsIndexedT::value)
1893 {
1894 fetchInfo_lo.xpLastIndex = fetchInfo_lo.xpIndices;
                    uint32_t offset = std::min(endVertex - i, (uint32_t)KNOB_SIMD16_WIDTH);
                    offset *= sizeof(int32_t); // convert from index count to byte offset
#if USE_SIMD16_SHADERS
                    fetchInfo_lo.xpLastIndex += offset;
#else
                    // Split the byte offset between the lo and hi halves; the bounds
                    // must be expressed in bytes as well, and the hi offset is zero
                    // when the lo half already covers all remaining vertices.
                    uint32_t offsetLo =
                        std::min(offset, uint32_t(KNOB_SIMD_WIDTH * sizeof(int32_t)));
                    uint32_t offsetHi = offset - offsetLo;
                    fetchInfo_lo.xpLastIndex += offsetLo;
                    fetchInfo_hi.xpLastIndex = fetchInfo_hi.xpIndices;
                    fetchInfo_hi.xpLastIndex += offsetHi;
1907 #endif
1908 }
                // 1. Execute the fetch shader (FS) and vertex shader (VS) for a single SIMD.
1910 RDTSC_BEGIN(pContext->pBucketMgr, FEFetchShader, pDC->drawId);
1911 #if USE_SIMD16_SHADERS
1912 state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin);
1913 #else
1914 state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin_lo);
1915
1916 if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
1917 {
1918 state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_hi, vin_hi);
1919 }
1920 #endif
1921 RDTSC_END(pContext->pBucketMgr, FEFetchShader, 0);
1922
                // Forward the fetch-generated vertex IDs to the vertex shader.
1924 #if USE_SIMD16_SHADERS
1925 #if USE_SIMD16_VS
1926 vsContext_lo.VertexID16 =
1927 _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0);
1928 vsContext_lo.VertexID16 =
1929 _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1);
1930 #else
1931 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
1932 vsContext_hi.VertexID = fetchInfo_lo.VertexID2;
1933 #endif
1934 #else
1935 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
1936 vsContext_hi.VertexID = fetchInfo_hi.VertexID;
1937 #endif
1938
1939 // Setup active mask for vertex shader.
1940 #if USE_SIMD16_VS
1941 vsContext_lo.mask16 = GenerateMask16(endVertex - i);
1942 #else
1943 vsContext_lo.mask = GenerateMask(endVertex - i);
1944 vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
1945 #endif
1946
1947 // forward cut mask to the PA
1948 if (IsIndexedT::value)
1949 {
1950 #if USE_SIMD16_SHADERS
1951 *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
1952 *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask2));
1953 #else
1954 *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
1955 *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
1956 #endif
1957 }
1958
1959 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
1960
1961 #if KNOB_ENABLE_TOSS_POINTS
1962 if (!KNOB_TOSS_FETCH)
1963 #endif
1964 {
1965 RDTSC_BEGIN(pContext->pBucketMgr, FEVertexShader, pDC->drawId);
1966 #if USE_SIMD16_VS
1967 state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
1968 AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));
1969 #else
1970 state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
1971 AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));
1972
1973 if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
1974 {
1975 state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_hi);
1976 AR_EVENT(VSStats((HANDLE)&vsContext_hi.stats));
1977 }
1978 #endif
1979 RDTSC_END(pContext->pBucketMgr, FEVertexShader, 0);
1980
1981 UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
1982 }
1983 }
1984
            // 2. Assemble primitives from the last two SIMDs of shaded vertices.
1986 do
1987 {
1988 simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];
1989
1990 RDTSC_START(pContext->pBucketMgr, FEPAAssemble);
1991 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
1992 RDTSC_STOP(pContext->pBucketMgr, FEPAAssemble, 1, 0);
1993
1994 #if KNOB_ENABLE_TOSS_POINTS
1995 if (!KNOB_TOSS_FETCH)
1996 #endif
1997 {
1998 #if KNOB_ENABLE_TOSS_POINTS
1999 if (!KNOB_TOSS_VS)
2000 #endif
2001 {
2002 if (assemble)
2003 {
2004 UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
2005
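                            // Split the simd16 primitive set into lo/hi simd8
                            // halves for the simd8 downstream stages.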
2006 const uint32_t numPrims = pa.NumPrims();
2007 const uint32_t numPrims_lo =
2008 std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
2009 const uint32_t numPrims_hi =
2010 std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
2011
2012 const simd16scalari primID = pa.GetPrimID(work.startPrimID);
2013 const simdscalari primID_lo = _simd16_extract_si(primID, 0);
2014 const simdscalari primID_hi = _simd16_extract_si(primID, 1);
2015
2016 if (HasTessellationT::value)
2017 {
2018 pa.useAlternateOffset = false;
2019 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
2020 pDC,
2021 workerId,
2022 pa,
2023 &gsBuffers,
2024 pSoPrimData,
2025 numPrims_lo,
2026 primID_lo);
2027
2028 if (numPrims_hi)
2029 {
2030 pa.useAlternateOffset = true;
2031 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
2032 pDC,
2033 workerId,
2034 pa,
2035 &gsBuffers,
2036 pSoPrimData,
2037 numPrims_hi,
2038 primID_hi);
2039 }
2040 }
2041 else if (HasGeometryShaderT::value)
2042 {
2043 pa.useAlternateOffset = false;
2044 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
2045 workerId,
2046 pa,
2047 &gsBuffers,
2048 pSoPrimData,
2049 numPrims_lo,
2050 primID_lo);
2051
2052 if (numPrims_hi)
2053 {
2054 pa.useAlternateOffset = true;
2055 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
2056 workerId,
2057 pa,
2058 &gsBuffers,
2059 pSoPrimData,
2060 numPrims_hi,
2061 primID_hi);
2062 }
2063 }
2064 else
2065 {
2066 // If streamout is enabled then stream vertices out to memory.
2067 if (HasStreamOutT::value)
2068 {
2069 pa.useAlternateOffset = false;
2070 StreamOut(pDC, pa, workerId, pSoPrimData, 0);
2071 }
2072
2073 if (HasRastT::value)
2074 {
2075 SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
                                // Gather data from the SGV slot if provided.
2077 simd16scalari vpai = SIMD16::setzero_si();
2078 simd16scalari rtai = SIMD16::setzero_si();
2079 SIMD16::Vec4 svgAttrib[4];
2080
2081 if (state.backendState.readViewportArrayIndex ||
2082 state.backendState.readRenderTargetArrayIndex)
2083 {
2084 pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
2085 }
2086
2087 if (state.backendState.readViewportArrayIndex)
2088 {
2089 vpai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
2090 pa.viewportArrayActive = true;
2091 }
2092 if (state.backendState.readRenderTargetArrayIndex)
2093 {
2094 rtai =
2095 SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
2096 pa.rtArrayActive = true;
2097 }
2098
2099 {
2100 // OOB VPAI indices => forced to zero.
2101 vpai = SIMD16::max_epi32(vpai, SIMD16::setzero_si());
2102 simd16scalari vNumViewports =
2103 SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
2104 simd16scalari vClearMask =
2105 SIMD16::cmplt_epi32(vpai, vNumViewports);
2106 vpai = SIMD16::and_si(vClearMask, vpai);
2107
2108 pa.useAlternateOffset = false;
2109 pDC->pState->pfnProcessPrims_simd16(pDC,
2110 pa,
2111 workerId,
2112 prim_simd16,
2113 GenMask(numPrims),
2114 primID,
2115 vpai,
2116 rtai);
2117 }
2118 }
2119 }
2120 }
2121 }
2122 }
2123 } while (pa.NextPrim());
2124
2125 if (IsIndexedT::value)
2126 {
2127 fetchInfo_lo.xpIndices = fetchInfo_lo.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
2128 fetchInfo_hi.xpIndices = fetchInfo_hi.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
2129 }
2130 else
2131 {
2132 vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
2133 }
2134
2135 i += KNOB_SIMD16_WIDTH;
2136 }
2137
2138 pa.Reset();
2139 }
2140
2141 #else
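    // Non-SIMD16 front end: a single simd8 fetch/VS/PA path.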
2142 SWR_VS_CONTEXT vsContext;
2143 SWR_FETCH_CONTEXT fetchInfo = {0};
2144
2145 fetchInfo.pStreams = &state.vertexBuffers[0];
2146 fetchInfo.StartInstance = work.startInstance;
2147 fetchInfo.StartVertex = 0;
2148
2149 if (IsIndexedT::value)
2150 {
2151 fetchInfo.BaseVertex = work.baseVertex;
2152
        // If the entire index buffer isn't being consumed, set the last index
        // so that fetches of less than a full SIMD width are masked off.
        // xpLastRequestedIndex holds a gfxptr_t; in this path gfx pointers are
        // plain host addresses, so compare and assign through casts.
        fetchInfo.pLastIndex =
            (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (xpLastRequestedIndex < (gfxptr_t)fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = (const int32_t*)xpLastRequestedIndex;
        }
2161 }
2162 else
2163 {
2164 fetchInfo.StartVertex = work.startVertex;
2165 }
2166
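    // Per-lane offsets 0..7, used to synthesize SIMD vertex ids for non-indexed draws.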
2167 const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
2168
2169 /// @todo: temporarily move instance loop in the FE to ensure SO ordering
2170 for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
2171 {
2172 simdscalari vIndex;
2173 uint32_t i = 0;
2174
2175 if (IsIndexedT::value)
2176 {
            fetchInfo.pIndices = (const int32_t*)work.xpIB; // gfxptr_t holds a host address here
2178 }
2179 else
2180 {
2181 vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
2182 fetchInfo.pIndices = (const int32_t*)&vIndex;
2183 }
2184
2185 fetchInfo.CurInstance = instanceNum;
2186 vsContext.InstanceID = instanceNum;
2187
2188 while (pa.HasWork())
2189 {
            // GetNextVsOutput currently has the side effect of updating some PA state
            // machine state, so it must stay outside of the (i < endVertex) check.
2192 simdmask* pvCutIndices = nullptr;
2193 if (IsIndexedT::value)
2194 {
2195 pvCutIndices = &pa.GetNextVsIndices();
2196 }
2197
2198 simdvertex& vout = pa.GetNextVsOutput();
2199 vsContext.pVin = &vout;
2200 vsContext.pVout = &vout;
2201
2202 if (i < endVertex)
2203 {
                // 1. Execute the fetch shader (FS) and vertex shader (VS) for a single SIMD.
2205 RDTSC_BEGIN(pContext->pBucketMgr, FEFetchShader, pDC->drawId);
2206 state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo, vout);
2207 RDTSC_END(pContext->pBucketMgr, FEFetchShader, 0);
2208
                // Forward the fetch-generated vertex IDs to the vertex shader.
2210 vsContext.VertexID = fetchInfo.VertexID;
2211
2212 // Setup active mask for vertex shader.
2213 vsContext.mask = GenerateMask(endVertex - i);
2214
2215 // forward cut mask to the PA
2216 if (IsIndexedT::value)
2217 {
2218 *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
2219 }
2220
2221 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
2222
2223 #if KNOB_ENABLE_TOSS_POINTS
2224 if (!KNOB_TOSS_FETCH)
2225 #endif
2226 {
2227 RDTSC_BEGIN(pContext->pBucketMgr, FEVertexShader, pDC->drawId);
2228 state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext);
2229 RDTSC_END(pContext->pBucketMgr, FEVertexShader, 0);
2230
2231 UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
2232 AR_EVENT(VSStats((HANDLE)&vsContext.stats));
2233 }
2234 }
2235
            // 2. Assemble primitives from the last two SIMDs of shaded vertices.
2237 do
2238 {
2239 simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // pa.Assemble returns false if there are not enough verts to assemble.
2241 RDTSC_BEGIN(pContext->pBucketMgr, FEPAAssemble, pDC->drawId);
2242 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
2243 RDTSC_END(pContext->pBucketMgr, FEPAAssemble, 1);
2244
2245 #if KNOB_ENABLE_TOSS_POINTS
2246 if (!KNOB_TOSS_FETCH)
2247 #endif
2248 {
2249 #if KNOB_ENABLE_TOSS_POINTS
2250 if (!KNOB_TOSS_VS)
2251 #endif
2252 {
2253 if (assemble)
2254 {
2255 UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
2256
2257 if (HasTessellationT::value)
2258 {
2259 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
2260 pDC,
2261 workerId,
2262 pa,
2263 &gsBuffers,
2264 pSoPrimData,
2265 pa.GetPrimID(work.startPrimID));
2266 }
2267 else if (HasGeometryShaderT::value)
2268 {
2269 GeometryShaderStage<HasStreamOutT, HasRastT>(
2270 pDC,
2271 workerId,
2272 pa,
2273 &gsBuffers,
2274 pSoPrimData,
2275 pa.GetPrimID(work.startPrimID));
2276 }
2277 else
2278 {
2279 // If streamout is enabled then stream vertices out to memory.
2280 if (HasStreamOutT::value)
2281 {
2282 StreamOut(pDC, pa, workerId, pSoPrimData, 0);
2283 }
2284
2285 if (HasRastT::value)
2286 {
2287 SWR_ASSERT(pDC->pState->pfnProcessPrims);
2288
                            // Gather data from the SGV slot if provided.
2290 simdscalari vViewportIdx = SIMD::setzero_si();
2291 simdscalari vRtIdx = SIMD::setzero_si();
2292 SIMD::Vec4 svgAttrib[4];
2293
2294 if (state.backendState.readViewportArrayIndex ||
2295 state.backendState.readRenderTargetArrayIndex)
2296 {
2297 pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
2298 }
2299
2300 if (state.backendState.readViewportArrayIndex)
2301 {
2302 vViewportIdx =
2303 SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
2304
2305 // OOB VPAI indices => forced to zero.
2306 vViewportIdx =
2307 SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
2308 simdscalari vNumViewports =
2309 SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
2310 simdscalari vClearMask =
2311 SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
2312 vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
2313 pa.viewportArrayActive = true;
2314 }
2315 if (state.backendState.readRenderTargetArrayIndex)
2316 {
2317 vRtIdx =
2318 SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
2319 pa.rtArrayActive = true;
2320 }
2321
2322 pDC->pState->pfnProcessPrims(pDC,
2323 pa,
2324 workerId,
2325 prim,
2326 GenMask(pa.NumPrims()),
2327 pa.GetPrimID(work.startPrimID),
2328 vViewportIdx,
2329 vRtIdx);
2330 }
2331 }
2332 }
2333 }
2334 }
2335 } while (pa.NextPrim());
2336
2337 if (IsIndexedT::value)
2338 {
2339 fetchInfo.pIndices =
2340 (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
2341 }
2342 else
2343 {
2344 vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
2345 }
2346
2347 i += KNOB_SIMD_WIDTH;
2348 }
2349 pa.Reset();
2350 }
2351
2352 #endif
2353
2354 RDTSC_END(pContext->pBucketMgr, FEProcessDraw, numPrims * work.numInstances);
2355 }
2356
2357 struct FEDrawChooser
2358 {
2359 typedef PFN_FE_WORK_FUNC FuncType;
2360
2361 template <typename... ArgsB>
2362 static FuncType GetFunc()
2363 {
2364 return ProcessDraw<ArgsB...>;
2365 }
2366 };
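// TemplateArgUnroller expands each runtime bool into std::true_type /
// std::false_type template arguments, so the call below instantiates the
// matching ProcessDraw specialization at compile time.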
2367
2368 // Selector for correct templated Draw front-end function
2369 PFN_FE_WORK_FUNC GetProcessDrawFunc(bool IsIndexed,
2370 bool IsCutIndexEnabled,
2371 bool HasTessellation,
2372 bool HasGeometryShader,
2373 bool HasStreamOut,
2374 bool HasRasterization)
2375 {
2376 return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed,
2377 IsCutIndexEnabled,
2378 HasTessellation,
2379 HasGeometryShader,
2380 HasStreamOut,
2381 HasRasterization);
2382 }
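
// Illustrative only: a hypothetical caller holding already-resolved pipeline
// flags (the names below are placeholders, not fields of API_STATE) would
// select and invoke the front-end function like so:
//
//   PFN_FE_WORK_FUNC pfnFE = GetProcessDrawFunc(isIndexed,
//                                               isIndexed && cutIndexEnabled,
//                                               tessEnabled,
//                                               gsEnabled,
//                                               soEnabled,
//                                               rastEnabled);
//   pfnFE(pContext, pDC, workerId, &drawWork);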