/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file frontend.cpp
*
* @brief Implementation for Frontend which handles vertex processing,
*        primitive assembly, clipping, binning, etc.
*
******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "utils.h"
36 #include "threads.h"
37 #include "pa.h"
38 #include "clip.h"
39 #include "tilemgr.h"
40 #include "tessellator.h"
41 #include <limits>
42 #include <iostream>
43
//////////////////////////////////////////////////////////////////////////
/// @brief Helper function to generate a bitmask with the low numBits set
static INLINE uint32_t GenMask(uint32_t numBits)
{
    SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);

    // Shifting a 32-bit value by 32 is undefined behavior; handle the full mask explicitly.
    return (numBits == 32) ? 0xFFFFFFFFU : ((1U << numBits) - 1);
}
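
// Examples (editor's note, illustrative only): GenMask(3) == 0b111,
// GenMask(0) == 0, and GenMask(32) yields the full 0xFFFFFFFF mask without
// invoking the undefined 32-bit shift.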

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrSync.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to sync callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessSync(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    BE_WORK work;
    work.type = SYNC;
    work.pfnWork = ProcessSyncBE;

    MacroTileMgr *pTileMgr = pDC->pTileMgr;
    pTileMgr->enqueue(0, 0, &work);
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDestroyContext.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to sync callback.
void ProcessShutdown(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    BE_WORK work;
    work.type = SHUTDOWN;
    work.pfnWork = ProcessShutdownBE;

    MacroTileMgr *pTileMgr = pDC->pTileMgr;
    // Enqueue at least one work item for each worker thread,
    // accounting for the number of NUMA nodes.
    uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;

    for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
    {
        for (uint32_t n = 0; n < numNumaNodes; ++n)
        {
            pTileMgr->enqueue(i, n, &work);
        }
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrClearRenderTarget.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to clear callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessClear(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    // queue a clear to each macro tile
    // compute macro tile bounds for the specified rect
    uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;

    BE_WORK work;
    work.type = CLEAR;
    work.pfnWork = ProcessClearBE;
    work.desc.clear = *pDesc;

    for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
    {
        for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrStoreTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessStoreTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    RDTSC_BEGIN(FEProcessStoreTiles, pDC->drawId);
    MacroTileMgr *pTileMgr = pDC->pTileMgr;
    STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;

    // queue a store to each macro tile
    // compute macro tile bounds for the specified rect
    uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;

    // store tiles
    BE_WORK work;
    work.type = STORETILES;
    work.pfnWork = ProcessStoreTilesBE;
    work.desc.storeTiles = *pDesc;

    for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
    {
        for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_END(FEProcessStoreTiles, 0);
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrInvalidateTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    RDTSC_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
    DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    // compute macro tile bounds for the specified rect
    uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
    uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;

    if (pDesc->fullTilesOnly == false)
    {
        // include partial tiles
        macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
        macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
        macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
        macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    }
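
    // Worked example (editor's note, illustrative only, assuming 64-pixel
    // macrotiles): for rect x in [16, 200), the full-tiles-only bounds round
    // xmin up and xmax down, giving tiles 1..2, while including partial
    // tiles widens that to tiles 0..3.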

    SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);

    macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
    macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);

    // load tiles
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pDesc;

    for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    {
        for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_END(FEProcessInvalidateTiles, 0);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of primitives given the number of verts.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of vertices or indices for draw.
/// @todo Frontend needs to be refactored. This will go in appropriate place then.
uint32_t GetNumPrims(
    PRIMITIVE_TOPOLOGY mode,
    uint32_t numPrims)
{
    switch (mode)
    {
    case TOP_POINT_LIST: return numPrims;
    case TOP_TRIANGLE_LIST: return numPrims / 3;
    case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_QUAD_LIST: return numPrims / 4;
    case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
    case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_LINE_LIST: return numPrims / 2;
    case TOP_LINE_LOOP: return numPrims;
    case TOP_RECT_LIST: return numPrims / 3;
    case TOP_LINE_LIST_ADJ: return numPrims / 4;
    case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
    case TOP_TRI_LIST_ADJ: return numPrims / 6;
    case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;

    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims / (mode - TOP_PATCHLIST_BASE);

    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_INVALID("Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}
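
// Sanity examples (editor's note, illustrative only): 10 vertices as
// TOP_TRIANGLE_LIST yield 3 triangles (the 10th vertex is dropped), as
// TOP_TRIANGLE_STRIP they yield 8, and as TOP_TRI_STRIP_ADJ they yield
// (10 / 2) - 2 = 3 primitives.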

//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of verts given the number of primitives.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of primitives for draw.
uint32_t GetNumVerts(
    PRIMITIVE_TOPOLOGY mode,
    uint32_t numPrims)
{
    switch (mode)
    {
    case TOP_POINT_LIST: return numPrims;
    case TOP_TRIANGLE_LIST: return numPrims * 3;
    case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
    case TOP_QUAD_LIST: return numPrims * 4;
    case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
    case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
    case TOP_LINE_LIST: return numPrims * 2;
    case TOP_LINE_LOOP: return numPrims;
    case TOP_RECT_LIST: return numPrims * 3;
    case TOP_LINE_LIST_ADJ: return numPrims * 4;
    case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
    case TOP_TRI_LIST_ADJ: return numPrims * 6;
    case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;

    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims * (mode - TOP_PATCHLIST_BASE);

    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_INVALID("Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Return number of verts per primitive.
/// @param topology - topology
/// @param includeAdjVerts - include adjacent verts in primitive vertices
INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
{
    uint32_t numVerts = 0;
    switch (topology)
    {
    case TOP_POINT_LIST:
    case TOP_POINT_LIST_BF:
        numVerts = 1;
        break;
    case TOP_LINE_LIST:
    case TOP_LINE_STRIP:
    case TOP_LINE_LIST_ADJ:
    case TOP_LINE_LOOP:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LISTSTRIP_ADJ:
        numVerts = 2;
        break;
    case TOP_TRIANGLE_LIST:
    case TOP_TRIANGLE_STRIP:
    case TOP_TRIANGLE_FAN:
    case TOP_TRI_LIST_ADJ:
    case TOP_TRI_STRIP_ADJ:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_RECT_LIST:
        numVerts = 3;
        break;
    case TOP_QUAD_LIST:
    case TOP_QUAD_STRIP:
        numVerts = 4;
        break;
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        numVerts = topology - TOP_PATCHLIST_BASE;
        break;
    default:
        SWR_INVALID("Unsupported topology: %d", topology);
        break;
    }

    if (includeAdjVerts)
    {
        switch (topology)
        {
        case TOP_LISTSTRIP_ADJ:
        case TOP_LINE_LIST_ADJ: numVerts = 4; break;
        case TOP_TRI_STRIP_ADJ:
        case TOP_TRI_LIST_ADJ: numVerts = 6; break;
        default: break;
        }
    }

    return numVerts;
}
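
// Note (editor's note, illustrative only): adjacency is additive. For
// example, TOP_TRI_LIST_ADJ reports 3 verts per primitive without adjacent
// verts and 6 with them, matching GetNumPrims' divide-by-6 for that topology.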

//////////////////////////////////////////////////////////////////////////
/// @brief Generate mask from remaining work.
/// @param numItemsRemaining - Number of items still to be worked on by a SIMD.
static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
{
    uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
    return _simd_castps_si(_simd_vmask_ps(mask));
}

static INLINE simd16scalari GenerateMask16(uint32_t numItemsRemaining)
{
    uint32_t numActive = (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining;
    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
    return _simd16_castps_si(_simd16_vmask_ps(mask));
}
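
// Example (editor's note, illustrative only): with KNOB_SIMD_WIDTH == 8,
// GenerateMask(3) builds the lane mask 0b00000111, enabling only the three
// lanes that still have work; GenerateMask16 does the same over 16 lanes.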

//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMD's worth of triangles.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pPrimData - scratch buffer used to stage primitive attributes for the SO shader.
/// @param streamIndex - index of the stream-out stream to process.
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
    uint32_t streamIndex)
{
    RDTSC_BEGIN(FEStreamout, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all
    // SWR_VTX_NUM_SLOTS attributes for each vertex.
    uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    uint32_t numPrims = pa.NumPrims();

    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        while (_BitScanForward(&slot, soMask))
        {
            simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is relative offset from start of vertex.
            // Note: GL begins attributes at slot 0 in the prim data buffer, so
            // the slot index converts directly to a dword offset here.
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }

            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    RDTSC_END(FEStreamout, 1);
}

#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// Is value an even number (a multiple of two)
///
template <typename T>
INLINE static bool IsEven(T value)
{
    return (value & 1) == 0;
}

//////////////////////////////////////////////////////////////////////////
/// Round up value to an even number (a multiple of two)
///
template <typename T>
INLINE static T RoundUpEven(T value)
{
    return (value + 1) & ~1;
}

//////////////////////////////////////////////////////////////////////////
/// Round down value to an even number (a multiple of two)
///
template <typename T>
INLINE static T RoundDownEven(T value)
{
    return value & ~1;
}
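
// Examples (editor's note, illustrative only): RoundUpEven(5) == 6,
// RoundUpEven(6) == 6, RoundDownEven(5) == 4. These keep simd8 vector
// counts pairable when packing into simd16 operations below.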

//////////////////////////////////////////////////////////////////////////
/// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
///
/// vertexCount is in terms of the source simdvertexes and must be even
///
/// attribCount will limit the vector copies to those attribs specified
///
/// note: the stride between vertexes is determined by SWR_VTX_NUM_SLOTS
///
void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount)
{
    SWR_ASSERT(vertex);
    SWR_ASSERT(vertex_simd16);
    SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);

    simd16vertex temp;

    for (uint32_t i = 0; i < vertexCount; i += 2)
    {
        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            for (uint32_t k = 0; k < 4; k += 1)
            {
                temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);

                if ((i + 1) < vertexCount)
                {
                    temp.attrib[j][k] = _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
                }
            }
        }

        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
        }
    }
}

#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Computes number of invocations. The current index represents
///        the start of the SIMD. The max index represents how many work
///        items remain. If there is less than a SIMD's width of work
///        remaining, return the remaining amount of work.
/// @param curIndex - The start index for the SIMD.
/// @param maxIndex - The last index for all work items.
static INLINE uint32_t GetNumInvocations(
    uint32_t curIndex,
    uint32_t maxIndex)
{
    uint32_t remainder = (maxIndex - curIndex);
#if USE_SIMD16_FRONTEND
    return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
#else
    return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
#endif
}

//////////////////////////////////////////////////////////////////////////
/// @brief Converts a streamId buffer to a cut buffer for the given stream id.
///        The geometry shader will loop over each active streamout buffer, assembling
///        primitives for the downstream stages. When multistream output is enabled,
///        the generated stream ID buffer from the GS needs to be converted to a cut
///        buffer for the primitive assembler.
/// @param stream - stream id to generate the cut buffer for
/// @param pStreamIdBase - pointer to the stream ID buffer
/// @param numEmittedVerts - Number of total verts emitted by the GS
/// @param pCutBuffer - output buffer to write cuts to
void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
{
    SWR_ASSERT(stream < MAX_SO_STREAMS);

    uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
    uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);

    for (uint32_t b = 0; b < numOutputBytes; ++b)
    {
        uint8_t curInputByte = pStreamIdBase[2 * b];
        uint8_t outByte = 0;
        for (uint32_t i = 0; i < 4; ++i)
        {
            if ((curInputByte & 0x3) != stream)
            {
                outByte |= (1 << i);
            }
            curInputByte >>= 2;
        }

        curInputByte = pStreamIdBase[2 * b + 1];
        for (uint32_t i = 0; i < 4; ++i)
        {
            if ((curInputByte & 0x3) != stream)
            {
                outByte |= (1 << (i + 4));
            }
            curInputByte >>= 2;
        }

        *pCutBuffer++ = outByte;
    }
}
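
// Worked example (editor's note, illustrative only): each vertex carries a
// 2-bit stream ID, so two input bytes (8 vertices) fold into one cut byte.
// For stream == 1 and input bytes {0x44, 0x44} (vertices 1, 3, 5 and 7 on
// stream 1), the cut byte is 0x55: a 1 ("cut") for every vertex that does
// NOT belong to the requested stream.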

// Buffers that are allocated if GS is enabled
struct GsBuffers
{
    uint8_t* pGsIn;
    uint8_t* pGsOut[KNOB_SIMD_WIDTH];
    uint8_t* pGsTransposed;
    void* pStreamCutBuffer;
};
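
// Member roles (editor's note, summarizing their use below): pGsIn holds the
// assembled input vertices for one simd primitive; pGsOut is one output
// stream buffer per SIMD lane (one lane per input primitive); pGsTransposed
// receives GS output rearranged for the primitive assembler; and
// pStreamCutBuffer is scratch space for multi-stream cut-buffer conversion.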

//////////////////////////////////////////////////////////////////////////
/// @brief Transposes GS output from the scalar per-vertex layout written by
///        the GS into the SIMD-wide layout consumed by the primitive assembler
/// @param pDst - Destination buffer in SIMD form for the current SIMD width, fed into the primitive assembler
/// @param pSrc - Buffer of scalar vertices written by the geometry shader
/// @param numVerts - Number of vertices output by the GS
/// @param numAttribs - Number of attributes per vertex
template<typename SIMD_T, uint32_t SimdWidth>
void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
{
    uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
    uint32_t dstVertexStride = numAttribs * sizeof(Float<SIMD_T>) * 4;

    OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];

    for (uint32_t i = 0; i < SimdWidth; ++i)
    {
        gatherOffsets[i] = srcVertexStride * i;
    }
    auto vGatherOffsets = SIMD_T::load_si((Integer<SIMD_T>*)&gatherOffsets[0]);

    uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;
    uint32_t remainingVerts = numVerts;

    for (uint32_t s = 0; s < numSimd; ++s)
    {
        uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
        uint8_t* pDstBase = pDst + s * dstVertexStride;

        // Compute mask to prevent src overflow
        uint32_t mask = std::min(remainingVerts, SimdWidth);
        mask = GenMask(mask);
        auto vMask = SIMD_T::vmask_ps(mask);
        auto viMask = SIMD_T::castps_si(vMask);

        for (uint32_t a = 0; a < numAttribs; ++a)
        {
            auto attribGatherX = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
            auto attribGatherY = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float)), vGatherOffsets, vMask);
            auto attribGatherZ = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 2), vGatherOffsets, vMask);
            auto attribGatherW = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 3), vGatherOffsets, vMask);

            SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>)), viMask, attribGatherY);
            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>) * 2), viMask, attribGatherZ);
            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>) * 3), viMask, attribGatherW);

            pSrcBase += sizeof(float) * 4;
            pDstBase += sizeof(Float<SIMD_T>) * 4;
        }
        remainingVerts -= SimdWidth;
    }
}
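
// How the gather works (editor's note, illustrative only): vGatherOffsets
// holds {0, stride, 2*stride, ...}, so each gather pulls the same component
// (e.g. attribute X) from SimdWidth consecutive vertices into one SIMD
// register. The mask from GenMask() disables lanes past numVerts on the
// final, partially filled iteration, so neither the gathers nor the
// maskstores touch memory beyond the buffers.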


//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsBuffers - GS input/output buffer allocations
template <
    typename HasStreamOutT,
    typename HasRastT>
static void GeometryShaderStage(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    GsBuffers* pGsBuffers,
    uint32_t* pSoPrimData,
#if USE_SIMD16_FRONTEND
    uint32_t numPrims_simd8,
#endif
    simdscalari const &primID)
{
    RDTSC_BEGIN(FEGeometryShader, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;
    SWR_GS_CONTEXT gsContext;

    static uint8_t sNullBuffer[128] = { 0 };

    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    {
        gsContext.pStreams[i] = pGsBuffers->pGsOut[i];
    }
    gsContext.pVerts = (simdvector*)pGsBuffers->pGsIn;
    gsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitive
    gsContext.inputVertStride = pState->inputVertStride;
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot;
        uint32_t attribSlot = pState->vertexAttribOffset + slot;
        pa.Assemble(srcAttribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            gsContext.pVerts[attribSlot + pState->inputVertStride * i] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        gsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = attrib[i];
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
#if USE_SIMD16_FRONTEND
    uint32_t numInputPrims = numPrims_simd8;
#else
    uint32_t numInputPrims = pa.NumPrims();
#endif

    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        gsContext.InstanceID = instance;
        gsContext.mask = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), &gsContext);
        AR_EVENT(GSStats(gsContext.stats.numInstExecuted));

        for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
        {
            gsContext.pStreams[i] += pState->allocationSize;
        }
    }

    // set up new binner and state for the GS output topology
#if USE_SIMD16_FRONTEND
    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles_simd16; break;
        case TOP_LINE_STRIP: pfnClipFunc = ClipLines_simd16; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break;
        default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

#else
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

#endif
    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim];

        // Vertex count is either emitted by shader or static
        uint32_t vertexCount = 0;
        if (pState->staticVertexCount)
        {
            vertexCount = pState->staticVertexCount;
        }
        else
        {
            // If emitted in the shader, it is stored in the first dword of the output buffer
            vertexCount = *(uint32_t*)pInstanceBase;
        }

        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            uint32_t numEmittedVerts = vertexCount;
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;
            uint8_t* pCutBase = pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;
            uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;

#if USE_SIMD16_FRONTEND
            TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);
#else
            TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);
#endif

            uint32_t numAttribs = state.feNumAttributes;

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool processCutVerts = false;
                uint8_t* pCutBuffer = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID = pState->singleStreamID;
                    if (streamID != stream) continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);
                    pCutBuffer = (uint8_t*)pGsBuffers->pStreamCutBuffer;
                    processCutVerts = false;
                }

#if USE_SIMD16_FRONTEND
                PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim);

#else
                PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim);

#endif
                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
#if USE_SIMD16_FRONTEND
                        simd16vector attrib_simd16[3];

                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16);

#else
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

#endif
                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
#if ENABLE_AVX512_SIMD16
                                gsPa.useAlternateOffset = false;
#endif
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
#if USE_SIMD16_FRONTEND
                                simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);

                                // Gather data from the SGV if provided.
                                simd16scalari vViewportIdx = SIMD16::setzero_si();
                                simd16scalari vRtIdx = SIMD16::setzero_si();
                                SIMD16::Vec4 svgAttrib[4];

                                if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
                                {
                                    gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                }

                                if (state.backendState.readViewportArrayIndex)
                                {
                                    vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                                    gsPa.viewportArrayActive = true;
                                }
                                if (state.backendState.readRenderTargetArrayIndex)
                                {
                                    vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                    gsPa.rtArrayActive = true;
                                }

                                {
                                    // OOB VPAI indices => forced to zero.
                                    vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
                                    simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
                                    vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);

                                    gsPa.useAlternateOffset = false;
                                    pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId, vViewportIdx, vRtIdx);
                                }
#else
                                simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);

                                // Gather data from the SGV if provided.
                                simdscalari vViewportIdx = SIMD::setzero_si();
                                simdscalari vRtIdx = SIMD::setzero_si();
                                SIMD::Vec4 svgAttrib[4];

                                if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
                                {
                                    gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                }

                                if (state.backendState.readViewportArrayIndex)
                                {
                                    vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);

                                    // OOB VPAI indices => forced to zero.
                                    vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
                                    simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simdscalari vClearMask = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
                                    vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
                                    gsPa.viewportArrayActive = true;
                                }
                                if (state.backendState.readRenderTargetArrayIndex)
                                {
                                    vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                    gsPa.rtArrayActive = true;
                                }

                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewportIdx, vRtIdx);
#endif
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
    AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim * numInputPrims));
    RDTSC_END(FEGeometryShader, 1);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Allocate GS buffers
/// @param pDC - pointer to draw context.
/// @param state - API state
/// @param vertsPerPrim - number of vertices per input primitive
/// @param pGsBuffers - GS buffer allocations to fill in
template<typename SIMD_T, uint32_t SIMD_WIDTH>
static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, uint32_t vertsPerPrim, GsBuffers* pGsBuffers)
{
    auto pArena = pDC->pArena;
    SWR_ASSERT(pArena != nullptr);
    SWR_ASSERT(state.gsState.gsEnable);

    const SWR_GS_STATE& gsState = state.gsState;

    // Allocate storage for vertex inputs
    uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim;
    pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);

    // Allocate arena space to hold GS output verts
    const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize;

    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    {
        pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32);
    }

    // Allocate storage for transposed GS output
    uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;
    uint32_t transposedBufferSize = numSimdBatches * gsState.outputVertexSize * sizeof(Vec4<SIMD_T>);
    pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);

    // Allocate storage to hold temporary stream->cut buffer, if necessary
    if (state.gsState.isSingleStream)
    {
        pGsBuffers->pStreamCutBuffer = nullptr;
    }
    else
    {
        pGsBuffers->pStreamCutBuffer = (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Contains all data generated by the HS and passed to the
///        tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;
    ScalarPatch patchData[KNOB_SIMD_WIDTH];
    void* pTxCtx;
    size_t tsCtxSize;

    simdscalar* pDSOutput;
    size_t dsOutputAllocSize;
};

THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;

//////////////////////////////////////////////////////////////////////////
/// @brief Allocate tessellation data for this worker thread.
INLINE
static void AllocateTessellationData(SWR_CONTEXT* pContext)
{
    /// @TODO - Don't use thread local storage. Use Worker local storage instead.
    if (gt_pTessellationThreadData == nullptr)
    {
        gt_pTessellationThreadData = (TessellationThreadLocalData*)
            AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
        memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Implements Tessellation Stages.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsBuffers - GS buffer allocations, used when the GS stage is enabled
template <
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
static void TessellationStages(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    GsBuffers* pGsBuffers,
    uint32_t* pSoPrimData,
#if USE_SIMD16_FRONTEND
    uint32_t numPrims_simd8,
#endif
    simdscalari const &primID)
{
    const API_STATE& state = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;

    SWR_ASSERT(gt_pTessellationThreadData);

    HANDLE tsCtx = TSInitCtx(
        tsState.domain,
        tsState.partitioning,
        tsState.tsOutputTopology,
        gt_pTessellationThreadData->pTxCtx,
        gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(
            tsState.domain,
            tsState.partitioning,
            tsState.tsOutputTopology,
            gt_pTessellationThreadData->pTxCtx,
            gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

#if USE_SIMD16_FRONTEND
    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles_simd16; break;
        case TOP_LINE_LIST: pfnClipFunc = ClipLines_simd16; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break;
        default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

#else
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

#endif
    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.pCPout = gt_pTessellationThreadData->patchData;
    hsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = tsState.vertexAttribOffset + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = simdattrib[i];
        }
    }

#if defined(_DEBUG)
    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

#if USE_SIMD16_FRONTEND
    uint32_t numPrims = numPrims_simd8;
#else
    uint32_t numPrims = pa.NumPrims();
#endif
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    RDTSC_BEGIN(FEHullShader, pDC->drawId);
    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
    RDTSC_END(FEHullShader, 0);

    UPDATE_STAT_FE(HsInvocations, numPrims);
    AR_EVENT(HSStats(hsContext.stats.numInstExecuted));

    const uint32_t* pPrimId = (const uint32_t*)&primID;

    for (uint32_t p = 0; p < numPrims; ++p)
    {
        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = { 0 };
        RDTSC_BEGIN(FETessellation, pDC->drawId);
        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
        AR_EVENT(TessPrimCount(1));
        RDTSC_END(FETessellation, 0);

        if (tsData.NumPrimitives == 0)
        {
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory
        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
#if USE_SIMD16_FRONTEND
        size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.dsAllocationSize; // simd8 -> simd16, padding
#else
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize;
        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
#endif
        if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize)
        {
            AlignedFree(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= requiredAllocSize);

#if defined(_DEBUG)
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID = pPrimId[p];
        dsContext.pCpIn = &hsContext.pCPout[p];
        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
        dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset;
#if USE_SIMD16_FRONTEND
        dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16
#else
        dsContext.vectorStride = requiredDSVectorInvocations;
#endif

        uint32_t dsInvocations = 0;

        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
        {
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            RDTSC_BEGIN(FEDomainShader, pDC->drawId);
            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
            RDTSC_END(FEDomainShader, 0);

            AR_EVENT(DSStats(dsContext.stats.numInstExecuted));

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);

#if USE_SIMD16_FRONTEND
        SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16

#endif
        PA_TESS tessPa(
            pDC,
#if USE_SIMD16_FRONTEND
            reinterpret_cast<const simd16scalar *>(dsContext.pOutputData), // simd8 -> simd16
            dsContext.vectorStride / 2, // simd8 -> simd16
#else
            dsContext.pOutputData,
            dsContext.vectorStride,
#endif
            SWR_VTX_NUM_SLOTS,
            tsState.numDsOutputAttribs + tsState.dsOutVtxAttribOffset,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology,
            NumVertsPerPrim(tsState.postDSTopology, false));

        while (tessPa.HasWork())
        {
#if USE_SIMD16_FRONTEND
            const uint32_t numPrims = tessPa.NumPrims();
            const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
            const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;

            const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
            const simdscalari primID_lo = _simd16_extract_si(primID, 0);
            const simdscalari primID_hi = _simd16_extract_si(primID, 1);

#endif
            if (HasGeometryShaderT::value)
            {
#if USE_SIMD16_FRONTEND
                tessPa.useAlternateOffset = false;
                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);

                if (numPrims_hi)
                {
                    tessPa.useAlternateOffset = true;
                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);
                }
#else
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsBuffers, pSoPrimData, _simd_set1_epi32(dsContext.PrimitiveID));
#endif
            }
            else
            {
                if (HasStreamOutT::value)
                {
#if ENABLE_AVX512_SIMD16
                    tessPa.useAlternateOffset = false;
#endif
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
#if USE_SIMD16_FRONTEND
                    simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points
#else
                    simdvector prim[3]; // Only deal with triangles, lines, or points
#endif
                    RDTSC_BEGIN(FEPAAssemble, pDC->drawId);
                    bool assemble =
#if USE_SIMD16_FRONTEND
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
#else
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
#endif
                    RDTSC_END(FEPAAssemble, 1);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
#if USE_SIMD16_FRONTEND
                    // Gather data from the SGV if provided.
                    simd16scalari vViewportIdx = SIMD16::setzero_si();
                    simd16scalari vRtIdx = SIMD16::setzero_si();
                    SIMD16::Vec4 svgAttrib[4];

                    if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
                    {
                        tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                    }

                    if (state.backendState.readViewportArrayIndex)
                    {
                        vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                        tessPa.viewportArrayActive = true;
                    }
                    if (state.backendState.readRenderTargetArrayIndex)
                    {
                        vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                        tessPa.rtArrayActive = true;
                    }

                    {
                        // OOB VPAI indices => forced to zero.
                        vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
                        simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                        simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
                        vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);

                        tessPa.useAlternateOffset = false;
                        pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID, vViewportIdx, vRtIdx);
                    }
#else
                    // Gather data from the SGV if provided.
                    simdscalari vViewportIdx = SIMD::setzero_si();
                    simdscalari vRtIdx = SIMD::setzero_si();
                    SIMD::Vec4 svgAttrib[4];

                    if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
                    {
                        tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                    }

                    if (state.backendState.readViewportArrayIndex)
                    {
                        vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);

                        // OOB VPAI indices => forced to zero.
                        vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
                        simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                        simdscalari vClearMask = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
                        vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
                        tessPa.viewportArrayActive = true;
                    }
                    if (state.backendState.readRenderTargetArrayIndex)
                    {
                        vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                        tessPa.rtArrayActive = true;
                    }
                    pfnClipFunc(pDC, tessPa, workerId, prim,
                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), vViewportIdx, vRtIdx);
#endif
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    } // for (uint32_t p = 0; p < numPrims; ++p)

#if USE_SIMD16_FRONTEND
    if (gt_pTessellationThreadData->pDSOutput != nullptr)
    {
        AlignedFree(gt_pTessellationThreadData->pDSOutput);
        gt_pTessellationThreadData->pDSOutput = nullptr;
    }
    gt_pTessellationThreadData->dsOutputAllocSize = 0;

#endif
    TSDestroyCtx(tsCtx);
}

THREAD PA_STATE::SIMDVERTEX *gpVertexStore = nullptr;
THREAD uint32_t gVertexStoreSize = 0;

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDraw.
/// @tparam IsIndexedT - Is indexed drawing enabled
/// @tparam HasTessellationT - Is tessellation enabled
/// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
/// @tparam HasStreamOutT - Is stream-out enabled
/// @tparam HasRastT - Is rasterization enabled
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id.
/// @param pUserData - Pointer to DRAW_WORK
template <
    typename IsIndexedT,
    typename IsCutIndexEnabledT,
    typename HasTessellationT,
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
void ProcessDraw(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{

#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif

    RDTSC_BEGIN(FEProcessDraw, pDC->drawId);

    DRAW_WORK& work = *(DRAW_WORK*)pUserData;
    const API_STATE& state = GetApiState(pDC);

    uint32_t indexSize = 0;
    uint32_t endVertex = work.numVerts;

    gfxptr_t xpLastRequestedIndex = 0;
    if (IsIndexedT::value)
    {
        switch (work.type)
        {
        case R32_UINT:
            indexSize = sizeof(uint32_t);
            break;
        case R16_UINT:
            indexSize = sizeof(uint16_t);
            break;
        case R8_UINT:
            indexSize = sizeof(uint8_t);
            break;
        default:
            SWR_INVALID("Invalid work.type: %d", work.type);
        }
        xpLastRequestedIndex = work.xpIB + endVertex * indexSize;
    }
    else
    {
        // No cuts, prune partial primitives.
        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
    }

#if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    GsBuffers gsBuffers;
    if (HasGeometryShaderT::value)
    {
#if USE_SIMD16_FRONTEND
        AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
#else
        AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
#endif
    }

    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
    }

    const uint32_t vertexCount = NumVertsPerPrim(state.topology, true);
#if USE_SIMD16_FRONTEND
    uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector);
#else
    uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector);
#endif

    SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);

    // Compute storage requirements for vertex store
    // TODO: allocation needs to be rethought for better cut support
    uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine
    uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes;
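
    // Sizing example (editor's note, illustrative only): for TOP_TRIANGLE_LIST,
    // vertexCount == 3, so the store holds 5 simd-wide vertices -- the extra
    // two give the PA state machine room to carry vertices across simd
    // boundaries.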
1604
1605 // grow the vertex store for the PA as necessary
1606 if (gVertexStoreSize < vertexStoreSize)
1607 {
1608 if (gpVertexStore != nullptr)
1609 {
1610 AlignedFree(gpVertexStore);
1611 gpVertexStore = nullptr;
1612 }
1613
1614 SWR_ASSERT(gpVertexStore == nullptr);
1615
1616 gpVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(vertexStoreSize, 64));
1617 gVertexStoreSize = vertexStoreSize;
1618
1619 SWR_ASSERT(gpVertexStore != nullptr);
1620 }
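// The vertex store is grow-only scratch: it is reallocated only when a draw
// needs more space than any previous draw seen by this storage.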
1621
1622 // Choose the primitive assembler implementation for this topology and index mode.
1623
1624 PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, gpVertexStore, numVerts, state.frontendState.vsVertexSize, GetNumVerts(state.topology, 1));
1625 PA_STATE& pa = paFactory.GetPA();
1626
1627 #if USE_SIMD16_FRONTEND
1628 #if USE_SIMD16_SHADERS
1629 simd16vertex vin;
1630 #else
1631 simdvertex vin_lo;
1632 simdvertex vin_hi;
1633 #endif
1634 SWR_VS_CONTEXT vsContext_lo;
1635 SWR_VS_CONTEXT vsContext_hi;
1636
1637 #if USE_SIMD16_SHADERS
1638 vsContext_lo.pVin = reinterpret_cast<simdvertex *>(&vin);
1639 vsContext_hi.pVin = reinterpret_cast<simdvertex *>(&vin);
1640 #else
1641 vsContext_lo.pVin = &vin_lo;
1642 vsContext_hi.pVin = &vin_hi;
1643 #endif
1644 vsContext_lo.AlternateOffset = 0;
1645 vsContext_hi.AlternateOffset = 1;
1646
1647 SWR_FETCH_CONTEXT fetchInfo_lo = { 0 };
1648
1649 fetchInfo_lo.pStreams = &state.vertexBuffers[0];
1650 fetchInfo_lo.StartInstance = work.startInstance;
1651 fetchInfo_lo.StartVertex = 0;
1652
1653 if (IsIndexedT::value)
1654 {
1655 fetchInfo_lo.BaseVertex = work.baseVertex;
1656
1657 // If the entire index buffer isn't consumed, set the last index so that
1658 // fetches of fewer than a full SIMD width of indices are masked off.
1659 fetchInfo_lo.xpLastIndex = state.indexBuffer.xpIndices + state.indexBuffer.size;
1660 if (xpLastRequestedIndex < fetchInfo_lo.xpLastIndex)
1661 {
1662 fetchInfo_lo.xpLastIndex = xpLastRequestedIndex;
1663 }
1664 }
1665 else
1666 {
1667 fetchInfo_lo.StartVertex = work.startVertex;
1668 }
1669
1670 SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;
1671
1672 const simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
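// vScale holds per-lane offsets 0..15; adding it to startVertexID below
// synthesizes the 16 linear vertex indices consumed by one SIMD16 iteration
// of a non-indexed draw.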
1673
1674 for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1675 {
1676 uint32_t i = 0;
1677
1678 simd16scalari vIndex;
1679
1680 if (IsIndexedT::value)
1681 {
1682 fetchInfo_lo.xpIndices = work.xpIB;
1683 fetchInfo_hi.xpIndices = fetchInfo_lo.xpIndices + KNOB_SIMD_WIDTH * indexSize; // 1/2 of KNOB_SIMD16_WIDTH
1684 }
1685 else
1686 {
1687 vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);
1688
1689 fetchInfo_lo.xpIndices = (gfxptr_t)&vIndex;
1690 fetchInfo_hi.xpIndices = (gfxptr_t)&vIndex + KNOB_SIMD_WIDTH * sizeof(int32_t); // 1/2 of KNOB_SIMD16_WIDTH
1691 }
1692
1693 fetchInfo_lo.CurInstance = instanceNum;
1694 fetchInfo_hi.CurInstance = instanceNum;
1695
1696 vsContext_lo.InstanceID = instanceNum;
1697 vsContext_hi.InstanceID = instanceNum;
1698
1699 while (pa.HasWork())
1700 {
1701 // GetNextVsOutput currently has the side effect of updating some PA state
1702 // machine state, so it must stay outside of the (i < endVertex) check below.
1703
1704 simdmask *pvCutIndices_lo = nullptr;
1705 simdmask *pvCutIndices_hi = nullptr;
1706
1707 if (IsIndexedT::value)
1708 {
1709 // simd16mask <=> simdmask[2]
1710
1711 pvCutIndices_lo = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[0];
1712 pvCutIndices_hi = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[1];
1713 }
1714
1715 simd16vertex &vout = pa.GetNextVsOutput();
1716
1717 vsContext_lo.pVout = reinterpret_cast<simdvertex *>(&vout);
1718 vsContext_hi.pVout = reinterpret_cast<simdvertex *>(&vout);
1719
1720 if (i < endVertex)
1721 {
1722 if (!IsIndexedT::value)
1723 {
1724 fetchInfo_lo.xpLastIndex = fetchInfo_lo.xpIndices;
1725 uint32_t offset = std::min(endVertex - i, (uint32_t) KNOB_SIMD16_WIDTH);
1727 #if USE_SIMD16_SHADERS
1728 offset *= 4; // convert an index count to a byte offset
1729 fetchInfo_lo.xpLastIndex += offset;
1730 #else
1731 fetchInfo_lo.xpLastIndex += std::min(offset, (uint32_t) KNOB_SIMD_WIDTH) * 4; // * 4 for converting index to address
1732 uint32_t offset2 = std::min(offset, (uint32_t) KNOB_SIMD16_WIDTH) - KNOB_SIMD_WIDTH;
1733 assert(offset <= (uint32_t) KNOB_SIMD16_WIDTH); // offset is unsigned, so the old 'offset >= 0' check was vacuous
1734 fetchInfo_hi.xpLastIndex = fetchInfo_hi.xpIndices;
1735 fetchInfo_hi.xpLastIndex += offset2 * 4; // * 4 for converting index to address
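// Note: offset2 wraps when offset < KNOB_SIMD_WIDTH, but in that case the
// hi-half fetch below is skipped because (i + KNOB_SIMD_WIDTH) < endVertex
// also fails, so the wrapped value is never consumed.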
1736 #endif
1737 }
1738 // 1. Execute the fetch shader (FS) and vertex shader (VS) for a single SIMD.
1739 RDTSC_BEGIN(FEFetchShader, pDC->drawId);
1740 #if USE_SIMD16_SHADERS
1741 state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo_lo, vin);
1742 #else
1743 state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo_lo, vin_lo);
1744
1745 if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
1746 {
1747 state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo_hi, vin_hi);
1748 }
1749 #endif
1750 RDTSC_END(FEFetchShader, 0);
1751
1752 // forward fetch-generated vertex IDs to the vertex shader
1753 #if USE_SIMD16_SHADERS
1754 #if USE_SIMD16_VS
1755 vsContext_lo.VertexID16 = _simd16_insert_si(
1756 vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0);
1757 vsContext_lo.VertexID16 = _simd16_insert_si(
1758 vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1);
1759 #else
1760 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
1761 vsContext_hi.VertexID = fetchInfo_lo.VertexID2;
1762 #endif
1763 #else
1764 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
1765 vsContext_hi.VertexID = fetchInfo_hi.VertexID;
1766 #endif
1767
1768 // Setup active mask for vertex shader.
1769 #if USE_SIMD16_VS
1770 vsContext_lo.mask16 = GenerateMask16(endVertex - i);
1771 #else
1772 vsContext_lo.mask = GenerateMask(endVertex - i);
1773 vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
1774 #endif
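// GenerateMask{16}(n) enables only the low min(n, SIMD width) lanes, so a
// final partial SIMD past endVertex leaves its excess lanes inactive in the VS.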
1775
1776 // forward cut mask to the PA
1777 if (IsIndexedT::value)
1778 {
1779 #if USE_SIMD16_SHADERS
1780 *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
1781 *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask2));
1782 #else
1783 *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
1784 *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
1785 #endif
1786 }
1787
1788 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
1789
1790 #if KNOB_ENABLE_TOSS_POINTS
1791 if (!KNOB_TOSS_FETCH)
1792 #endif
1793 {
1794 RDTSC_BEGIN(FEVertexShader, pDC->drawId);
1795 #if USE_SIMD16_VS
1796 state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);
1797 AR_EVENT(VSStats(vsContext_lo.stats.numInstExecuted));
1798 #else
1799 state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);
1800 AR_EVENT(VSStats(vsContext_lo.stats.numInstExecuted));
1801
1802 if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
1803 {
1804 state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi);
1805 AR_EVENT(VSStats(vsContext_hi.stats.numInstExecuted));
1806 }
1807 #endif
1808 RDTSC_END(FEVertexShader, 0);
1809
1810 UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
1811 }
1812 }
1813
1814 // 2. Assemble primitives from the last two SIMDs of shaded vertices.
1815 do
1816 {
1817 simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];
1818
1819 RDTSC_BEGIN(FEPAAssemble, pDC->drawId);
1820 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
1821 RDTSC_END(FEPAAssemble, 1);
1822
1823 #if KNOB_ENABLE_TOSS_POINTS
1824 if (!KNOB_TOSS_FETCH)
1825 #endif
1826 {
1827 #if KNOB_ENABLE_TOSS_POINTS
1828 if (!KNOB_TOSS_VS)
1829 #endif
1830 {
1831 if (assemble)
1832 {
1833 UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
1834
1835 const uint32_t numPrims = pa.NumPrims();
1836 const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
1837 const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
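// Split the up-to-16 assembled prims into 8-wide halves; numPrims_hi is
// zero whenever numPrims <= KNOB_SIMD_WIDTH.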
1838
1839 const simd16scalari primID = pa.GetPrimID(work.startPrimID);
1840 const simdscalari primID_lo = _simd16_extract_si(primID, 0);
1841 const simdscalari primID_hi = _simd16_extract_si(primID, 1);
1842
1843 if (HasTessellationT::value)
1844 {
1845 pa.useAlternateOffset = false;
1846 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);
1847
1848 if (numPrims_hi)
1849 {
1850 pa.useAlternateOffset = true;
1851 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);
1852 }
1853 }
1854 else if (HasGeometryShaderT::value)
1855 {
1856 pa.useAlternateOffset = false;
1857 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);
1858
1859 if (numPrims_hi)
1860 {
1861 pa.useAlternateOffset = true;
1862 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);
1863 }
1864 }
1865 else
1866 {
1867 // If streamout is enabled then stream vertices out to memory.
1868 if (HasStreamOutT::value)
1869 {
1870 pa.useAlternateOffset = false;
1871 StreamOut(pDC, pa, workerId, pSoPrimData, 0);
1872 }
1873
1874 if (HasRastT::value)
1875 {
1876 SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
1877 // Gather data from the SGV slot if provided.
1878 simd16scalari vpai = SIMD16::setzero_si();
1879 simd16scalari rtai = SIMD16::setzero_si();
1880 SIMD16::Vec4 svgAttrib[4];
1881
1882 if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
1883 {
1884 pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
1885 }
1886
1887
1888 if (state.backendState.readViewportArrayIndex)
1889 {
1890 vpai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
1891 pa.viewportArrayActive = true;
1892 }
1893 if (state.backendState.readRenderTargetArrayIndex)
1894 {
1895 rtai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
1896 pa.rtArrayActive = true;
1897 }
1898
1899 {
1900 // OOB VPAI indices => forced to zero.
1901 vpai = SIMD16::max_epi32(vpai, SIMD16::setzero_si());
1902 simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1903 simd16scalari vClearMask = SIMD16::cmplt_epi32(vpai, vNumViewports);
1904 vpai = SIMD16::and_si(vClearMask, vpai);
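// cmplt yields all-ones lanes where vpai < KNOB_NUM_VIEWPORTS_SCISSORS;
// AND-ing with vpai therefore zeroes any out-of-range index (negative
// indices were already clamped by max_epi32 above).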
1905
1906 pa.useAlternateOffset = false;
1907 pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID, vpai, rtai);
1908 }
1909 }
1910 }
1911 }
1912 }
1913 }
1914 } while (pa.NextPrim());
1915
1916 if (IsIndexedT::value)
1917 {
1918 fetchInfo_lo.xpIndices = fetchInfo_lo.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
1919 fetchInfo_hi.xpIndices = fetchInfo_hi.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
1920 }
1921 else
1922 {
1923 vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
1924 }
1925
1926 i += KNOB_SIMD16_WIDTH;
1927 }
1928
1929 pa.Reset();
1930 }
1931
1932 #else
1933 SWR_VS_CONTEXT vsContext;
1934 SWR_FETCH_CONTEXT fetchInfo = { 0 };
1935
1936 fetchInfo.pStreams = &state.vertexBuffers[0];
1937 fetchInfo.StartInstance = work.startInstance;
1938 fetchInfo.StartVertex = 0;
1939
1940 if (IsIndexedT::value)
1941 {
1942 fetchInfo.BaseVertex = work.baseVertex;
1943
1944 // If the entire index buffer isn't consumed, set the last index so that
1945 // fetches of fewer than a full SIMD width of indices are masked off.
1946 fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
1947 if (xpLastRequestedIndex < (gfxptr_t)fetchInfo.pLastIndex)
1948 {
1949 fetchInfo.pLastIndex = (const int32_t*)xpLastRequestedIndex;
1950 }
1951 }
1952 else
1953 {
1954 fetchInfo.StartVertex = work.startVertex;
1955 }
1956
1957 const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1958
1959 /// @todo The instance loop temporarily lives in the FE to ensure stream-out ordering.
1960 for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1961 {
1962 simdscalari vIndex;
1963 uint32_t i = 0;
1964
1965 if (IsIndexedT::value)
1966 {
1967 fetchInfo.pIndices = work.pIB;
1968 }
1969 else
1970 {
1971 vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
1972 fetchInfo.pIndices = (const int32_t*)&vIndex;
1973 }
1974
1975 fetchInfo.CurInstance = instanceNum;
1976 vsContext.InstanceID = instanceNum;
1977
1978 while (pa.HasWork())
1979 {
1980 // GetNextVsOutput currently has the side effect of updating some PA state
1981 // machine state, so it must stay outside of the (i < endVertex) check below.
1982 simdmask* pvCutIndices = nullptr;
1983 if (IsIndexedT::value)
1984 {
1985 pvCutIndices = &pa.GetNextVsIndices();
1986 }
1987
1988 simdvertex& vout = pa.GetNextVsOutput();
1989 vsContext.pVin = &vout;
1990 vsContext.pVout = &vout;
1991
1992 if (i < endVertex)
1993 {
1994
1995 // 1. Execute the fetch shader (FS) and vertex shader (VS) for a single SIMD.
1996 RDTSC_BEGIN(FEFetchShader, pDC->drawId);
1997 state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo, vout);
1998 RDTSC_END(FEFetchShader, 0);
1999
2000 // forward fetch-generated vertex IDs to the vertex shader
2001 vsContext.VertexID = fetchInfo.VertexID;
2002
2003 // Setup active mask for vertex shader.
2004 vsContext.mask = GenerateMask(endVertex - i);
2005
2006 // forward cut mask to the PA
2007 if (IsIndexedT::value)
2008 {
2009 *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
2010 }
2011
2012 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
2013
2014 #if KNOB_ENABLE_TOSS_POINTS
2015 if (!KNOB_TOSS_FETCH)
2016 #endif
2017 {
2018 RDTSC_BEGIN(FEVertexShader, pDC->drawId);
2019 state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
2020 RDTSC_END(FEVertexShader, 0);
2021
2022 UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
2023 AR_EVENT(VSStats(vsContext.stats.numInstExecuted));
2024 }
2025 }
2026
2027 // 2. Assemble primitives from the last two SIMDs of shaded vertices.
2028 do
2029 {
2030 simdvector prim[MAX_NUM_VERTS_PER_PRIM];
2031 // pa.Assemble returns false if there are not enough verts to assemble.
2032 RDTSC_BEGIN(FEPAAssemble, pDC->drawId);
2033 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
2034 RDTSC_END(FEPAAssemble, 1);
2035
2036 #if KNOB_ENABLE_TOSS_POINTS
2037 if (!KNOB_TOSS_FETCH)
2038 #endif
2039 {
2040 #if KNOB_ENABLE_TOSS_POINTS
2041 if (!KNOB_TOSS_VS)
2042 #endif
2043 {
2044 if (assemble)
2045 {
2046 UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
2047
2048 if (HasTessellationT::value)
2049 {
2050 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
2051 pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));
2052 }
2053 else if (HasGeometryShaderT::value)
2054 {
2055 GeometryShaderStage<HasStreamOutT, HasRastT>(
2056 pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));
2057 }
2058 else
2059 {
2060 // If streamout is enabled then stream vertices out to memory.
2061 if (HasStreamOutT::value)
2062 {
2063 StreamOut(pDC, pa, workerId, pSoPrimData, 0);
2064 }
2065
2066 if (HasRastT::value)
2067 {
2068 SWR_ASSERT(pDC->pState->pfnProcessPrims);
2069
2070 // Gather data from the SGV slot if provided.
2071 simdscalari vViewportIdx = SIMD::setzero_si();
2072 simdscalari vRtIdx = SIMD::setzero_si();
2073 SIMD::Vec4 svgAttrib[4];
2074
2075 if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
2076 {
2077 pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
2078 }
2079
2080 if (state.backendState.readViewportArrayIndex)
2081 {
2082 vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
2083
2084 // OOB VPAI indices => forced to zero.
2085 vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
2086 simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
2087 simdscalari vClearMask = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
2088 vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
2089 pa.viewportArrayActive = true;
2090 }
2091 if (state.backendState.readRenderTargetArrayIndex)
2092 {
2093 vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
2094 pa.rtArrayActive = true;
2095 }
2096
2097 pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
2098 GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), vViewportIdx, vRtIdx);
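// GenMask(pa.NumPrims()) sets one bit per valid assembled primitive so the
// binner ignores unused SIMD lanes on a final, partially filled pass.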
2099 }
2100 }
2101 }
2102 }
2103 }
2104 } while (pa.NextPrim());
2105
2106 if (IsIndexedT::value)
2107 {
2108 fetchInfo.pIndices = (const int32_t*)((const uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
2109 }
2110 else
2111 {
2112 vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
2113 }
2114
2115 i += KNOB_SIMD_WIDTH;
2116 }
2117 pa.Reset();
2118 }
2119
2120 #endif
2121
2122 RDTSC_END(FEProcessDraw, numPrims * work.numInstances);
2123 }
2124
2125 struct FEDrawChooser
2126 {
2127 typedef PFN_FE_WORK_FUNC FuncType;
2128
2129 template <typename... ArgsB>
2130 static FuncType GetFunc()
2131 {
2132 return ProcessDraw<ArgsB...>;
2133 }
2134 };
2135
2136
2137 // Selector for correct templated Draw front-end function
2138 PFN_FE_WORK_FUNC GetProcessDrawFunc(
2139 bool IsIndexed,
2140 bool IsCutIndexEnabled,
2141 bool HasTessellation,
2142 bool HasGeometryShader,
2143 bool HasStreamOut,
2144 bool HasRasterization)
2145 {
2146 return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
2147 }
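// TemplateArgUnroller effectively converts each runtime bool into a
// compile-time integral_constant template argument, returning the fully
// specialized ProcessDraw instantiation for this draw's pipeline state.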