c150c51199f5b584aef5863dc6218d8a714ea07d
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "utils.h"
36 #include "threads.h"
37 #include "pa.h"
38 #include "clip.h"
39 #include "tilemgr.h"
40 #include "tessellator.h"
41 #include <limits>
42
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Helper macro to generate a bitmask
45 static INLINE uint32_t GenMask(uint32_t numBits)
46 {
47 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
48 return ((1U << numBits) - 1);
49 }
50
51 //////////////////////////////////////////////////////////////////////////
52 /// @brief FE handler for SwrSync.
53 /// @param pContext - pointer to SWR context.
54 /// @param pDC - pointer to draw context.
55 /// @param workerId - thread's worker id. Even thread has a unique id.
56 /// @param pUserData - Pointer to user data passed back to sync callback.
57 /// @todo This should go away when we switch this to use compute threading.
58 void ProcessSync(
59 SWR_CONTEXT *pContext,
60 DRAW_CONTEXT *pDC,
61 uint32_t workerId,
62 void *pUserData)
63 {
64 BE_WORK work;
65 work.type = SYNC;
66 work.pfnWork = ProcessSyncBE;
67
68 MacroTileMgr *pTileMgr = pDC->pTileMgr;
69 pTileMgr->enqueue(0, 0, &work);
70 }
71
72 //////////////////////////////////////////////////////////////////////////
73 /// @brief FE handler for SwrDestroyContext.
74 /// @param pContext - pointer to SWR context.
75 /// @param pDC - pointer to draw context.
76 /// @param workerId - thread's worker id. Even thread has a unique id.
77 /// @param pUserData - Pointer to user data passed back to sync callback.
78 void ProcessShutdown(
79 SWR_CONTEXT *pContext,
80 DRAW_CONTEXT *pDC,
81 uint32_t workerId,
82 void *pUserData)
83 {
84 BE_WORK work;
85 work.type = SHUTDOWN;
86 work.pfnWork = ProcessShutdownBE;
87
88 MacroTileMgr *pTileMgr = pDC->pTileMgr;
89 // Enqueue at least 1 work item for each worker thread
90 // account for number of numa nodes
91 uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;
92
93 for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
94 {
95 for (uint32_t n = 0; n < numNumaNodes; ++n)
96 {
97 pTileMgr->enqueue(i, n, &work);
98 }
99 }
100 }
101
102 //////////////////////////////////////////////////////////////////////////
103 /// @brief FE handler for SwrClearRenderTarget.
104 /// @param pContext - pointer to SWR context.
105 /// @param pDC - pointer to draw context.
106 /// @param workerId - thread's worker id. Even thread has a unique id.
107 /// @param pUserData - Pointer to user data passed back to clear callback.
108 /// @todo This should go away when we switch this to use compute threading.
109 void ProcessClear(
110 SWR_CONTEXT *pContext,
111 DRAW_CONTEXT *pDC,
112 uint32_t workerId,
113 void *pUserData)
114 {
115 CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
116 MacroTileMgr *pTileMgr = pDC->pTileMgr;
117
118 // queue a clear to each macro tile
119 // compute macro tile bounds for the specified rect
120 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
121 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
122 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
123 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
124
125 BE_WORK work;
126 work.type = CLEAR;
127 work.pfnWork = ProcessClearBE;
128 work.desc.clear = *pDesc;
129
130 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
131 {
132 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
133 {
134 pTileMgr->enqueue(x, y, &work);
135 }
136 }
137 }
138
139 //////////////////////////////////////////////////////////////////////////
140 /// @brief FE handler for SwrStoreTiles.
141 /// @param pContext - pointer to SWR context.
142 /// @param pDC - pointer to draw context.
143 /// @param workerId - thread's worker id. Even thread has a unique id.
144 /// @param pUserData - Pointer to user data passed back to callback.
145 /// @todo This should go away when we switch this to use compute threading.
146 void ProcessStoreTiles(
147 SWR_CONTEXT *pContext,
148 DRAW_CONTEXT *pDC,
149 uint32_t workerId,
150 void *pUserData)
151 {
152 AR_BEGIN(FEProcessStoreTiles, pDC->drawId);
153 MacroTileMgr *pTileMgr = pDC->pTileMgr;
154 STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
155
156 // queue a store to each macro tile
157 // compute macro tile bounds for the specified rect
158 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
159 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
160 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
161 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
162
163 // store tiles
164 BE_WORK work;
165 work.type = STORETILES;
166 work.pfnWork = ProcessStoreTilesBE;
167 work.desc.storeTiles = *pDesc;
168
169 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
170 {
171 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
172 {
173 pTileMgr->enqueue(x, y, &work);
174 }
175 }
176
177 AR_END(FEProcessStoreTiles, 0);
178 }
179
180 //////////////////////////////////////////////////////////////////////////
181 /// @brief FE handler for SwrInvalidateTiles.
182 /// @param pContext - pointer to SWR context.
183 /// @param pDC - pointer to draw context.
184 /// @param workerId - thread's worker id. Even thread has a unique id.
185 /// @param pUserData - Pointer to user data passed back to callback.
186 /// @todo This should go away when we switch this to use compute threading.
187 void ProcessDiscardInvalidateTiles(
188 SWR_CONTEXT *pContext,
189 DRAW_CONTEXT *pDC,
190 uint32_t workerId,
191 void *pUserData)
192 {
193 AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
194 DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
195 MacroTileMgr *pTileMgr = pDC->pTileMgr;
196
197 // compute macro tile bounds for the specified rect
198 uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
199 uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
200 uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
201 uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;
202
203 if (pDesc->fullTilesOnly == false)
204 {
205 // include partial tiles
206 macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
207 macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
208 macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
209 macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
210 }
211
212 SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
213 SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);
214
215 macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
216 macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);
217
218 // load tiles
219 BE_WORK work;
220 work.type = DISCARDINVALIDATETILES;
221 work.pfnWork = ProcessDiscardInvalidateTilesBE;
222 work.desc.discardInvalidateTiles = *pDesc;
223
224 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
225 {
226 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
227 {
228 pTileMgr->enqueue(x, y, &work);
229 }
230 }
231
232 AR_END(FEProcessInvalidateTiles, 0);
233 }
234
235 //////////////////////////////////////////////////////////////////////////
236 /// @brief Computes the number of primitives given the number of verts.
237 /// @param mode - primitive topology for draw operation.
238 /// @param numPrims - number of vertices or indices for draw.
239 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
240 uint32_t GetNumPrims(
241 PRIMITIVE_TOPOLOGY mode,
242 uint32_t numPrims)
243 {
244 switch (mode)
245 {
246 case TOP_POINT_LIST: return numPrims;
247 case TOP_TRIANGLE_LIST: return numPrims / 3;
248 case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
249 case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
250 case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
251 case TOP_QUAD_LIST: return numPrims / 4;
252 case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
253 case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
254 case TOP_LINE_LIST: return numPrims / 2;
255 case TOP_LINE_LOOP: return numPrims;
256 case TOP_RECT_LIST: return numPrims / 3;
257 case TOP_LINE_LIST_ADJ: return numPrims / 4;
258 case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
259 case TOP_TRI_LIST_ADJ: return numPrims / 6;
260 case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
261
262 case TOP_PATCHLIST_1:
263 case TOP_PATCHLIST_2:
264 case TOP_PATCHLIST_3:
265 case TOP_PATCHLIST_4:
266 case TOP_PATCHLIST_5:
267 case TOP_PATCHLIST_6:
268 case TOP_PATCHLIST_7:
269 case TOP_PATCHLIST_8:
270 case TOP_PATCHLIST_9:
271 case TOP_PATCHLIST_10:
272 case TOP_PATCHLIST_11:
273 case TOP_PATCHLIST_12:
274 case TOP_PATCHLIST_13:
275 case TOP_PATCHLIST_14:
276 case TOP_PATCHLIST_15:
277 case TOP_PATCHLIST_16:
278 case TOP_PATCHLIST_17:
279 case TOP_PATCHLIST_18:
280 case TOP_PATCHLIST_19:
281 case TOP_PATCHLIST_20:
282 case TOP_PATCHLIST_21:
283 case TOP_PATCHLIST_22:
284 case TOP_PATCHLIST_23:
285 case TOP_PATCHLIST_24:
286 case TOP_PATCHLIST_25:
287 case TOP_PATCHLIST_26:
288 case TOP_PATCHLIST_27:
289 case TOP_PATCHLIST_28:
290 case TOP_PATCHLIST_29:
291 case TOP_PATCHLIST_30:
292 case TOP_PATCHLIST_31:
293 case TOP_PATCHLIST_32:
294 return numPrims / (mode - TOP_PATCHLIST_BASE);
295
296 case TOP_POLYGON:
297 case TOP_POINT_LIST_BF:
298 case TOP_LINE_STRIP_CONT:
299 case TOP_LINE_STRIP_BF:
300 case TOP_LINE_STRIP_CONT_BF:
301 case TOP_TRIANGLE_FAN_NOSTIPPLE:
302 case TOP_TRI_STRIP_REVERSE:
303 case TOP_PATCHLIST_BASE:
304 case TOP_UNKNOWN:
305 SWR_ASSERT(false, "Unsupported topology: %d", mode);
306 return 0;
307 }
308
309 return 0;
310 }
311
312 //////////////////////////////////////////////////////////////////////////
313 /// @brief Computes the number of verts given the number of primitives.
314 /// @param mode - primitive topology for draw operation.
315 /// @param numPrims - number of primitives for draw.
316 uint32_t GetNumVerts(
317 PRIMITIVE_TOPOLOGY mode,
318 uint32_t numPrims)
319 {
320 switch (mode)
321 {
322 case TOP_POINT_LIST: return numPrims;
323 case TOP_TRIANGLE_LIST: return numPrims * 3;
324 case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
325 case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
326 case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
327 case TOP_QUAD_LIST: return numPrims * 4;
328 case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
329 case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
330 case TOP_LINE_LIST: return numPrims * 2;
331 case TOP_LINE_LOOP: return numPrims;
332 case TOP_RECT_LIST: return numPrims * 3;
333 case TOP_LINE_LIST_ADJ: return numPrims * 4;
334 case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
335 case TOP_TRI_LIST_ADJ: return numPrims * 6;
336 case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
337
338 case TOP_PATCHLIST_1:
339 case TOP_PATCHLIST_2:
340 case TOP_PATCHLIST_3:
341 case TOP_PATCHLIST_4:
342 case TOP_PATCHLIST_5:
343 case TOP_PATCHLIST_6:
344 case TOP_PATCHLIST_7:
345 case TOP_PATCHLIST_8:
346 case TOP_PATCHLIST_9:
347 case TOP_PATCHLIST_10:
348 case TOP_PATCHLIST_11:
349 case TOP_PATCHLIST_12:
350 case TOP_PATCHLIST_13:
351 case TOP_PATCHLIST_14:
352 case TOP_PATCHLIST_15:
353 case TOP_PATCHLIST_16:
354 case TOP_PATCHLIST_17:
355 case TOP_PATCHLIST_18:
356 case TOP_PATCHLIST_19:
357 case TOP_PATCHLIST_20:
358 case TOP_PATCHLIST_21:
359 case TOP_PATCHLIST_22:
360 case TOP_PATCHLIST_23:
361 case TOP_PATCHLIST_24:
362 case TOP_PATCHLIST_25:
363 case TOP_PATCHLIST_26:
364 case TOP_PATCHLIST_27:
365 case TOP_PATCHLIST_28:
366 case TOP_PATCHLIST_29:
367 case TOP_PATCHLIST_30:
368 case TOP_PATCHLIST_31:
369 case TOP_PATCHLIST_32:
370 return numPrims * (mode - TOP_PATCHLIST_BASE);
371
372 case TOP_POLYGON:
373 case TOP_POINT_LIST_BF:
374 case TOP_LINE_STRIP_CONT:
375 case TOP_LINE_STRIP_BF:
376 case TOP_LINE_STRIP_CONT_BF:
377 case TOP_TRIANGLE_FAN_NOSTIPPLE:
378 case TOP_TRI_STRIP_REVERSE:
379 case TOP_PATCHLIST_BASE:
380 case TOP_UNKNOWN:
381 SWR_ASSERT(false, "Unsupported topology: %d", mode);
382 return 0;
383 }
384
385 return 0;
386 }
387
388 //////////////////////////////////////////////////////////////////////////
389 /// @brief Return number of verts per primitive.
390 /// @param topology - topology
391 /// @param includeAdjVerts - include adjacent verts in primitive vertices
392 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
393 {
394 uint32_t numVerts = 0;
395 switch (topology)
396 {
397 case TOP_POINT_LIST:
398 case TOP_POINT_LIST_BF:
399 numVerts = 1;
400 break;
401 case TOP_LINE_LIST:
402 case TOP_LINE_STRIP:
403 case TOP_LINE_LIST_ADJ:
404 case TOP_LINE_LOOP:
405 case TOP_LINE_STRIP_CONT:
406 case TOP_LINE_STRIP_BF:
407 case TOP_LISTSTRIP_ADJ:
408 numVerts = 2;
409 break;
410 case TOP_TRIANGLE_LIST:
411 case TOP_TRIANGLE_STRIP:
412 case TOP_TRIANGLE_FAN:
413 case TOP_TRI_LIST_ADJ:
414 case TOP_TRI_STRIP_ADJ:
415 case TOP_TRI_STRIP_REVERSE:
416 case TOP_RECT_LIST:
417 numVerts = 3;
418 break;
419 case TOP_QUAD_LIST:
420 case TOP_QUAD_STRIP:
421 numVerts = 4;
422 break;
423 case TOP_PATCHLIST_1:
424 case TOP_PATCHLIST_2:
425 case TOP_PATCHLIST_3:
426 case TOP_PATCHLIST_4:
427 case TOP_PATCHLIST_5:
428 case TOP_PATCHLIST_6:
429 case TOP_PATCHLIST_7:
430 case TOP_PATCHLIST_8:
431 case TOP_PATCHLIST_9:
432 case TOP_PATCHLIST_10:
433 case TOP_PATCHLIST_11:
434 case TOP_PATCHLIST_12:
435 case TOP_PATCHLIST_13:
436 case TOP_PATCHLIST_14:
437 case TOP_PATCHLIST_15:
438 case TOP_PATCHLIST_16:
439 case TOP_PATCHLIST_17:
440 case TOP_PATCHLIST_18:
441 case TOP_PATCHLIST_19:
442 case TOP_PATCHLIST_20:
443 case TOP_PATCHLIST_21:
444 case TOP_PATCHLIST_22:
445 case TOP_PATCHLIST_23:
446 case TOP_PATCHLIST_24:
447 case TOP_PATCHLIST_25:
448 case TOP_PATCHLIST_26:
449 case TOP_PATCHLIST_27:
450 case TOP_PATCHLIST_28:
451 case TOP_PATCHLIST_29:
452 case TOP_PATCHLIST_30:
453 case TOP_PATCHLIST_31:
454 case TOP_PATCHLIST_32:
455 numVerts = topology - TOP_PATCHLIST_BASE;
456 break;
457 default:
458 SWR_ASSERT(false, "Unsupported topology: %d", topology);
459 break;
460 }
461
462 if (includeAdjVerts)
463 {
464 switch (topology)
465 {
466 case TOP_LISTSTRIP_ADJ:
467 case TOP_LINE_LIST_ADJ: numVerts = 4; break;
468 case TOP_TRI_STRIP_ADJ:
469 case TOP_TRI_LIST_ADJ: numVerts = 6; break;
470 default: break;
471 }
472 }
473
474 return numVerts;
475 }
476
477 //////////////////////////////////////////////////////////////////////////
478 /// @brief Generate mask from remaining work.
479 /// @param numWorkItems - Number of items being worked on by a SIMD.
480 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
481 {
482 uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
483 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
484 return _simd_castps_si(vMask(mask));
485 }
486
//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMDs worth of triangles.
/// @param pDC - pointer to draw context.
/// @param pa - primitive assembly object providing the prims to stream out.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pPrimData - scratch buffer that assembled per-prim attributes are
///        staged into before the streamout JIT consumes them.
/// @param streamIndex - index of the SO stream being written.
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
    uint32_t streamIndex)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEStreamout, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    // Verts per prim without adjacency verts; streamout never writes adj verts.
    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    uint32_t numPrims = pa.NumPrims();
    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        // Bitmask of attribute slots enabled for this SO stream.
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        while (_BitScanForward(&slot, soMask))
        {
            __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is relative offset from start of vertex.
            // NOTE(review): the offset is computed from 'slot' directly, so
            // prim-data slot N holds PA slot N + VERTEX_ATTRIB_START_SLOT; the
            // SO JIT must use the same slot numbering — confirm against the
            // streamout shader generator.
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }
            // Clear the processed slot and move to the next enabled attribute.
            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    AR_END(FEStreamout, 1);
}
577
578 //////////////////////////////////////////////////////////////////////////
579 /// @brief Computes number of invocations. The current index represents
580 /// the start of the SIMD. The max index represents how much work
581 /// items are remaining. If there is less then a SIMD's xmin of work
582 /// then return the remaining amount of work.
583 /// @param curIndex - The start index for the SIMD.
584 /// @param maxIndex - The last index for all work items.
585 static INLINE uint32_t GetNumInvocations(
586 uint32_t curIndex,
587 uint32_t maxIndex)
588 {
589 uint32_t remainder = (maxIndex - curIndex);
590 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
591 }
592
593 //////////////////////////////////////////////////////////////////////////
594 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
595 /// The geometry shader will loop over each active streamout buffer, assembling
596 /// primitives for the downstream stages. When multistream output is enabled,
597 /// the generated stream ID buffer from the GS needs to be converted to a cut
598 /// buffer for the primitive assembler.
599 /// @param stream - stream id to generate the cut buffer for
600 /// @param pStreamIdBase - pointer to the stream ID buffer
601 /// @param numEmittedVerts - Number of total verts emitted by the GS
602 /// @param pCutBuffer - output buffer to write cuts to
603 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
604 {
605 SWR_ASSERT(stream < MAX_SO_STREAMS);
606
607 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
608 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
609
610 for (uint32_t b = 0; b < numOutputBytes; ++b)
611 {
612 uint8_t curInputByte = pStreamIdBase[2*b];
613 uint8_t outByte = 0;
614 for (uint32_t i = 0; i < 4; ++i)
615 {
616 if ((curInputByte & 0x3) != stream)
617 {
618 outByte |= (1 << i);
619 }
620 curInputByte >>= 2;
621 }
622
623 curInputByte = pStreamIdBase[2 * b + 1];
624 for (uint32_t i = 0; i < 4; ++i)
625 {
626 if ((curInputByte & 0x3) != stream)
627 {
628 outByte |= (1 << (i + 4));
629 }
630 curInputByte >>= 2;
631 }
632
633 *pCutBuffer++ = outByte;
634 }
635 }
636
// Per worker-thread GS execution context, reused across invocations to avoid
// re-initializing the (large) context on every primitive batch.
THREAD SWR_GS_CONTEXT tlsGsContext;
638
//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - cut (single stream) or stream-id (multi stream) buffer
///        written by the GS, one region per input prim per GS instance.
/// @param pStreamCutBuffer - scratch cut buffer used when translating a
///        multi-stream stream-id buffer, one stream at a time.
/// @param pSoPrimData - scratch buffer for streamout primitive data.
/// @param primID - SIMD of input primitive IDs.
template <
    typename HasStreamOutT,
    typename HasRastT>
static void GeometryShaderStage(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pStreamCutBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEGeometryShader, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;

    SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
    SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");

    tlsGsContext.pStream = (uint8_t*)pGsOut;
    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
    tlsGsContext.PrimitiveID = primID;

    // Input prim vertex count, including adjacency verts (GS consumes them).
    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_ATTRIBUTES];

    // assemble all attributes for the input primitive
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
    }

    // Strides through the GS output buffer: each input prim gets enough
    // simd-padded vertex batches to hold maxNumVerts; each GS instance gets
    // one such region per SIMD lane.
    const uint32_t vertexStride = sizeof(simdvertex);
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    const uint32_t inputPrimStride = numSimdBatches * vertexStride;
    const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
    uint32_t cutPrimStride;
    uint32_t cutInstanceStride;

    if (pState->isSingleStream)
    {
        // single stream: 1 cut bit per possible output vertex
        cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }
    else
    {
        // multi stream: 2 stream-id bits per possible output vertex, dword aligned
        cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
    uint32_t numInputPrims = pa.NumPrims();

    // Execute the GS once per declared instance; each execution advances the
    // thread-local output pointers to that instance's region.
    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        tlsGsContext.InstanceID = instance;
        tlsGsContext.mask = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);

        tlsGsContext.pStream += instanceStride;
        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
    }

    // set up new binner and state for the GS output topology
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    //
    // NOTE(review): pVertexCount reinterprets the SIMD vertexCount register as
    // per-lane uint32s; it is indexed by inputPrim only, yet read inside the
    // per-instance loop below after the GS ran once per instance — confirm the
    // emitted vertex count is identical across instances, or this reads the
    // last instance's counts for all instances.
    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * instanceStride;
            uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;

            uint32_t numAttribs = state.feNumAttributes;

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool processCutVerts = false;

                // NOTE: intentionally shadows the 'pCutBuffer' parameter; this
                // local selects between the GS-written cut buffer and the
                // per-stream translated scratch buffer.
                uint8_t* pCutBuffer = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID = pState->singleStreamID;
                    if (streamID != stream) continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
                    pCutBuffer = (uint8_t*)pStreamCutBuffer;
                    processCutVerts = false;
                }

                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);

                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            // only the stream bound to the rasterizer is clipped/binned
                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
                                simdscalari vPrimId;
                                // pull primitiveID from the GS output if available
                                if (state.gsState.emitsPrimitiveID)
                                {
                                    simdvector primIdAttrib[3];
                                    gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
                                    vPrimId = _simd_castps_si(primIdAttrib[0].x);
                                }
                                else
                                {
                                    vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
                                }

                                // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
                                simdscalari vViewPortIdx;
                                if (state.gsState.emitsViewportArrayIndex)
                                {
                                    simdvector vpiAttrib[3];
                                    gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);

                                    // OOB indices => forced to zero.
                                    simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simdscalari vClearMask = _simd_cmplt_epi32(_simd_castps_si(vpiAttrib[0].x), vNumViewports);
                                    vpiAttrib[0].x = _simd_and_ps(_simd_castsi_ps(vClearMask), vpiAttrib[0].x);

                                    vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
                                }
                                else
                                {
                                    vViewPortIdx = _simd_set1_epi32(0);
                                }

                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
    AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
    AR_END(FEGeometryShader, 1);
}
859
860 //////////////////////////////////////////////////////////////////////////
861 /// @brief Allocate GS buffers
862 /// @param pDC - pointer to draw context.
863 /// @param state - API state
864 /// @param ppGsOut - pointer to GS output buffer allocation
865 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
866 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
867 void **ppStreamCutBuffer)
868 {
869 auto pArena = pDC->pArena;
870 SWR_ASSERT(pArena != nullptr);
871 SWR_ASSERT(state.gsState.gsEnable);
872 // allocate arena space to hold GS output verts
873 // @todo pack attribs
874 // @todo support multiple streams
875 const uint32_t vertexStride = sizeof(simdvertex);
876 const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
877 uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
878 *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));
879
880 const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
881 const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
882 const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
883 const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
884
885 // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
886 // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
887
888 // allocate space for temporary per-stream cut buffer if multi-stream is enabled
889 if (state.gsState.isSingleStream)
890 {
891 *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
892 *ppStreamCutBuffer = nullptr;
893 }
894 else
895 {
896 *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
897 *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
898 }
899
900 }
901
//////////////////////////////////////////////////////////////////////////
/// @brief Contains all data generated by the HS and passed to the
/// tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;               // hull shader execution context
    ScalarPatch patchData[KNOB_SIMD_WIDTH]; // per-SIMD-lane HS output patch data
    void* pTxCtx;                           // opaque tessellator context storage
    size_t tsCtxSize;                       // size in bytes of the pTxCtx allocation

    simdscalar* pDSOutput;                  // domain shader output buffer
    size_t numDSOutputVectors;              // capacity of pDSOutput, in simd vectors
};

// Per worker-thread tessellation scratch data; lazily allocated by
// AllocateTessellationData().
THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
917
918 //////////////////////////////////////////////////////////////////////////
919 /// @brief Allocate tessellation data for this worker thread.
920 INLINE
921 static void AllocateTessellationData(SWR_CONTEXT* pContext)
922 {
923 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
924 if (gt_pTessellationThreadData == nullptr)
925 {
926 gt_pTessellationThreadData = (TessellationThreadLocalData*)
927 AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
928 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
929 }
930 }
931
//////////////////////////////////////////////////////////////////////////
/// @brief Implements Tessellation Stages.
/// Runs the hull shader over the assembled patch primitives, tessellates each
/// patch, runs the domain shader over the generated domain points, and
/// forwards the resulting primitives to the GS / stream-out / clip stages as
/// selected by the template arguments.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - GS cut (or stream-id) buffer, forwarded to the GS stage.
/// @param pCutStreamBuffer - temporary per-stream cut buffer for multi-stream GS.
/// @param pSoPrimData - scratch space for stream-out primitive data.
/// @param primID - SIMD of input patch primitive IDs.
template <
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
static void TessellationStages(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pCutStreamBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    SWR_CONTEXT *pContext = pDC->pContext;
    const API_STATE& state = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;

    SWR_ASSERT(gt_pTessellationThreadData);

    // Initialize the tessellator with the per-thread context storage; when
    // that storage is too small TSInitCtx returns null, so reallocate with
    // the (updated) tsCtxSize and retry once.
    HANDLE tsCtx = TSInitCtx(
        tsState.domain,
        tsState.partitioning,
        tsState.tsOutputTopology,
        gt_pTessellationThreadData->pTxCtx,
        gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(
            tsState.domain,
            tsState.partitioning,
            tsState.tsOutputTopology,
            gt_pTessellationThreadData->pTxCtx,
            gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

    // Pick the clipper/binner matching the topology the DS will produce.
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.pCPout = gt_pTessellationThreadData->patchData;
    hsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
        }
    }

#if defined(_DEBUG)
    // Poison the HS output so reads of unwritten control points are visible.
    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

    uint32_t numPrims = pa.NumPrims();
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    AR_BEGIN(FEHullShader, pDC->drawId);
    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
    AR_END(FEHullShader, 0);

    UPDATE_STAT_FE(HsInvocations, numPrims);

    const uint32_t* pPrimId = (const uint32_t*)&primID;

    // Tessellate and domain-shade each patch (one per active SIMD lane).
    for (uint32_t p = 0; p < numPrims; ++p)
    {
        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = { 0 };
        AR_BEGIN(FETessellation, pDC->drawId);
        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
        AR_EVENT(TessPrimCount(1));
        AR_END(FETessellation, 0);

        if (tsData.NumPrimitives == 0)
        {
            // Patch produced nothing (tessellator culled it); skip to the next.
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory
        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
        if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
        {
            // Grow the cached per-thread DS output buffer; previous contents
            // are not needed, so free + alloc rather than realloc.
            AlignedFree(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);

#if defined(_DEBUG)
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID = pPrimId[p];
        dsContext.pCpIn = &hsContext.pCPout[p];
        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
        dsContext.vectorStride = requiredDSVectorInvocations;

        uint32_t dsInvocations = 0;

        // One DS invocation per SIMD-width group of domain points; the last
        // iteration's mask covers only the remaining points.
        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
        {
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            AR_BEGIN(FEDomainShader, pDC->drawId);
            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
            AR_END(FEDomainShader, 0);

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);

        // Assemble the tessellator-generated index list over the DS output.
        PA_TESS tessPa(
            pDC,
            dsContext.pOutputData,
            dsContext.vectorStride,
            tsState.numDsOutputAttribs,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology);

        while (tessPa.HasWork())
        {
            if (HasGeometryShaderT::value)
            {
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
            }
            else
            {
                if (HasStreamOutT::value)
                {
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
                    simdvector prim[3]; // Only deal with triangles, lines, or points
                    AR_BEGIN(FEPAAssemble, pDC->drawId);
#if SWR_ENABLE_ASSERTS
                    bool assemble =
#endif
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
                    AR_END(FEPAAssemble, 1);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
                    pfnClipFunc(pDC, tessPa, workerId, prim,
                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    } // for (uint32_t p = 0; p < numPrims; ++p)

    TSDestroyCtx(tsCtx);
}
1127
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDraw.
/// Fetches and vertex-shades the draw's vertices a SIMD batch at a time,
/// assembles primitives, and forwards them through the optional
/// tessellation / GS / stream-out stages to binning.
/// @tparam IsIndexedT - Is indexed drawing enabled
/// @tparam IsCutIndexEnabledT - Is the primitive-restart cut index enabled
/// @tparam HasTessellationT - Is tessellation enabled
/// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
/// @tparam HasStreamOutT - Is stream-out enabled
/// @tparam HasRastT - Is rasterization enabled
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id.
/// @param pUserData - Pointer to DRAW_WORK
template <
    typename IsIndexedT,
    typename IsCutIndexEnabledT,
    typename HasTessellationT,
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
void ProcessDraw(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{

#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif

    AR_BEGIN(FEProcessDraw, pDC->drawId);

    DRAW_WORK& work = *(DRAW_WORK*)pUserData;
    const API_STATE& state = GetApiState(pDC);

    uint32_t indexSize = 0;
    uint32_t endVertex = work.numVerts;

    // For indexed draws, compute the address just past the last requested
    // index so partial-SIMD fetches can be masked off at the buffer end.
    const int32_t* pLastRequestedIndex = nullptr;
    if (IsIndexedT::value)
    {
        switch (work.type)
        {
        case R32_UINT:
            indexSize = sizeof(uint32_t);
            pLastRequestedIndex = &(work.pIB[endVertex]);
            break;
        case R16_UINT:
            indexSize = sizeof(uint16_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
            break;
        case R8_UINT:
            indexSize = sizeof(uint8_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
            break;
        default:
            SWR_ASSERT(0);
        }
    }
    else
    {
        // No cuts, prune partial primitives.
        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
    }

#if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
    // Only needed for the AR_END stat at the bottom of this function.
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    void* pGsOut = nullptr;
    void* pCutBuffer = nullptr;
    void* pStreamCutBuffer = nullptr;
    if (HasGeometryShaderT::value)
    {
        AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
    }

    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
    }

    // choose primitive assembler
    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
    PA_STATE& pa = paFactory.GetPA();

#if USE_SIMD16_FRONTEND
    // SIMD16 front end: fetch/shade two SIMD8 halves (lo/hi) per iteration,
    // merge into one SIMD16 vertex batch, and split back into SIMD8 halves
    // when handing primitives to the binner.
    simdvertex vin_lo;
    simdvertex vin_hi;
    SWR_VS_CONTEXT vsContext_lo;
    SWR_VS_CONTEXT vsContext_hi;

    vsContext_lo.pVin = &vin_lo;
    vsContext_hi.pVin = &vin_hi;

    SWR_FETCH_CONTEXT fetchInfo_lo = { 0 };

    fetchInfo_lo.pStreams = &state.vertexBuffers[0];
    fetchInfo_lo.StartInstance = work.startInstance;
    fetchInfo_lo.StartVertex = 0;

    if (IsIndexedT::value)
    {
        fetchInfo_lo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo_lo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo_lo.pLastIndex)
        {
            fetchInfo_lo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo_lo.StartVertex = work.startVertex;
    }

    SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;

    const simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        uint32_t i = 0;

        simd16scalari vIndex;

        if (IsIndexedT::value)
        {
            fetchInfo_lo.pIndices = work.pIB;
            fetchInfo_hi.pIndices = (int32_t *)((uint8_t *)fetchInfo_lo.pIndices + KNOB_SIMD_WIDTH * indexSize);    // 1/2 of KNOB_SIMD16_WIDTH
        }
        else
        {
            vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);

            fetchInfo_lo.pIndices = (const int32_t *)&vIndex.lo;
            fetchInfo_hi.pIndices = (const int32_t *)&vIndex.hi;
        }

        fetchInfo_lo.CurInstance = instanceNum;
        fetchInfo_hi.CurInstance = instanceNum;

        vsContext_lo.InstanceID = instanceNum;
        vsContext_hi.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // GetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.

            simdmask *pvCutIndices_lo = nullptr;
            simdmask *pvCutIndices_hi = nullptr;

            if (IsIndexedT::value)
            {
                // simd16mask <=> simdmask[2]

                pvCutIndices_lo = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[0];
                pvCutIndices_hi = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[1];
            }

            simdvertex vout_lo;
            simdvertex vout_hi;

            vsContext_lo.pVout = &vout_lo;
            vsContext_hi.pVout = &vout_hi;

            simd16vertex &vout = pa.GetNextVsOutput();

            if (i < endVertex)
            {
                // 1. Execute FS/VS for a single SIMD.
                AR_BEGIN(FEFetchShader, pDC->drawId);
                state.pfnFetchFunc(fetchInfo_lo, vin_lo);
                if ((i + KNOB_SIMD_WIDTH) < endVertex)
                {
                    // hi half only when more than a SIMD8's worth of vertices remain
                    state.pfnFetchFunc(fetchInfo_hi, vin_hi);
                }
                AR_END(FEFetchShader, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext_lo.VertexID = fetchInfo_lo.VertexID;
                vsContext_hi.VertexID = fetchInfo_hi.VertexID;

                // Setup active mask for vertex shader.
                vsContext_lo.mask = GenerateMask(endVertex - i);
                vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
                    *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    AR_BEGIN(FEVertexShader, pDC->drawId);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);

                    // copy SIMD vout_lo to lo part of SIMD16 vout
                    {
                        const uint32_t voutNumSlots = VERTEX_ATTRIB_START_SLOT + state.feNumAttributes;

                        // NOTE: this inner `i` deliberately shadows the vertex counter above.
                        for (uint32_t i = 0; i < voutNumSlots; i += 1)
                        {
                            for (uint32_t j = 0; j < 4; j += 1)
                            {
                                vout.attrib[i][j].lo = vout_lo.attrib[i][j];
                            }
                        }
                    }

                    if ((i + KNOB_SIMD_WIDTH) < endVertex)
                    {
                        state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi);

                        // copy SIMD vout_hi to hi part of SIMD16 vout
                        {
                            const uint32_t voutNumSlots = VERTEX_ATTRIB_START_SLOT + state.feNumAttributes;

                            for (uint32_t i = 0; i < voutNumSlots; i += 1)
                            {
                                for (uint32_t j = 0; j < 4; j += 1)
                                {
                                    vout.attrib[i][j].hi = vout_hi.attrib[i][j];
                                }
                            }
                        }
                    }
                    AR_END(FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];

                RDTSC_START(FEPAAssemble);
                bool assemble = pa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16);
                RDTSC_STOP(FEPAAssemble, 1, 0);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            // Tessellation / GS / stream-out paths are not yet
                            // wired up for the SIMD16 front end.
#if 0
                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else
#endif
                            {
#if 0
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

#endif
                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);

                                    // Split the SIMD16 batch into two SIMD8
                                    // halves for the binner.
                                    uint32_t mask = GenMask(pa.NumPrims());
                                    uint32_t mask_lo = mask & 255;
                                    uint32_t mask_hi = (mask >> 8) & 255;

                                    simd16scalari primid = pa.GetPrimID(work.startPrimID);
                                    simdscalari primid_lo = primid.lo;
                                    simdscalari primid_hi = primid.hi;

                                    simdvector prim[MAX_NUM_VERTS_PER_PRIM];

                                    for (uint32_t i = 0; i < 3; i += 1)
                                    {
                                        for (uint32_t j = 0; j < 4; j += 1)
                                        {
                                            prim[i][j] = prim_simd16[i][j].lo;
                                        }
                                    }

                                    pa.useAlternateOffset = false;
                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, mask_lo, primid_lo, _simd_setzero_si());

                                    if (mask_hi)
                                    {
                                        for (uint32_t i = 0; i < 3; i += 1)
                                        {
                                            for (uint32_t j = 0; j < 4; j += 1)
                                            {
                                                prim[i][j] = prim_simd16[i][j].hi;
                                            }
                                        }

                                        pa.useAlternateOffset = true;
                                        pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, mask_hi, primid_hi, _simd_setzero_si());
                                    }
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            if (IsIndexedT::value)
            {
                fetchInfo_lo.pIndices = (int32_t *)((uint8_t*)fetchInfo_lo.pIndices + KNOB_SIMD16_WIDTH * indexSize);
                fetchInfo_hi.pIndices = (int32_t *)((uint8_t*)fetchInfo_hi.pIndices + KNOB_SIMD16_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
            }

            i += KNOB_SIMD16_WIDTH;
        }

        pa.Reset();
    }

#else
    simdvertex vin;
    SWR_VS_CONTEXT vsContext;

    vsContext.pVin = &vin;

    SWR_FETCH_CONTEXT fetchInfo = { 0 };

    fetchInfo.pStreams = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex = 0;

    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }

    const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        uint32_t i = 0;

        if (IsIndexedT::value)
        {
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // GetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVout = &vout;

            if (i < endVertex)
            {

                // 1. Execute FS/VS for a single SIMD.
                AR_BEGIN(FEFetchShader, pDC->drawId);
                state.pfnFetchFunc(fetchInfo, vin);
                AR_END(FEFetchShader, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Setup active mask for vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    AR_BEGIN(FEVertexShader, pDC->drawId);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
                    AR_END(FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // PaAssemble returns false if there is not enough verts to assemble.
                AR_BEGIN(FEPAAssemble, pDC->drawId);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                AR_END(FEPAAssemble, 1);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);

                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            if (IsIndexedT::value)
            {
                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }

            i += KNOB_SIMD_WIDTH;
        }
        pa.Reset();
    }

#endif

    AR_END(FEProcessDraw, numPrims * work.numInstances);
}
1662
//////////////////////////////////////////////////////////////////////////
/// @brief Functor used with TemplateArgUnroller to map a set of template
/// boolean arguments onto the matching ProcessDraw instantiation.
struct FEDrawChooser
{
    typedef PFN_FE_WORK_FUNC FuncType;

    /// Returns the ProcessDraw instantiation for the given argument pack.
    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessDraw<ArgsB...>;
    }
};
1673
1674
1675 // Selector for correct templated Draw front-end function
1676 PFN_FE_WORK_FUNC GetProcessDrawFunc(
1677 bool IsIndexed,
1678 bool IsCutIndexEnabled,
1679 bool HasTessellation,
1680 bool HasGeometryShader,
1681 bool HasStreamOut,
1682 bool HasRasterization)
1683 {
1684 return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
1685 }