swr: [rasterizer core] SIMD16 Frontend WIP
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "utils.h"
36 #include "threads.h"
37 #include "pa.h"
38 #include "clip.h"
39 #include "tilemgr.h"
40 #include "tessellator.h"
41 #include <limits>
42
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Helper macro to generate a bitmask
45 static INLINE uint32_t GenMask(uint32_t numBits)
46 {
47 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
48 return ((1U << numBits) - 1);
49 }
50
51 //////////////////////////////////////////////////////////////////////////
52 /// @brief FE handler for SwrSync.
53 /// @param pContext - pointer to SWR context.
54 /// @param pDC - pointer to draw context.
55 /// @param workerId - thread's worker id. Even thread has a unique id.
56 /// @param pUserData - Pointer to user data passed back to sync callback.
57 /// @todo This should go away when we switch this to use compute threading.
58 void ProcessSync(
59 SWR_CONTEXT *pContext,
60 DRAW_CONTEXT *pDC,
61 uint32_t workerId,
62 void *pUserData)
63 {
64 BE_WORK work;
65 work.type = SYNC;
66 work.pfnWork = ProcessSyncBE;
67
68 MacroTileMgr *pTileMgr = pDC->pTileMgr;
69 pTileMgr->enqueue(0, 0, &work);
70 }
71
72 //////////////////////////////////////////////////////////////////////////
73 /// @brief FE handler for SwrDestroyContext.
74 /// @param pContext - pointer to SWR context.
75 /// @param pDC - pointer to draw context.
76 /// @param workerId - thread's worker id. Even thread has a unique id.
77 /// @param pUserData - Pointer to user data passed back to sync callback.
78 void ProcessShutdown(
79 SWR_CONTEXT *pContext,
80 DRAW_CONTEXT *pDC,
81 uint32_t workerId,
82 void *pUserData)
83 {
84 BE_WORK work;
85 work.type = SHUTDOWN;
86 work.pfnWork = ProcessShutdownBE;
87
88 MacroTileMgr *pTileMgr = pDC->pTileMgr;
89 // Enqueue at least 1 work item for each worker thread
90 // account for number of numa nodes
91 uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;
92
93 for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
94 {
95 for (uint32_t n = 0; n < numNumaNodes; ++n)
96 {
97 pTileMgr->enqueue(i, n, &work);
98 }
99 }
100 }
101
102 //////////////////////////////////////////////////////////////////////////
103 /// @brief FE handler for SwrClearRenderTarget.
104 /// @param pContext - pointer to SWR context.
105 /// @param pDC - pointer to draw context.
106 /// @param workerId - thread's worker id. Even thread has a unique id.
107 /// @param pUserData - Pointer to user data passed back to clear callback.
108 /// @todo This should go away when we switch this to use compute threading.
109 void ProcessClear(
110 SWR_CONTEXT *pContext,
111 DRAW_CONTEXT *pDC,
112 uint32_t workerId,
113 void *pUserData)
114 {
115 CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
116 MacroTileMgr *pTileMgr = pDC->pTileMgr;
117
118 // queue a clear to each macro tile
119 // compute macro tile bounds for the specified rect
120 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
121 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
122 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
123 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
124
125 BE_WORK work;
126 work.type = CLEAR;
127 work.pfnWork = ProcessClearBE;
128 work.desc.clear = *pDesc;
129
130 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
131 {
132 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
133 {
134 pTileMgr->enqueue(x, y, &work);
135 }
136 }
137 }
138
139 //////////////////////////////////////////////////////////////////////////
140 /// @brief FE handler for SwrStoreTiles.
141 /// @param pContext - pointer to SWR context.
142 /// @param pDC - pointer to draw context.
143 /// @param workerId - thread's worker id. Even thread has a unique id.
144 /// @param pUserData - Pointer to user data passed back to callback.
145 /// @todo This should go away when we switch this to use compute threading.
146 void ProcessStoreTiles(
147 SWR_CONTEXT *pContext,
148 DRAW_CONTEXT *pDC,
149 uint32_t workerId,
150 void *pUserData)
151 {
152 AR_BEGIN(FEProcessStoreTiles, pDC->drawId);
153 MacroTileMgr *pTileMgr = pDC->pTileMgr;
154 STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
155
156 // queue a store to each macro tile
157 // compute macro tile bounds for the specified rect
158 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
159 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
160 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
161 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
162
163 // store tiles
164 BE_WORK work;
165 work.type = STORETILES;
166 work.pfnWork = ProcessStoreTilesBE;
167 work.desc.storeTiles = *pDesc;
168
169 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
170 {
171 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
172 {
173 pTileMgr->enqueue(x, y, &work);
174 }
175 }
176
177 AR_END(FEProcessStoreTiles, 0);
178 }
179
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrInvalidateTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
    DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    // compute macro tile bounds for the specified rect
    // Default: only tiles fully covered by the rect (min rounded up, max rounded down).
    // NOTE(review): if the rect spans less than one full macrotile,
    // (xmax / DIM) - 1 underflows the unsigned max here — presumably callers
    // guarantee at least one full tile when fullTilesOnly is set; confirm.
    uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
    uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;

    if (pDesc->fullTilesOnly == false)
    {
        // include partial tiles: round min down, and use (exclusive max - 1)
        // so any partially covered edge tile is included
        macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
        macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
        macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
        macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    }

    // NOTE(review): these asserts/clamps use <= / min against the tile COUNT,
    // not (count - 1); looks like an off-by-one if hot-tile indices are
    // zero-based — confirm against KNOB_NUM_HOT_TILES_* semantics.
    SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);

    macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
    macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);

    // load tiles
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pDesc;

    for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    {
        for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    AR_END(FEProcessInvalidateTiles, 0);
}
234
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of primitives given the number of verts.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of vertices or indices for draw.
/// @return primitive count; 0 for unsupported topologies or too few verts.
/// @todo Frontend needs to be refactored. This will go in appropriate place then.
uint32_t GetNumPrims(
    PRIMITIVE_TOPOLOGY mode,
    uint32_t numPrims)
{
    switch (mode)
    {
    // list topologies: fixed number of verts per prim
    case TOP_POINT_LIST: return numPrims;
    case TOP_TRIANGLE_LIST: return numPrims / 3;
    // strip/fan topologies: first prim consumes a full set of verts,
    // each additional vert (or vert pair for quad strips) adds one prim
    case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_QUAD_LIST: return numPrims / 4;
    case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
    case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_LINE_LIST: return numPrims / 2;
    case TOP_LINE_LOOP: return numPrims;           // loop: one segment per vert (closing segment included)
    case TOP_RECT_LIST: return numPrims / 3;
    // adjacency topologies: adjacent verts are part of each prim's vert budget
    case TOP_LINE_LIST_ADJ: return numPrims / 4;
    case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
    case TOP_TRI_LIST_ADJ: return numPrims / 6;
    case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;

    // patch lists: N control points per patch, N encoded in the enum value
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        // control-point count is the offset from the patchlist base enum
        return numPrims / (mode - TOP_PATCHLIST_BASE);

    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_INVALID("Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}
311
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of verts given the number of primitives.
///        Inverse of GetNumPrims for each topology.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of primitives for draw.
/// @return vertex count; 0 for unsupported topologies or zero prims.
uint32_t GetNumVerts(
    PRIMITIVE_TOPOLOGY mode,
    uint32_t numPrims)
{
    switch (mode)
    {
    // list topologies: fixed number of verts per prim
    case TOP_POINT_LIST: return numPrims;
    case TOP_TRIANGLE_LIST: return numPrims * 3;
    // strip/fan topologies: full vert set for the first prim, one
    // additional vert (or pair for quad strips) for each prim after
    case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
    case TOP_QUAD_LIST: return numPrims * 4;
    case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
    case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
    case TOP_LINE_LIST: return numPrims * 2;
    case TOP_LINE_LOOP: return numPrims;           // loop: one vert per segment
    case TOP_RECT_LIST: return numPrims * 3;
    // adjacency topologies: adjacent verts counted per prim
    case TOP_LINE_LIST_ADJ: return numPrims * 4;
    case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
    case TOP_TRI_LIST_ADJ: return numPrims * 6;
    case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;

    // patch lists: N control points per patch, N encoded in the enum value
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        // control-point count is the offset from the patchlist base enum
        return numPrims * (mode - TOP_PATCHLIST_BASE);

    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_INVALID("Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}
387
//////////////////////////////////////////////////////////////////////////
/// @brief Return number of verts per primitive.
/// @param topology - topology
/// @param includeAdjVerts - include adjacent verts in primitive vertices
/// @return verts per prim (e.g. 3 for triangles); 0 for unknown topologies.
INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
{
    uint32_t numVerts = 0;
    switch (topology)
    {
    case TOP_POINT_LIST:
    case TOP_POINT_LIST_BF:
        numVerts = 1;
        break;
    case TOP_LINE_LIST:
    case TOP_LINE_STRIP:
    case TOP_LINE_LIST_ADJ:
    case TOP_LINE_LOOP:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LISTSTRIP_ADJ:
        numVerts = 2;
        break;
    case TOP_TRIANGLE_LIST:
    case TOP_TRIANGLE_STRIP:
    case TOP_TRIANGLE_FAN:
    case TOP_TRI_LIST_ADJ:
    case TOP_TRI_STRIP_ADJ:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_RECT_LIST:
        numVerts = 3;
        break;
    case TOP_QUAD_LIST:
    case TOP_QUAD_STRIP:
        numVerts = 4;
        break;
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        // control-point count is the offset from the patchlist base enum
        numVerts = topology - TOP_PATCHLIST_BASE;
        break;
    default:
        SWR_INVALID("Unsupported topology: %d", topology);
        break;
    }

    // adjacency topologies carry extra neighbor verts per primitive;
    // override the base counts when the caller wants them included
    if (includeAdjVerts)
    {
        switch (topology)
        {
        case TOP_LISTSTRIP_ADJ:
        case TOP_LINE_LIST_ADJ: numVerts = 4; break;
        case TOP_TRI_STRIP_ADJ:
        case TOP_TRI_LIST_ADJ: numVerts = 6; break;
        default: break;
        }
    }

    return numVerts;
}
476
477 //////////////////////////////////////////////////////////////////////////
478 /// @brief Generate mask from remaining work.
479 /// @param numWorkItems - Number of items being worked on by a SIMD.
480 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
481 {
482 uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
483 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
484 return _simd_castps_si(vMask(mask));
485 }
486
//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMDs worth of triangles.
/// @param pDC - pointer to draw context.
/// @param pa - primitive assembly object used to fetch prim attributes.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pPrimData - scratch buffer attributes are staged into before
///        the streamout jit function consumes them.
/// @param numPrims_simd8 - (SIMD16 builds only) number of prims in the
///        simd8 half being streamed out.
/// @param streamIndex - streamout stream to process.
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
#if USE_SIMD16_FRONTEND
    uint32_t numPrims_simd8,
#endif
    uint32_t streamIndex)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEStreamout, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    // adjacency verts are not streamed out
    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

#if USE_SIMD16_FRONTEND
    uint32_t numPrims = numPrims_simd8;
#else
    uint32_t numPrims = pa.NumPrims();
#endif

    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        // soMask has one bit per attribute slot enabled for this stream
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        while (_BitScanForward(&slot, soMask))
        {
            __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is relative offset from start of vertex.
            // NOTE(review): the dword offset is computed from 'slot' directly,
            // even though attributes are read from slot + VERTEX_ATTRIB_START_SLOT
            // in the PA buffer — presumably the SO jit expects the mask-relative
            // layout; a prior comment here described a (slot - 1) adjustment
            // that the code does not perform. Confirm against the SO jit.
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }
            // clear the bit just processed and move to the next enabled slot
            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    AR_END(FEStreamout, 1);
}
585
586 #if USE_SIMD16_FRONTEND
587 //////////////////////////////////////////////////////////////////////////
588 /// Is value an even number (a multiple of two)
589 ///
590 template <typename T>
591 INLINE static bool IsEven(T value)
592 {
593 return (value & 1) == 0;
594 }
595
596 //////////////////////////////////////////////////////////////////////////
597 /// Round up value to an even number (a multiple of two)
598 ///
599 template <typename T>
600 INLINE static T RoundUpEven(T value)
601 {
602 return (value + 1) & ~1;
603 }
604
605 //////////////////////////////////////////////////////////////////////////
606 /// Round down value to an even number (a multiple of two)
607 ///
608 template <typename T>
609 INLINE static T RoundDownEven(T value)
610 {
611 return value & ~1;
612 }
613
//////////////////////////////////////////////////////////////////////////
/// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
///
/// vertexCount is in terms of the source simdvertexes and must be even
/// NOTE(review): despite the "must be even" above, the guard below also
/// tolerates an odd trailing vertex (upper lanes are left zeroed) — confirm
/// which contract callers rely on.
///
/// attribCount will limit the vector copies to those attribs specified
///
/// note: the stride between vertexes is determinded by KNOB_NUM_ATTRIBUTES
///
void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount)
{
    SWR_ASSERT(vertex);
    SWR_ASSERT(vertex_simd16);
    SWR_ASSERT(attribCount <= KNOB_NUM_ATTRIBUTES);

    simd16vertex temp;

    for (uint32_t i = 0; i < vertexCount; i += 2)
    {
        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            for (uint32_t k = 0; k < 4; k += 1)
            {
                // lower half comes from simdvertex i; start from zero so the
                // upper half stays cleared if there is no partner vertex
                temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);

                if ((i + 1) < vertexCount)
                {
                    // upper half comes from simdvertex i + 1
                    temp.attrib[j][k] = _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
                }
            }
        }

        // commit one packed simd16vertex per input pair
        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
        }
    }
}
652
653 #endif
654 //////////////////////////////////////////////////////////////////////////
655 /// @brief Computes number of invocations. The current index represents
656 /// the start of the SIMD. The max index represents how much work
657 /// items are remaining. If there is less then a SIMD's xmin of work
658 /// then return the remaining amount of work.
659 /// @param curIndex - The start index for the SIMD.
660 /// @param maxIndex - The last index for all work items.
661 static INLINE uint32_t GetNumInvocations(
662 uint32_t curIndex,
663 uint32_t maxIndex)
664 {
665 uint32_t remainder = (maxIndex - curIndex);
666 #if USE_SIMD16_FRONTEND
667 return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
668 #else
669 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
670 #endif
671 }
672
673 //////////////////////////////////////////////////////////////////////////
674 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
675 /// The geometry shader will loop over each active streamout buffer, assembling
676 /// primitives for the downstream stages. When multistream output is enabled,
677 /// the generated stream ID buffer from the GS needs to be converted to a cut
678 /// buffer for the primitive assembler.
679 /// @param stream - stream id to generate the cut buffer for
680 /// @param pStreamIdBase - pointer to the stream ID buffer
681 /// @param numEmittedVerts - Number of total verts emitted by the GS
682 /// @param pCutBuffer - output buffer to write cuts to
683 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
684 {
685 SWR_ASSERT(stream < MAX_SO_STREAMS);
686
687 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
688 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
689
690 for (uint32_t b = 0; b < numOutputBytes; ++b)
691 {
692 uint8_t curInputByte = pStreamIdBase[2*b];
693 uint8_t outByte = 0;
694 for (uint32_t i = 0; i < 4; ++i)
695 {
696 if ((curInputByte & 0x3) != stream)
697 {
698 outByte |= (1 << i);
699 }
700 curInputByte >>= 2;
701 }
702
703 curInputByte = pStreamIdBase[2 * b + 1];
704 for (uint32_t i = 0; i < 4; ++i)
705 {
706 if ((curInputByte & 0x3) != stream)
707 {
708 outByte |= (1 << (i + 4));
709 }
710 curInputByte >>= 2;
711 }
712
713 *pCutBuffer++ = outByte;
714 }
715 }
716
// GS execution context, one per worker (THREAD presumably marks
// thread-local storage — confirm against the macro's definition).
THREAD SWR_GS_CONTEXT tlsGsContext;

#if USE_SIMD16_FRONTEND
// Per-thread scratch used to repack GS simdvertex output into simd16vertex
// form before it is handed to the SIMD16 primitive assembler.
THREAD simd16vertex tempVertex_simd16[128];

#endif
// Computes the strides the GS stage uses to index its per-primitive and
// per-instance output, cut, and stream-cut buffers. Parameterized on the
// SIMD vertex type/width so the same math serves simd8 and simd16 builds.
template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
struct GsBufferInfo
{
    GsBufferInfo(const SWR_GS_STATE &gsState)
    {
        const uint32_t vertexCount = gsState.maxNumVerts;
        const uint32_t vertexStride = sizeof(SIMDVERTEX);
        // round the max vertex count up to whole SIMD batches
        const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / SIMD_WIDTH;

        vertexPrimitiveStride = vertexStride * numSimdBatches;
        vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH;

        if (gsState.isSingleStream)
        {
            // single stream: 1 cut bit per vertex, packed into bytes
            cutPrimitiveStride = (vertexCount + 7) / 8;
            cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;

            // no separate per-stream cut scratch needed
            streamCutPrimitiveStride = 0;
            streamCutInstanceStride = 0;
        }
        else
        {
            // multistream: 2 stream-id bits per vertex, rounded to 4 bytes
            cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4);
            cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;

            // plus a 1-bit-per-vertex cut buffer, filled per stream when the
            // stream-id buffer is translated (see ProcessStreamIdBuffer)
            streamCutPrimitiveStride = (vertexCount + 7) / 8;
            streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH;
        }
    }

    uint32_t vertexPrimitiveStride;     // bytes of vertex data per input primitive
    uint32_t vertexInstanceStride;      // bytes of vertex data per GS instance

    uint32_t cutPrimitiveStride;        // bytes of cut/stream-id data per input primitive
    uint32_t cutInstanceStride;         // bytes of cut/stream-id data per GS instance

    uint32_t streamCutPrimitiveStride;  // bytes of per-stream cut scratch per primitive
    uint32_t streamCutInstanceStride;   // bytes of per-stream cut scratch per instance
};
762
763 //////////////////////////////////////////////////////////////////////////
764 /// @brief Implements GS stage.
765 /// @param pDC - pointer to draw context.
766 /// @param workerId - thread's worker id. Even thread has a unique id.
767 /// @param pa - The primitive assembly object.
768 /// @param pGsOut - output stream for GS
769 template <
770 typename HasStreamOutT,
771 typename HasRastT>
772 static void GeometryShaderStage(
773 DRAW_CONTEXT *pDC,
774 uint32_t workerId,
775 PA_STATE& pa,
776 void* pGsOut,
777 void* pCutBuffer,
778 void* pStreamCutBuffer,
779 uint32_t* pSoPrimData,
780 #if USE_SIMD16_FRONTEND
781 uint32_t numPrims_simd8,
782 #endif
783 simdscalari primID)
784 {
785 SWR_CONTEXT *pContext = pDC->pContext;
786
787 AR_BEGIN(FEGeometryShader, pDC->drawId);
788
789 const API_STATE& state = GetApiState(pDC);
790 const SWR_GS_STATE* pState = &state.gsState;
791
792 SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
793 SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");
794
795 tlsGsContext.pStream = (uint8_t*)pGsOut;
796 tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
797 tlsGsContext.PrimitiveID = primID;
798
799 uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
800 simdvector attrib[MAX_ATTRIBUTES];
801
802 // assemble all attributes for the input primitive
803 for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
804 {
805 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
806 pa.Assemble(attribSlot, attrib);
807
808 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
809 {
810 tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
811 }
812 }
813
814 // assemble position
815 pa.Assemble(VERTEX_POSITION_SLOT, attrib);
816 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
817 {
818 tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
819 }
820
821 const GsBufferInfo<simdvertex, KNOB_SIMD_WIDTH> bufferInfo(state.gsState);
822
823 // record valid prims from the frontend to avoid over binning the newly generated
824 // prims from the GS
825 #if USE_SIMD16_FRONTEND
826 uint32_t numInputPrims = numPrims_simd8;
827 #else
828 uint32_t numInputPrims = pa.NumPrims();
829 #endif
830
831 for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
832 {
833 tlsGsContext.InstanceID = instance;
834 tlsGsContext.mask = GenerateMask(numInputPrims);
835
836 // execute the geometry shader
837 state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
838
839 tlsGsContext.pStream += bufferInfo.vertexInstanceStride;
840 tlsGsContext.pCutOrStreamIdBuffer += bufferInfo.cutInstanceStride;
841 }
842
843 // set up new binner and state for the GS output topology
844 #if USE_SIMD16_FRONTEND
845 PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
846 if (HasRastT::value)
847 {
848 switch (pState->outputTopology)
849 {
850 case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles_simd16; break;
851 case TOP_LINE_STRIP: pfnClipFunc = ClipLines_simd16; break;
852 case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break;
853 default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
854 }
855 }
856
857 #else
858 PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
859 if (HasRastT::value)
860 {
861 switch (pState->outputTopology)
862 {
863 case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
864 case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
865 case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
866 default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
867 }
868 }
869
870 #endif
871 // foreach input prim:
872 // - setup a new PA based on the emitted verts for that prim
873 // - loop over the new verts, calling PA to assemble each prim
874 uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
875 uint32_t* pPrimitiveId = (uint32_t*)&primID;
876
877 uint32_t totalPrimsGenerated = 0;
878 for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
879 {
880 uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * bufferInfo.vertexPrimitiveStride;
881 uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * bufferInfo.cutPrimitiveStride;
882
883 for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
884 {
885 uint32_t numEmittedVerts = pVertexCount[inputPrim];
886 if (numEmittedVerts == 0)
887 {
888 continue;
889 }
890
891 uint8_t* pBase = pInstanceBase + instance * bufferInfo.vertexInstanceStride;
892 uint8_t* pCutBase = pCutBufferBase + instance * bufferInfo.cutInstanceStride;
893
894 uint32_t numAttribs = state.feNumAttributes;
895
896 for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
897 {
898 bool processCutVerts = false;
899
900 uint8_t* pCutBuffer = pCutBase;
901
902 // assign default stream ID, only relevant when GS is outputting a single stream
903 uint32_t streamID = 0;
904 if (pState->isSingleStream)
905 {
906 processCutVerts = true;
907 streamID = pState->singleStreamID;
908 if (streamID != stream) continue;
909 }
910 else
911 {
912 // early exit if this stream is not enabled for streamout
913 if (HasStreamOutT::value && !state.soState.streamEnable[stream])
914 {
915 continue;
916 }
917
918 // multi-stream output, need to translate StreamID buffer to a cut buffer
919 ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
920 pCutBuffer = (uint8_t*)pStreamCutBuffer;
921 processCutVerts = false;
922 }
923
924 #if USE_SIMD16_FRONTEND
925 // TEMPORARY: GS outputs simdvertex, PA inputs simd16vertex, so convert simdvertex to simd16vertex
926
927 SWR_ASSERT(numEmittedVerts <= 256);
928
929 PackPairsOfSimdVertexIntoSimd16Vertex(
930 tempVertex_simd16,
931 reinterpret_cast<const simdvertex *>(pBase),
932 numEmittedVerts,
933 KNOB_NUM_ATTRIBUTES);
934
935 #endif
936 #if USE_SIMD16_FRONTEND
937 PA_STATE_CUT gsPa(pDC, reinterpret_cast<uint8_t *>(tempVertex_simd16), numEmittedVerts, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
938
939 #else
940 PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
941
942 #endif
943 while (gsPa.GetNextStreamOutput())
944 {
945 do
946 {
947 #if USE_SIMD16_FRONTEND
948 simd16vector attrib_simd16[3];
949
950 bool assemble = gsPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib_simd16);
951
952 #else
953 bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
954
955 #endif
956 if (assemble)
957 {
958 totalPrimsGenerated += gsPa.NumPrims();
959
960 if (HasStreamOutT::value)
961 {
962 #if USE_SIMD16_FRONTEND
963 const uint32_t numPrims = gsPa.NumPrims();
964 const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
965 const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
966
967 gsPa.useAlternateOffset = false;
968 StreamOut(pDC, gsPa, workerId, pSoPrimData, numPrims_lo, stream);
969
970 if (numPrims_hi)
971 {
972 gsPa.useAlternateOffset = true;
973 StreamOut(pDC, gsPa, workerId, pSoPrimData, numPrims_hi, stream);
974 }
975 #else
976 StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
977 #endif
978 }
979
980 if (HasRastT::value && state.soState.streamToRasterizer == stream)
981 {
982 #if USE_SIMD16_FRONTEND
983 simd16scalari vPrimId;
984 // pull primitiveID from the GS output if available
985 if (state.gsState.emitsPrimitiveID)
986 {
987 simd16vector primIdAttrib[3];
988 gsPa.Assemble_simd16(VERTEX_PRIMID_SLOT, primIdAttrib);
989 vPrimId = _simd16_castps_si(primIdAttrib[state.frontendState.topologyProvokingVertex].x);
990 }
991 else
992 {
993 vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);
994 }
995
996 // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
997 simd16scalari vViewPortIdx;
998 if (state.gsState.emitsViewportArrayIndex)
999 {
1000 simd16vector vpiAttrib[3];
1001 gsPa.Assemble_simd16(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);
1002
1003 // OOB indices => forced to zero.
1004 simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1005 simd16scalari vClearMask = _simd16_cmplt_epi32(_simd16_castps_si(vpiAttrib[0].x), vNumViewports);
1006 vpiAttrib[0].x = _simd16_and_ps(_simd16_castsi_ps(vClearMask), vpiAttrib[0].x);
1007
1008 vViewPortIdx = _simd16_castps_si(vpiAttrib[0].x);
1009 }
1010 else
1011 {
1012 vViewPortIdx = _simd16_set1_epi32(0);
1013 }
1014
1015 gsPa.useAlternateOffset = false;
1016 pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
1017 #else
1018 simdscalari vPrimId;
1019 // pull primitiveID from the GS output if available
1020 if (state.gsState.emitsPrimitiveID)
1021 {
1022 simdvector primIdAttrib[3];
1023 gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
1024 vPrimId = _simd_castps_si(primIdAttrib[state.frontendState.topologyProvokingVertex].x);
1025 }
1026 else
1027 {
1028 vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
1029 }
1030
1031 // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
1032 simdscalari vViewPortIdx;
1033 if (state.gsState.emitsViewportArrayIndex)
1034 {
1035 simdvector vpiAttrib[3];
1036 gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);
1037
1038 // OOB indices => forced to zero.
1039 simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1040 simdscalari vClearMask = _simd_cmplt_epi32(_simd_castps_si(vpiAttrib[0].x), vNumViewports);
1041 vpiAttrib[0].x = _simd_and_ps(_simd_castsi_ps(vClearMask), vpiAttrib[0].x);
1042
1043 vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
1044 }
1045 else
1046 {
1047 vViewPortIdx = _simd_set1_epi32(0);
1048 }
1049
1050 pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
1051 #endif
1052 }
1053 }
1054 } while (gsPa.NextPrim());
1055 }
1056 }
1057 }
1058 }
1059
1060 // update GS pipeline stats
1061 UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
1062 UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
1063 AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
1064 AR_END(FEGeometryShader, 1);
1065 }
1066
1067 //////////////////////////////////////////////////////////////////////////
1068 /// @brief Allocate GS buffers
1069 /// @param pDC - pointer to draw context.
1070 /// @param state - API state
1071 /// @param ppGsOut - pointer to GS output buffer allocation
1072 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
1073 template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
1074 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
1075 void **ppStreamCutBuffer)
1076 {
1077 auto pArena = pDC->pArena;
1078 SWR_ASSERT(pArena != nullptr);
1079 SWR_ASSERT(state.gsState.gsEnable);
1080
1081 // allocate arena space to hold GS output verts
1082 // @todo pack attribs
1083 // @todo support multiple streams
1084
1085 const GsBufferInfo<SIMDVERTEX, SIMD_WIDTH> bufferInfo(state.gsState);
1086
1087 const uint32_t vertexBufferSize = state.gsState.instanceCount * bufferInfo.vertexInstanceStride;
1088
1089 *ppGsOut = pArena->AllocAligned(vertexBufferSize, SIMD_WIDTH * sizeof(float));
1090
1091 // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
1092 // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
1093
1094 // allocate space for temporary per-stream cut buffer if multi-stream is enabled
1095 if (state.gsState.isSingleStream)
1096 {
1097 const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
1098
1099 *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
1100 *ppStreamCutBuffer = nullptr;
1101 }
1102 else
1103 {
1104 const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
1105 const uint32_t streamCutBufferSize = state.gsState.instanceCount * bufferInfo.streamCutInstanceStride;
1106
1107 *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
1108 *ppStreamCutBuffer = pArena->AllocAligned(streamCutBufferSize, SIMD_WIDTH * sizeof(float));
1109 }
1110 }
1111
1112 //////////////////////////////////////////////////////////////////////////
1113 /// @brief Contains all data generated by the HS and passed to the
1114 /// tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;               // hull-shader context; pCPout points into patchData below
    ScalarPatch patchData[KNOB_SIMD_WIDTH]; // HS output patches, one per SIMD lane
    void* pTxCtx;                           // tessellator context storage, lazily allocated on first TSInitCtx failure
    size_t tsCtxSize;                       // size in bytes used when (re)allocating pTxCtx

    simdscalar* pDSOutput;                  // domain-shader output buffer, grown on demand per tessellated patch
    size_t numDSOutputVectors;              // current capacity of pDSOutput, in simd vectors
};
1125
// Per-thread tessellation scratch data; allocated on first use by AllocateTessellationData.
THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
1127
1128 //////////////////////////////////////////////////////////////////////////
1129 /// @brief Allocate tessellation data for this worker thread.
1130 INLINE
1131 static void AllocateTessellationData(SWR_CONTEXT* pContext)
1132 {
1133 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
1134 if (gt_pTessellationThreadData == nullptr)
1135 {
1136 gt_pTessellationThreadData = (TessellationThreadLocalData*)
1137 AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
1138 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
1139 }
1140 }
1141
1142 //////////////////////////////////////////////////////////////////////////
1143 /// @brief Implements Tessellation Stages.
1144 /// @param pDC - pointer to draw context.
1145 /// @param workerId - thread's worker id. Even thread has a unique id.
1146 /// @param pa - The primitive assembly object.
1147 /// @param pGsOut - output stream for GS
template <
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
static void TessellationStages(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pCutStreamBuffer,
    uint32_t* pSoPrimData,
#if USE_SIMD16_FRONTEND
    uint32_t numPrims_simd8,
#endif
    simdscalari primID)
{
    SWR_CONTEXT *pContext = pDC->pContext;
    const API_STATE& state = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;

    SWR_ASSERT(gt_pTessellationThreadData);

    // initialize the tessellator against the thread-local context storage;
    // on failure, (re)allocate the backing storage at tsCtxSize and retry once
    HANDLE tsCtx = TSInitCtx(
        tsState.domain,
        tsState.partitioning,
        tsState.tsOutputTopology,
        gt_pTessellationThreadData->pTxCtx,
        gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(
            tsState.domain,
            tsState.partitioning,
            tsState.tsOutputTopology,
            gt_pTessellationThreadData->pTxCtx,
            gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

    // select the binner/clipper entry point for the post-DS topology
    // (only needed when rasterization is enabled)
#if USE_SIMD16_FRONTEND
    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles_simd16; break;
        case TOP_LINE_LIST: pfnClipFunc = ClipLines_simd16; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break;
        default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

#else
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

#endif
    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.pCPout = gt_pTessellationThreadData->patchData;
    hsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives into the HS input verts
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
        }
    }

#if defined(_DEBUG)
    // poison HS output so stale reads are visible in debug builds
    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

#if USE_SIMD16_FRONTEND
    // caller passes the active prim count for this simd8 half of the simd16 PA
    uint32_t numPrims = numPrims_simd8;
#else
    uint32_t numPrims = pa.NumPrims();
#endif
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    AR_BEGIN(FEHullShader, pDC->drawId);
    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
    AR_END(FEHullShader, 0);

    UPDATE_STAT_FE(HsInvocations, numPrims);

    // per-lane primitive IDs, read scalar-wise below
    const uint32_t* pPrimId = (const uint32_t*)&primID;

    for (uint32_t p = 0; p < numPrims; ++p)
    {
        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = { 0 };
        AR_BEGIN(FETessellation, pDC->drawId);
        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
        AR_EVENT(TessPrimCount(1));
        AR_END(FETessellation, 0);

        // patch fully culled by the tessellator
        if (tsData.NumPrimitives == 0)
        {
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory (grow-only, cached in thread-local data)
        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
#if USE_SIMD16_FRONTEND
        size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs;      // simd8 -> simd16, padding
#else
        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
#endif
        if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
        {
            AlignedFree(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
#if USE_SIMD16_FRONTEND
            gt_pTessellationThreadData->numDSOutputVectors = RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs; // simd8 -> simd16, padding
#else
            gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
#endif
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);

#if defined(_DEBUG)
        // poison DS output so stale reads are visible in debug builds
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader over all domain points, one SIMD at a time
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID = pPrimId[p];
        dsContext.pCpIn = &hsContext.pCPout[p];
        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
#if USE_SIMD16_FRONTEND
        dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations);      // simd8 -> simd16
#else
        dsContext.vectorStride = requiredDSVectorInvocations;
#endif

        uint32_t dsInvocations = 0;

        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
        {
            // mask off lanes past the last domain point in the final iteration
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            AR_BEGIN(FEDomainShader, pDC->drawId);
            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
            AR_END(FEDomainShader, 0);

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);

#if USE_SIMD16_FRONTEND
        SWR_ASSERT(IsEven(dsContext.vectorStride));  // simd8 -> simd16

#endif
        // assemble DS output into post-tessellation primitives
        PA_TESS tessPa(
            pDC,
#if USE_SIMD16_FRONTEND
            reinterpret_cast<const simd16scalar *>(dsContext.pOutputData),  // simd8 -> simd16
            dsContext.vectorStride / 2,                                     // simd8 -> simd16
#else
            dsContext.pOutputData,
            dsContext.vectorStride,
#endif
            tsState.numDsOutputAttribs,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology);

        while (tessPa.HasWork())
        {
#if USE_SIMD16_FRONTEND
            // split the simd16 prim batch into two simd8 halves for the downstream simd8 stages
            const uint32_t numPrims = tessPa.NumPrims();
            const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
            const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;

            // NOTE(review): primMask_lo/primMask_hi are currently unused (only primMask is consumed below)
            const uint32_t primMask = GenMask(numPrims);
            const uint32_t primMask_lo = primMask & 255;
            const uint32_t primMask_hi = (primMask >> 8) & 255;

            const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
            const simdscalari primID_lo = _simd16_extract_si(primID, 0);
            const simdscalari primID_hi = _simd16_extract_si(primID, 1);

#endif
            if (HasGeometryShaderT::value)
            {
#if USE_SIMD16_FRONTEND
                // GS consumes simd8; run it once per simd8 half, selecting the half via useAlternateOffset
                tessPa.useAlternateOffset = false;
                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_lo, primID_lo);

                if (numPrims_hi)
                {
                    tessPa.useAlternateOffset = true;
                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_hi, primID_hi);
                }
#else
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
#endif
            }
            else
            {
                if (HasStreamOutT::value)
                {
#if USE_SIMD16_FRONTEND
                    // StreamOut consumes simd8; run once per simd8 half
                    tessPa.useAlternateOffset = false;
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, numPrims_lo, 0);

                    if (numPrims_hi)
                    {
                        tessPa.useAlternateOffset = true;
                        StreamOut(pDC, tessPa, workerId, pSoPrimData, numPrims_hi, 0);
                    }
#else
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
#endif
                }

                if (HasRastT::value)
                {
                    simdvector prim[3]; // Only deal with triangles, lines, or points
#if USE_SIMD16_FRONTEND
                    simd16vector prim_simd16[3];
#endif
                    AR_BEGIN(FEPAAssemble, pDC->drawId);
                    bool assemble =
#if USE_SIMD16_FRONTEND
                        tessPa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16);
#else
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
#endif
                    AR_END(FEPAAssemble, 1);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
#if USE_SIMD16_FRONTEND
                    tessPa.useAlternateOffset = false;
                    pfnClipFunc(pDC, tessPa, workerId, prim_simd16, primMask, primID, _simd16_set1_epi32(0));
#else
                    pfnClipFunc(pDC, tessPa, workerId, prim,
                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
#endif
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    } // for (uint32_t p = 0; p < numPrims; ++p)

    TSDestroyCtx(tsCtx);
}
1426
1427 //////////////////////////////////////////////////////////////////////////
1428 /// @brief FE handler for SwrDraw.
1429 /// @tparam IsIndexedT - Is indexed drawing enabled
1430 /// @tparam HasTessellationT - Is tessellation enabled
1431 /// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
1432 /// @tparam HasStreamOutT - Is stream-out enabled
1433 /// @tparam HasRastT - Is rasterization enabled
1434 /// @param pContext - pointer to SWR context.
1435 /// @param pDC - pointer to draw context.
1436 /// @param workerId - thread's worker id.
1437 /// @param pUserData - Pointer to DRAW_WORK
template <
    typename IsIndexedT,
    typename IsCutIndexEnabledT,
    typename HasTessellationT,
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
void ProcessDraw(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{

#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif

    AR_BEGIN(FEProcessDraw, pDC->drawId);

    DRAW_WORK& work = *(DRAW_WORK*)pUserData;
    const API_STATE& state = GetApiState(pDC);

    uint32_t indexSize = 0;
    uint32_t endVertex = work.numVerts;

    // For indexed draws, compute the address one past the last requested index
    // so partial-SIMD fetches can be masked off against it.
    const int32_t* pLastRequestedIndex = nullptr;
    if (IsIndexedT::value)
    {
        switch (work.type)
        {
        case R32_UINT:
            indexSize = sizeof(uint32_t);
            pLastRequestedIndex = &(work.pIB[endVertex]);
            break;
        case R16_UINT:
            indexSize = sizeof(uint16_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
            break;
        case R8_UINT:
            indexSize = sizeof(uint8_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
            break;
        default:
            SWR_INVALID("Invalid work.type: %d", work.type);
        }
    }
    else
    {
        // No cuts, prune partial primitives.
        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
    }

#if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
    // only needed for the AR_END stat at the bottom of this function
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    // per-draw GS scratch buffers, allocated out of the draw arena
    void* pGsOut = nullptr;
    void* pCutBuffer = nullptr;
    void* pStreamCutBuffer = nullptr;
    if (HasGeometryShaderT::value)
    {
#if USE_SIMD16_FRONTEND
        AllocateGsBuffers<simd16vertex, KNOB_SIMD16_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
#else
        AllocateGsBuffers<simdvertex, KNOB_SIMD_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
#endif
    }

    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
    }

    // choose primitive assembler
    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
    PA_STATE& pa = paFactory.GetPA();

#if USE_SIMD16_FRONTEND
    // simd16 frontend: fetch/VS still run at simd8, so everything is staged as
    // a lo/hi pair of simd8 halves that together fill one simd16 PA input.
    simdvertex vin_lo;
    simdvertex vin_hi;
    SWR_VS_CONTEXT vsContext_lo;
    SWR_VS_CONTEXT vsContext_hi;

    vsContext_lo.pVin = &vin_lo;
    vsContext_hi.pVin = &vin_hi;
    vsContext_lo.AlternateOffset = 0;
    vsContext_hi.AlternateOffset = 1;

    SWR_FETCH_CONTEXT fetchInfo_lo = { 0 };

    fetchInfo_lo.pStreams = &state.vertexBuffers[0];
    fetchInfo_lo.StartInstance = work.startInstance;
    fetchInfo_lo.StartVertex = 0;

    if (IsIndexedT::value)
    {
        fetchInfo_lo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo_lo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo_lo.pLastIndex)
        {
            fetchInfo_lo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo_lo.StartVertex = work.startVertex;
    }

    // hi half starts as a copy of lo; only its index pointer differs below
    SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;

    const simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        uint32_t i = 0;

        simd16scalari vIndex;

        if (IsIndexedT::value)
        {
            fetchInfo_lo.pIndices = work.pIB;
            fetchInfo_hi.pIndices = (int32_t *)((uint8_t *)fetchInfo_lo.pIndices + KNOB_SIMD_WIDTH * indexSize);    // 1/2 of KNOB_SIMD16_WIDTH
        }
        else
        {
            // auto-generated sequential indices for non-indexed draws
            vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);

            fetchInfo_lo.pIndices = (const int32_t *)&vIndex;
            fetchInfo_hi.pIndices = (const int32_t *)&vIndex + KNOB_SIMD_WIDTH;             // 1/2 of KNOB_SIMD16_WIDTH
        }

        fetchInfo_lo.CurInstance = instanceNum;
        fetchInfo_hi.CurInstance = instanceNum;

        vsContext_lo.InstanceID = instanceNum;
        vsContext_hi.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // GetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.

            simdmask *pvCutIndices_lo = nullptr;
            simdmask *pvCutIndices_hi = nullptr;

            if (IsIndexedT::value)
            {
                // simd16mask <=> simdmask[2]

                pvCutIndices_lo = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[0];
                pvCutIndices_hi = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[1];
            }

            simd16vertex &vout = pa.GetNextVsOutput();

            // both halves write into the same simd16 vertex; the hi half lands in the
            // alternate-offset lanes (vsContext_hi.AlternateOffset == 1)
            vsContext_lo.pVout = reinterpret_cast<simdvertex *>(&vout);
            vsContext_hi.pVout = reinterpret_cast<simdvertex *>(&vout);

            if (i < endVertex)
            {
                // 1. Execute FS/VS for a single SIMD.
                AR_BEGIN(FEFetchShader, pDC->drawId);
                state.pfnFetchFunc(fetchInfo_lo, vin_lo);

                if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of KNOB_SIMD16_WIDTH
                {
                    state.pfnFetchFunc(fetchInfo_hi, vin_hi);
                }
                AR_END(FEFetchShader, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext_lo.VertexID = fetchInfo_lo.VertexID;
                vsContext_hi.VertexID = fetchInfo_hi.VertexID;

                // Setup active mask for vertex shader.
                vsContext_lo.mask = GenerateMask(endVertex - i);
                vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
                    *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    AR_BEGIN(FEVertexShader, pDC->drawId);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);

                    if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of KNOB_SIMD16_WIDTH
                    {
                        state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi);
                    }
                    AR_END(FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];

                RDTSC_START(FEPAAssemble);
                bool assemble = pa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16);
                RDTSC_STOP(FEPAAssemble, 1, 0);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            // split the simd16 batch into two simd8 halves for the simd8 downstream stages
                            const uint32_t numPrims = pa.NumPrims();
                            const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
                            const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;

                            // NOTE(review): primMask_lo/primMask_hi are currently unused (only primMask is consumed below)
                            const uint32_t primMask = GenMask(numPrims);
                            const uint32_t primMask_lo = primMask & 255;
                            const uint32_t primMask_hi = (primMask >> 8) & 255;

                            const simd16scalari primID = pa.GetPrimID(work.startPrimID);
                            const simdscalari primID_lo = _simd16_extract_si(primID, 0);
                            const simdscalari primID_hi = _simd16_extract_si(primID, 1);

                            if (HasTessellationT::value)
                            {
                                pa.useAlternateOffset = false;
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);

                                if (numPrims_hi)
                                {
                                    pa.useAlternateOffset = true;
                                    TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
                                }
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                pa.useAlternateOffset = false;
                                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);

                                if (numPrims_hi)
                                {
                                    pa.useAlternateOffset = true;
                                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
                                }
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
#if 1
                                    pa.useAlternateOffset = false;
                                    StreamOut(pDC, pa, workerId, pSoPrimData, numPrims_lo, 0);

                                    if (numPrims_hi)
                                    {
                                        pa.useAlternateOffset = true;
                                        StreamOut(pDC, pa, workerId, pSoPrimData, numPrims_hi, 0);
                                    }
#else
                                    pa.useAlternateOffset = false;  // StreamOut() is SIMD16-compatible..
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
#endif
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);

                                    pa.useAlternateOffset = false;
                                    pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, primMask, primID, _simd16_setzero_si());
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            // advance fetch source by one full simd16's worth of vertices
            if (IsIndexedT::value)
            {
                fetchInfo_lo.pIndices = (int32_t *)((uint8_t*)fetchInfo_lo.pIndices + KNOB_SIMD16_WIDTH * indexSize);
                fetchInfo_hi.pIndices = (int32_t *)((uint8_t*)fetchInfo_hi.pIndices + KNOB_SIMD16_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
            }

            i += KNOB_SIMD16_WIDTH;
        }

        pa.Reset();
    }

#else
    // simd8 frontend path
    simdvertex vin;
    SWR_VS_CONTEXT vsContext;

    vsContext.pVin = &vin;

    SWR_FETCH_CONTEXT fetchInfo = { 0 };

    fetchInfo.pStreams = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex = 0;

    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }

    const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        uint32_t i = 0;

        if (IsIndexedT::value)
        {
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            // auto-generated sequential indices for non-indexed draws
            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // GetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVout = &vout;

            if (i < endVertex)
            {

                // 1. Execute FS/VS for a single SIMD.
                AR_BEGIN(FEFetchShader, pDC->drawId);
                state.pfnFetchFunc(fetchInfo, vin);
                AR_END(FEFetchShader, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Setup active mask for vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    AR_BEGIN(FEVertexShader, pDC->drawId);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
                    AR_END(FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // PaAssemble returns false if there is not enough verts to assemble.
                AR_BEGIN(FEPAAssemble, pDC->drawId);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                AR_END(FEPAAssemble, 1);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);

                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            // advance fetch source by one simd8's worth of vertices
            if (IsIndexedT::value)
            {
                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }

            i += KNOB_SIMD_WIDTH;
        }
        pa.Reset();
    }

#endif

    AR_END(FEProcessDraw, numPrims * work.numInstances);
}
1939
// Chooser functor consumed by TemplateArgUnroller to map a set of boolean
// template arguments onto the matching ProcessDraw instantiation.
struct FEDrawChooser
{
    typedef PFN_FE_WORK_FUNC FuncType;

    // returns the ProcessDraw instantiation for the given argument pack
    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessDraw<ArgsB...>;
    }
};
1950
1951
1952 // Selector for correct templated Draw front-end function
// Selects the templated ProcessDraw front-end function that matches the
// given runtime pipeline configuration flags.
PFN_FE_WORK_FUNC GetProcessDrawFunc(
    bool IsIndexed,
    bool IsCutIndexEnabled,
    bool HasTessellation,
    bool HasGeometryShader,
    bool HasStreamOut,
    bool HasRasterization)
{
    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
}