src/gallium/drivers/swr/rasterizer/core/frontend.cpp

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * @file frontend.cpp
  24 *
  25 * @brief Implementation for Frontend which handles vertex processing,
  26 *        primitive assembly, clipping, binning, etc.
  27 *
  28 ******************************************************************************/
  29
  30 #include "api.h"
  31 #include "frontend.h"
  32 #include "backend.h"
  33 #include "context.h"
  34 #include "rdtsc_core.h"
  35 #include "utils.h"
  36 #include "threads.h"
  37 #include "pa.h"
  38 #include "clip.h"
  39 #include "tilemgr.h"
  40 #include "tessellator.h"
  41 #include <limits>
  42
  43 //////////////////////////////////////////////////////////////////////////
  44 /// @brief Helper macro to generate a bitmask
  45 static INLINE uint32_t GenMask(uint32_t numBits)
  46 {
  47     SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
  48     return ((1U << numBits) - 1);
  49 }
  50
  51 //////////////////////////////////////////////////////////////////////////
  52 /// @brief FE handler for SwrSync.
  53 /// @param pContext - pointer to SWR context.
  54 /// @param pDC - pointer to draw context.
  55 /// @param workerId - thread's worker id. Even thread has a unique id.
  56 /// @param pUserData - Pointer to user data passed back to sync callback.
  57 /// @todo This should go away when we switch this to use compute threading.
  58 void ProcessSync(
  59     SWR_CONTEXT *pContext,
  60     DRAW_CONTEXT *pDC,
  61     uint32_t workerId,
  62     void *pUserData)
  63 {
  64     BE_WORK work;
  65     work.type = SYNC;
  66     work.pfnWork = ProcessSyncBE;
  67
  68     MacroTileMgr *pTileMgr = pDC->pTileMgr;
  69     pTileMgr->enqueue(0, 0, &work);
  70 }
  71
  72 //////////////////////////////////////////////////////////////////////////
  73 /// @brief FE handler for SwrDestroyContext.
  74 /// @param pContext - pointer to SWR context.
  75 /// @param pDC - pointer to draw context.
  76 /// @param workerId - thread's worker id. Even thread has a unique id.
  77 /// @param pUserData - Pointer to user data passed back to sync callback.
  78 void ProcessShutdown(
  79     SWR_CONTEXT *pContext,
  80     DRAW_CONTEXT *pDC,
  81     uint32_t workerId,
  82     void *pUserData)
  83 {
  84     BE_WORK work;
  85     work.type = SHUTDOWN;
  86     work.pfnWork = ProcessShutdownBE;
  87
  88     MacroTileMgr *pTileMgr = pDC->pTileMgr;
  89     // Enqueue at least 1 work item for each worker thread
  90     // account for number of numa nodes
  91     uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;
  92
  93     for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
  94     {
  95         for (uint32_t n = 0; n < numNumaNodes; ++n)
  96         {
  97             pTileMgr->enqueue(i, n, &work);
  98         }
  99     }
 100 }
 101
 102 //////////////////////////////////////////////////////////////////////////
 103 /// @brief FE handler for SwrClearRenderTarget.
 104 /// @param pContext - pointer to SWR context.
 105 /// @param pDC - pointer to draw context.
 106 /// @param workerId - thread's worker id. Even thread has a unique id.
 107 /// @param pUserData - Pointer to user data passed back to clear callback.
 108 /// @todo This should go away when we switch this to use compute threading.
 109 void ProcessClear(
 110     SWR_CONTEXT *pContext,
 111     DRAW_CONTEXT *pDC,
 112     uint32_t workerId,
 113     void *pUserData)
 114 {
 115     CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
 116     MacroTileMgr *pTileMgr = pDC->pTileMgr;
 117
 118     // queue a clear to each macro tile
 119     // compute macro tile bounds for the specified rect
 120     uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
 121     uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
 122     uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
 123     uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
 124
 125     BE_WORK work;
 126     work.type = CLEAR;
 127     work.pfnWork = ProcessClearBE;
 128     work.desc.clear = *pDesc;
 129
 130     for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
 131     {
 132         for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
 133         {
 134             pTileMgr->enqueue(x, y, &work);
 135         }
 136     }
 137 }
 138
 139 //////////////////////////////////////////////////////////////////////////
 140 /// @brief FE handler for SwrStoreTiles.
 141 /// @param pContext - pointer to SWR context.
 142 /// @param pDC - pointer to draw context.
 143 /// @param workerId - thread's worker id. Even thread has a unique id.
 144 /// @param pUserData - Pointer to user data passed back to callback.
 145 /// @todo This should go away when we switch this to use compute threading.
 146 void ProcessStoreTiles(
 147     SWR_CONTEXT *pContext,
 148     DRAW_CONTEXT *pDC,
 149     uint32_t workerId,
 150     void *pUserData)
 151 {
 152     AR_BEGIN(FEProcessStoreTiles, pDC->drawId);
 153     MacroTileMgr *pTileMgr = pDC->pTileMgr;
 154     STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
 155
 156     // queue a store to each macro tile
 157     // compute macro tile bounds for the specified rect
 158     uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
 159     uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
 160     uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
 161     uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
 162
 163     // store tiles
 164     BE_WORK work;
 165     work.type = STORETILES;
 166     work.pfnWork = ProcessStoreTilesBE;
 167     work.desc.storeTiles = *pDesc;
 168
 169     for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
 170     {
 171         for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
 172         {
 173             pTileMgr->enqueue(x, y, &work);
 174         }
 175     }
 176
 177     AR_END(FEProcessStoreTiles, 0);
 178 }
 179
 180 //////////////////////////////////////////////////////////////////////////
 181 /// @brief FE handler for SwrInvalidateTiles.
 182 /// @param pContext - pointer to SWR context.
 183 /// @param pDC - pointer to draw context.
 184 /// @param workerId - thread's worker id. Even thread has a unique id.
 185 /// @param pUserData - Pointer to user data passed back to callback.
 186 /// @todo This should go away when we switch this to use compute threading.
 187 void ProcessDiscardInvalidateTiles(
 188     SWR_CONTEXT *pContext,
 189     DRAW_CONTEXT *pDC,
 190     uint32_t workerId,
 191     void *pUserData)
 192 {
 193     AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
 194     DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
 195     MacroTileMgr *pTileMgr = pDC->pTileMgr;
 196
 197     // compute macro tile bounds for the specified rect
 198     uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
 199     uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
 200     uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
 201     uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;
 202
 203     if (pDesc->fullTilesOnly == false)
 204     {
 205         // include partial tiles
 206         macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
 207         macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
 208         macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
 209         macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
 210     }
 211
 212     SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
 213     SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);
 214
 215     macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
 216     macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);
 217
 218     // load tiles
 219     BE_WORK work;
 220     work.type = DISCARDINVALIDATETILES;
 221     work.pfnWork = ProcessDiscardInvalidateTilesBE;
 222     work.desc.discardInvalidateTiles = *pDesc;
 223
 224     for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
 225     {
 226         for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
 227         {
 228             pTileMgr->enqueue(x, y, &work);
 229         }
 230     }
 231
 232     AR_END(FEProcessInvalidateTiles, 0);
 233 }
 234
 235 //////////////////////////////////////////////////////////////////////////
 236 /// @brief Computes the number of primitives given the number of verts.
 237 /// @param mode - primitive topology for draw operation.
 238 /// @param numPrims - number of vertices or indices for draw.
 239 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
 240 uint32_t GetNumPrims(
 241     PRIMITIVE_TOPOLOGY mode,
 242     uint32_t numPrims)
 243 {
 244     switch (mode)
 245     {
 246     case TOP_POINT_LIST: return numPrims;
 247     case TOP_TRIANGLE_LIST: return numPrims / 3;
 248     case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
 249     case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
 250     case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
 251     case TOP_QUAD_LIST: return numPrims / 4;
 252     case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
 253     case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
 254     case TOP_LINE_LIST: return numPrims / 2;
 255     case TOP_LINE_LOOP: return numPrims;
 256     case TOP_RECT_LIST: return numPrims / 3;
 257     case TOP_LINE_LIST_ADJ: return numPrims / 4;
 258     case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
 259     case TOP_TRI_LIST_ADJ: return numPrims / 6;
 260     case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
 261
 262     case TOP_PATCHLIST_1:
 263     case TOP_PATCHLIST_2:
 264     case TOP_PATCHLIST_3:
 265     case TOP_PATCHLIST_4:
 266     case TOP_PATCHLIST_5:
 267     case TOP_PATCHLIST_6:
 268     case TOP_PATCHLIST_7:
 269     case TOP_PATCHLIST_8:
 270     case TOP_PATCHLIST_9:
 271     case TOP_PATCHLIST_10:
 272     case TOP_PATCHLIST_11:
 273     case TOP_PATCHLIST_12:
 274     case TOP_PATCHLIST_13:
 275     case TOP_PATCHLIST_14:
 276     case TOP_PATCHLIST_15:
 277     case TOP_PATCHLIST_16:
 278     case TOP_PATCHLIST_17:
 279     case TOP_PATCHLIST_18:
 280     case TOP_PATCHLIST_19:
 281     case TOP_PATCHLIST_20:
 282     case TOP_PATCHLIST_21:
 283     case TOP_PATCHLIST_22:
 284     case TOP_PATCHLIST_23:
 285     case TOP_PATCHLIST_24:
 286     case TOP_PATCHLIST_25:
 287     case TOP_PATCHLIST_26:
 288     case TOP_PATCHLIST_27:
 289     case TOP_PATCHLIST_28:
 290     case TOP_PATCHLIST_29:
 291     case TOP_PATCHLIST_30:
 292     case TOP_PATCHLIST_31:
 293     case TOP_PATCHLIST_32:
 294         return numPrims / (mode - TOP_PATCHLIST_BASE);
 295
 296     case TOP_POLYGON:
 297     case TOP_POINT_LIST_BF:
 298     case TOP_LINE_STRIP_CONT:
 299     case TOP_LINE_STRIP_BF:
 300     case TOP_LINE_STRIP_CONT_BF:
 301     case TOP_TRIANGLE_FAN_NOSTIPPLE:
 302     case TOP_TRI_STRIP_REVERSE:
 303     case TOP_PATCHLIST_BASE:
 304     case TOP_UNKNOWN:
 305         SWR_INVALID("Unsupported topology: %d", mode);
 306         return 0;
 307     }
 308
 309     return 0;
 310 }
 311
 312 //////////////////////////////////////////////////////////////////////////
 313 /// @brief Computes the number of verts given the number of primitives.
 314 /// @param mode - primitive topology for draw operation.
 315 /// @param numPrims - number of primitives for draw.
 316 uint32_t GetNumVerts(
 317     PRIMITIVE_TOPOLOGY mode,
 318     uint32_t numPrims)
 319 {
 320     switch (mode)
 321     {
 322     case TOP_POINT_LIST: return numPrims;
 323     case TOP_TRIANGLE_LIST: return numPrims * 3;
 324     case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
 325     case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
 326     case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
 327     case TOP_QUAD_LIST: return numPrims * 4;
 328     case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
 329     case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
 330     case TOP_LINE_LIST: return numPrims * 2;
 331     case TOP_LINE_LOOP: return numPrims;
 332     case TOP_RECT_LIST: return numPrims * 3;
 333     case TOP_LINE_LIST_ADJ: return numPrims * 4;
 334     case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
 335     case TOP_TRI_LIST_ADJ: return numPrims * 6;
 336     case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
 337
 338     case TOP_PATCHLIST_1:
 339     case TOP_PATCHLIST_2:
 340     case TOP_PATCHLIST_3:
 341     case TOP_PATCHLIST_4:
 342     case TOP_PATCHLIST_5:
 343     case TOP_PATCHLIST_6:
 344     case TOP_PATCHLIST_7:
 345     case TOP_PATCHLIST_8:
 346     case TOP_PATCHLIST_9:
 347     case TOP_PATCHLIST_10:
 348     case TOP_PATCHLIST_11:
 349     case TOP_PATCHLIST_12:
 350     case TOP_PATCHLIST_13:
 351     case TOP_PATCHLIST_14:
 352     case TOP_PATCHLIST_15:
 353     case TOP_PATCHLIST_16:
 354     case TOP_PATCHLIST_17:
 355     case TOP_PATCHLIST_18:
 356     case TOP_PATCHLIST_19:
 357     case TOP_PATCHLIST_20:
 358     case TOP_PATCHLIST_21:
 359     case TOP_PATCHLIST_22:
 360     case TOP_PATCHLIST_23:
 361     case TOP_PATCHLIST_24:
 362     case TOP_PATCHLIST_25:
 363     case TOP_PATCHLIST_26:
 364     case TOP_PATCHLIST_27:
 365     case TOP_PATCHLIST_28:
 366     case TOP_PATCHLIST_29:
 367     case TOP_PATCHLIST_30:
 368     case TOP_PATCHLIST_31:
 369     case TOP_PATCHLIST_32:
 370         return numPrims * (mode - TOP_PATCHLIST_BASE);
 371
 372     case TOP_POLYGON:
 373     case TOP_POINT_LIST_BF:
 374     case TOP_LINE_STRIP_CONT:
 375     case TOP_LINE_STRIP_BF:
 376     case TOP_LINE_STRIP_CONT_BF:
 377     case TOP_TRIANGLE_FAN_NOSTIPPLE:
 378     case TOP_TRI_STRIP_REVERSE:
 379     case TOP_PATCHLIST_BASE:
 380     case TOP_UNKNOWN:
 381         SWR_INVALID("Unsupported topology: %d", mode);
 382         return 0;
 383     }
 384
 385     return 0;
 386 }
 387
 388 //////////////////////////////////////////////////////////////////////////
 389 /// @brief Return number of verts per primitive.
 390 /// @param topology - topology
 391 /// @param includeAdjVerts - include adjacent verts in primitive vertices
 392 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
 393 {
 394     uint32_t numVerts = 0;
 395     switch (topology)
 396     {
 397     case TOP_POINT_LIST:
 398     case TOP_POINT_LIST_BF:
 399         numVerts = 1;
 400         break;
 401     case TOP_LINE_LIST:
 402     case TOP_LINE_STRIP:
 403     case TOP_LINE_LIST_ADJ:
 404     case TOP_LINE_LOOP:
 405     case TOP_LINE_STRIP_CONT:
 406     case TOP_LINE_STRIP_BF:
 407     case TOP_LISTSTRIP_ADJ:
 408         numVerts = 2;
 409         break;
 410     case TOP_TRIANGLE_LIST:
 411     case TOP_TRIANGLE_STRIP:
 412     case TOP_TRIANGLE_FAN:
 413     case TOP_TRI_LIST_ADJ:
 414     case TOP_TRI_STRIP_ADJ:
 415     case TOP_TRI_STRIP_REVERSE:
 416     case TOP_RECT_LIST:
 417         numVerts = 3;
 418         break;
 419     case TOP_QUAD_LIST:
 420     case TOP_QUAD_STRIP:
 421         numVerts = 4;
 422         break;
 423     case TOP_PATCHLIST_1:
 424     case TOP_PATCHLIST_2:
 425     case TOP_PATCHLIST_3:
 426     case TOP_PATCHLIST_4:
 427     case TOP_PATCHLIST_5:
 428     case TOP_PATCHLIST_6:
 429     case TOP_PATCHLIST_7:
 430     case TOP_PATCHLIST_8:
 431     case TOP_PATCHLIST_9:
 432     case TOP_PATCHLIST_10:
 433     case TOP_PATCHLIST_11:
 434     case TOP_PATCHLIST_12:
 435     case TOP_PATCHLIST_13:
 436     case TOP_PATCHLIST_14:
 437     case TOP_PATCHLIST_15:
 438     case TOP_PATCHLIST_16:
 439     case TOP_PATCHLIST_17:
 440     case TOP_PATCHLIST_18:
 441     case TOP_PATCHLIST_19:
 442     case TOP_PATCHLIST_20:
 443     case TOP_PATCHLIST_21:
 444     case TOP_PATCHLIST_22:
 445     case TOP_PATCHLIST_23:
 446     case TOP_PATCHLIST_24:
 447     case TOP_PATCHLIST_25:
 448     case TOP_PATCHLIST_26:
 449     case TOP_PATCHLIST_27:
 450     case TOP_PATCHLIST_28:
 451     case TOP_PATCHLIST_29:
 452     case TOP_PATCHLIST_30:
 453     case TOP_PATCHLIST_31:
 454     case TOP_PATCHLIST_32:
 455         numVerts = topology - TOP_PATCHLIST_BASE;
 456         break;
 457     default:
 458         SWR_INVALID("Unsupported topology: %d", topology);
 459         break;
 460     }
 461
 462     if (includeAdjVerts)
 463     {
 464         switch (topology)
 465         {
 466         case TOP_LISTSTRIP_ADJ:
 467         case TOP_LINE_LIST_ADJ: numVerts = 4; break;
 468         case TOP_TRI_STRIP_ADJ:
 469         case TOP_TRI_LIST_ADJ: numVerts = 6; break;
 470         default: break;
 471         }
 472     }
 473
 474     return numVerts;
 475 }
 476
 477 //////////////////////////////////////////////////////////////////////////
 478 /// @brief Generate mask from remaining work.
 479 /// @param numWorkItems - Number of items being worked on by a SIMD.
 480 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
 481 {
 482     uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
 483     uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
 484     return _simd_castps_si(vMask(mask));
 485 }
 486
 487 //////////////////////////////////////////////////////////////////////////
 488 /// @brief StreamOut - Streams vertex data out to SO buffers.
 489 ///        Generally, we are only streaming out a SIMDs worth of triangles.
 490 /// @param pDC - pointer to draw context.
 491 /// @param workerId - thread's worker id. Even thread has a unique id.
 492 /// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
 493 static void StreamOut(
 494     DRAW_CONTEXT* pDC,
 495     PA_STATE& pa,
 496     uint32_t workerId,
 497     uint32_t* pPrimData,
 498 #if USE_SIMD16_FRONTEND
 499     uint32_t numPrims_simd8,
 500 #endif
 501     uint32_t streamIndex)
 502 {
 503     SWR_CONTEXT *pContext = pDC->pContext;
 504
 505     AR_BEGIN(FEStreamout, pDC->drawId);
 506
 507     const API_STATE& state = GetApiState(pDC);
 508     const SWR_STREAMOUT_STATE &soState = state.soState;
 509
 510     uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
 511
 512     // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
 513     uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);
 514
 515     SWR_STREAMOUT_CONTEXT soContext = { 0 };
 516
 517     // Setup buffer state pointers.
 518     for (uint32_t i = 0; i < 4; ++i)
 519     {
 520         soContext.pBuffer[i] = &state.soBuffer[i];
 521     }
 522
 523 #if USE_SIMD16_FRONTEND
 524     uint32_t numPrims = numPrims_simd8;
 525 #else
 526     uint32_t numPrims = pa.NumPrims();
 527 #endif
 528
 529     for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
 530     {
 531         DWORD slot = 0;
 532         uint32_t soMask = soState.streamMasks[streamIndex];
 533
 534         // Write all entries into primitive data buffer for SOS.
 535         while (_BitScanForward(&slot, soMask))
 536         {
 537             __m128 attrib[MAX_NUM_VERTS_PER_PRIM];    // prim attribs (always 4 wide)
 538             uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
 539             pa.AssembleSingle(paSlot, primIndex, attrib);
 540
 541             // Attribute offset is relative offset from start of vertex.
 542             // Note that attributes start at slot 1 in the PA buffer. We need to write this
 543             // to prim data starting at slot 0. Which is why we do (slot - 1).
 544             // Also note: GL works slightly differently, and needs slot 0
 545             uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);
 546
 547             // Store each vertex's attrib at appropriate locations in pPrimData buffer.
 548             for (uint32_t v = 0; v < soVertsPerPrim; ++v)
 549             {
 550                 uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
 551
 552                 _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
 553             }
 554
 555             soMask &= ~(1 << slot);
 556         }
 557
 558         // Update pPrimData pointer
 559         soContext.pPrimData = pPrimData;
 560
 561         // Call SOS
 562         SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
 563         state.pfnSoFunc[streamIndex](soContext);
 564     }
 565
 566     // Update SO write offset. The driver provides memory for the update.
 567     for (uint32_t i = 0; i < 4; ++i)
 568     {
 569         if (state.soBuffer[i].pWriteOffset)
 570         {
 571             *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
 572         }
 573
 574         if (state.soBuffer[i].soWriteEnable)
 575         {
 576             pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
 577             pDC->dynState.SoWriteOffsetDirty[i] = true;
 578         }
 579     }
 580
 581     UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
 582     UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
 583
 584     AR_END(FEStreamout, 1);
 585 }
 586
 587 #if USE_SIMD16_FRONTEND
 588 //////////////////////////////////////////////////////////////////////////
 589 /// Is value an even number (a multiple of two)
 590 ///
 591 template <typename T>
 592 INLINE static bool IsEven(T value)
 593 {
 594     return (value & 1) == 0;
 595 }
 596
 597 //////////////////////////////////////////////////////////////////////////
 598 /// Round up value to an even number (a multiple of two)
 599 ///
 600 template <typename T>
 601 INLINE static T RoundUpEven(T value)
 602 {
 603     return (value + 1) & ~1;
 604 }
 605
 606 //////////////////////////////////////////////////////////////////////////
 607 /// Round down value to an even number (a multiple of two)
 608 ///
 609 template <typename T>
 610 INLINE static T RoundDownEven(T value)
 611 {
 612     return value & ~1;
 613 }
 614
 615 //////////////////////////////////////////////////////////////////////////
 616 /// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
 617 ///
 618 /// vertexCount is in terms of the source simdvertexes and must be even
 619 ///
 620 /// attribCount will limit the vector copies to those attribs specified
 621 ///
 622 /// note: the stride between vertexes is determinded by SWR_VTX_NUM_SLOTS
 623 ///
 624 void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount)
 625 {
 626     SWR_ASSERT(vertex);
 627     SWR_ASSERT(vertex_simd16);
 628     SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);
 629
 630     simd16vertex temp;
 631
 632     for (uint32_t i = 0; i < vertexCount; i += 2)
 633     {
 634         for (uint32_t j = 0; j < attribCount; j += 1)
 635         {
 636             for (uint32_t k = 0; k < 4; k += 1)
 637             {
 638                 temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);
 639
 640                 if ((i + 1) < vertexCount)
 641                 {
 642                     temp.attrib[j][k] = _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
 643                 }
 644             }
 645         }
 646
 647         for (uint32_t j = 0; j < attribCount; j += 1)
 648         {
 649             vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
 650         }
 651     }
 652 }
 653
 654 #endif
 655 //////////////////////////////////////////////////////////////////////////
 656 /// @brief Computes number of invocations. The current index represents
 657 ///        the start of the SIMD. The max index represents how much work
 658 ///        items are remaining. If there is less then a SIMD's xmin of work
 659 ///        then return the remaining amount of work.
 660 /// @param curIndex - The start index for the SIMD.
 661 /// @param maxIndex - The last index for all work items.
 662 static INLINE uint32_t GetNumInvocations(
 663     uint32_t curIndex,
 664     uint32_t maxIndex)
 665 {
 666     uint32_t remainder = (maxIndex - curIndex);
 667 #if USE_SIMD16_FRONTEND
 668     return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
 669 #else
 670     return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
 671 #endif
 672 }
 673
 674 //////////////////////////////////////////////////////////////////////////
 675 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
 676 ///        The geometry shader will loop over each active streamout buffer, assembling
 677 ///        primitives for the downstream stages. When multistream output is enabled,
 678 ///        the generated stream ID buffer from the GS needs to be converted to a cut
 679 ///        buffer for the primitive assembler.
 680 /// @param stream - stream id to generate the cut buffer for
 681 /// @param pStreamIdBase - pointer to the stream ID buffer
 682 /// @param numEmittedVerts - Number of total verts emitted by the GS
 683 /// @param pCutBuffer - output buffer to write cuts to
 684 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
 685 {
 686     SWR_ASSERT(stream < MAX_SO_STREAMS);
 687
 688     uint32_t numInputBytes = (numEmittedVerts * 2  + 7) / 8;
 689     uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
 690
 691     for (uint32_t b = 0; b < numOutputBytes; ++b)
 692     {
 693         uint8_t curInputByte = pStreamIdBase[2*b];
 694         uint8_t outByte = 0;
 695         for (uint32_t i = 0; i < 4; ++i)
 696         {
 697             if ((curInputByte & 0x3) != stream)
 698             {
 699                 outByte |= (1 << i);
 700             }
 701             curInputByte >>= 2;
 702         }
 703
 704         curInputByte = pStreamIdBase[2 * b + 1];
 705         for (uint32_t i = 0; i < 4; ++i)
 706         {
 707             if ((curInputByte & 0x3) != stream)
 708             {
 709                 outByte |= (1 << (i + 4));
 710             }
 711             curInputByte >>= 2;
 712         }
 713
 714         *pCutBuffer++ = outByte;
 715     }
 716 }
 717
// Geometry shader context shared by the GS stage code in this file:
// GeometryShaderStage fills in its stream/cut buffer pointers, PrimitiveID,
// input vertex attributes, InstanceID and mask, then passes it to the GS
// function.
// NOTE(review): THREAD presumably expands to a thread-local storage
// specifier (hence the "tls" prefix) - confirm against its definition.
THREAD SWR_GS_CONTEXT tlsGsContext;
 719
 720 template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
 721 struct GsBufferInfo
 722 {
 723     GsBufferInfo(const SWR_GS_STATE &gsState)
 724     {
 725         const uint32_t vertexCount = gsState.maxNumVerts;
 726         const uint32_t vertexStride = sizeof(SIMDVERTEX);
 727         const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / SIMD_WIDTH;
 728
 729         vertexPrimitiveStride = vertexStride * numSimdBatches;
 730         vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH;
 731
 732         if (gsState.isSingleStream)
 733         {
 734             cutPrimitiveStride = (vertexCount + 7) / 8;
 735             cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
 736
 737             streamCutPrimitiveStride = 0;
 738             streamCutInstanceStride = 0;
 739         }
 740         else
 741         {
 742             cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4);
 743             cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
 744
 745             streamCutPrimitiveStride = (vertexCount + 7) / 8;
 746             streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH;
 747         }
 748     }
 749
 750     uint32_t vertexPrimitiveStride;
 751     uint32_t vertexInstanceStride;
 752
 753     uint32_t cutPrimitiveStride;
 754     uint32_t cutInstanceStride;
 755
 756     uint32_t streamCutPrimitiveStride;
 757     uint32_t streamCutInstanceStride;
 758 };
 759
 760 //////////////////////////////////////////////////////////////////////////
 761 /// @brief Implements GS stage.
 762 /// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
 764 /// @param pa - The primitive assembly object.
 765 /// @param pGsOut - output stream for GS
 766 template <
 767     typename HasStreamOutT,
 768     typename HasRastT>
 769 static void GeometryShaderStage(
 770     DRAW_CONTEXT *pDC,
 771     uint32_t workerId,
 772     PA_STATE& pa,
 773     void* pGsOut,
 774     void* pCutBuffer,
 775     void* pStreamCutBuffer,
 776     uint32_t* pSoPrimData,
 777 #if USE_SIMD16_FRONTEND
 778     uint32_t numPrims_simd8,
 779 #endif
 780     simdscalari primID)
 781 {
 782     SWR_CONTEXT *pContext = pDC->pContext;
 783
 784     AR_BEGIN(FEGeometryShader, pDC->drawId);
 785
 786     const API_STATE& state = GetApiState(pDC);
 787     const SWR_GS_STATE* pState = &state.gsState;
 788
 789     SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
 790     SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");
 791
 792     tlsGsContext.pStream = (uint8_t*)pGsOut;
 793     tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
 794     tlsGsContext.PrimitiveID = primID;
 795
 796     uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
 797     simdvector attrib[MAX_ATTRIBUTES];
 798
 799     // assemble all attributes for the input primitive
 800     for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
 801     {
 802         uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
 803         pa.Assemble(attribSlot, attrib);
 804
 805         for (uint32_t i = 0; i < numVertsPerPrim; ++i)
 806         {
 807             tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
 808         }
 809     }
 810
 811     // assemble position
 812     pa.Assemble(VERTEX_POSITION_SLOT, attrib);
 813     for (uint32_t i = 0; i < numVertsPerPrim; ++i)
 814     {
 815         tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
 816     }
 817
 818 #if USE_SIMD16_FRONTEND
 819     const GsBufferInfo<simd16vertex, KNOB_SIMD16_WIDTH> bufferInfo(state.gsState);
 820 #else
 821     const GsBufferInfo<simdvertex, KNOB_SIMD_WIDTH> bufferInfo(state.gsState);
 822 #endif
 823
 824     // record valid prims from the frontend to avoid over binning the newly generated
 825     // prims from the GS
 826 #if USE_SIMD16_FRONTEND
 827     uint32_t numInputPrims = numPrims_simd8;
 828 #else
 829     uint32_t numInputPrims = pa.NumPrims();
 830 #endif
 831
 832     for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
 833     {
 834         tlsGsContext.InstanceID = instance;
 835         tlsGsContext.mask = GenerateMask(numInputPrims);
 836
 837         // execute the geometry shader
 838         state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
 839
 840         tlsGsContext.pStream += bufferInfo.vertexInstanceStride;
 841         tlsGsContext.pCutOrStreamIdBuffer += bufferInfo.cutInstanceStride;
 842     }
 843
 844     // set up new binner and state for the GS output topology
 845 #if USE_SIMD16_FRONTEND
 846     PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
 847     if (HasRastT::value)
 848     {
 849         switch (pState->outputTopology)
 850         {
 851         case TOP_TRIANGLE_STRIP:    pfnClipFunc = ClipTriangles_simd16; break;
 852         case TOP_LINE_STRIP:        pfnClipFunc = ClipLines_simd16; break;
 853         case TOP_POINT_LIST:        pfnClipFunc = ClipPoints_simd16; break;
 854         default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
 855         }
 856     }
 857
 858 #else
 859     PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
 860     if (HasRastT::value)
 861     {
 862         switch (pState->outputTopology)
 863         {
 864         case TOP_TRIANGLE_STRIP:    pfnClipFunc = ClipTriangles; break;
 865         case TOP_LINE_STRIP:        pfnClipFunc = ClipLines; break;
 866         case TOP_POINT_LIST:        pfnClipFunc = ClipPoints; break;
 867         default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
 868         }
 869     }
 870
 871 #endif
 872     // foreach input prim:
 873     // - setup a new PA based on the emitted verts for that prim
 874     // - loop over the new verts, calling PA to assemble each prim
 875     uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
 876     uint32_t* pPrimitiveId = (uint32_t*)&primID;
 877
 878     uint32_t totalPrimsGenerated = 0;
 879     for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
 880     {
 881         uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * bufferInfo.vertexPrimitiveStride;
 882         uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * bufferInfo.cutPrimitiveStride;
 883
 884         for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
 885         {
 886             uint32_t numEmittedVerts = pVertexCount[inputPrim];
 887             if (numEmittedVerts == 0)
 888             {
 889                 continue;
 890             }
 891
 892             uint8_t* pBase = pInstanceBase + instance * bufferInfo.vertexInstanceStride;
 893             uint8_t* pCutBase = pCutBufferBase + instance * bufferInfo.cutInstanceStride;
 894
 895             uint32_t numAttribs = state.feNumAttributes;
 896
 897             for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
 898             {
 899                 bool processCutVerts = false;
 900
 901                 uint8_t* pCutBuffer = pCutBase;
 902
 903                 // assign default stream ID, only relevant when GS is outputting a single stream
 904                 uint32_t streamID = 0;
 905                 if (pState->isSingleStream)
 906                 {
 907                     processCutVerts = true;
 908                     streamID = pState->singleStreamID;
 909                     if (streamID != stream) continue;
 910                 }
 911                 else
 912                 {
 913                     // early exit if this stream is not enabled for streamout
 914                     if (HasStreamOutT::value && !state.soState.streamEnable[stream])
 915                     {
 916                         continue;
 917                     }
 918
 919                     // multi-stream output, need to translate StreamID buffer to a cut buffer
 920                     ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
 921                     pCutBuffer = (uint8_t*)pStreamCutBuffer;
 922                     processCutVerts = false;
 923                 }
 924
 925 #if USE_SIMD16_FRONTEND
 926                 PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
 927
 928 #else
 929                 PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
 930
 931 #endif
 932                 while (gsPa.GetNextStreamOutput())
 933                 {
 934                     do
 935                     {
 936 #if USE_SIMD16_FRONTEND
 937                         simd16vector attrib_simd16[3];
 938
 939                         bool assemble = gsPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib_simd16);
 940
 941 #else
 942                         bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
 943
 944 #endif
 945                         if (assemble)
 946                         {
 947                             totalPrimsGenerated += gsPa.NumPrims();
 948
 949                             if (HasStreamOutT::value)
 950                             {
 951 #if USE_SIMD16_FRONTEND
 952                                 const uint32_t numPrims = gsPa.NumPrims();
 953                                 const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
 954                                 const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
 955
 956                                 gsPa.useAlternateOffset = false;
 957                                 StreamOut(pDC, gsPa, workerId, pSoPrimData, numPrims_lo, stream);
 958
 959                                 if (numPrims_hi)
 960                                 {
 961                                     gsPa.useAlternateOffset = true;
 962                                     StreamOut(pDC, gsPa, workerId, pSoPrimData, numPrims_hi, stream);
 963                                 }
 964 #else
 965                                 StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
 966 #endif
 967                             }
 968
 969                             if (HasRastT::value && state.soState.streamToRasterizer == stream)
 970                             {
 971 #if USE_SIMD16_FRONTEND
 972                                 simd16scalari vPrimId;
 973                                 // pull primitiveID from the GS output if available
 974                                 if (state.gsState.emitsPrimitiveID)
 975                                 {
 976                                     simd16vector primIdAttrib[3];
 977                                     gsPa.Assemble_simd16(VERTEX_PRIMID_SLOT, primIdAttrib);
 978                                     vPrimId = _simd16_castps_si(primIdAttrib[state.frontendState.topologyProvokingVertex].x);
 979                                 }
 980                                 else
 981                                 {
 982                                     vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);
 983                                 }
 984
 985                                 // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
 986                                 simd16scalari vViewPortIdx;
 987                                 if (state.gsState.emitsViewportArrayIndex)
 988                                 {
 989                                     simd16vector vpiAttrib[3];
 990                                     gsPa.Assemble_simd16(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);
 991
 992                                     // OOB indices => forced to zero.
 993                                     simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
 994                                     simd16scalari vClearMask = _simd16_cmplt_epi32(_simd16_castps_si(vpiAttrib[0].x), vNumViewports);
 995                                     vpiAttrib[0].x = _simd16_and_ps(_simd16_castsi_ps(vClearMask), vpiAttrib[0].x);
 996
 997                                     vViewPortIdx = _simd16_castps_si(vpiAttrib[0].x);
 998                                 }
 999                                 else
1000                                 {
1001                                     vViewPortIdx = _simd16_set1_epi32(0);
1002                                 }
1003
1004                                 gsPa.useAlternateOffset = false;
1005                                 pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
1006 #else
1007                                 simdscalari vPrimId;
1008                                 // pull primitiveID from the GS output if available
1009                                 if (state.gsState.emitsPrimitiveID)
1010                                 {
1011                                     simdvector primIdAttrib[3];
1012                                     gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
1013                                     vPrimId = _simd_castps_si(primIdAttrib[state.frontendState.topologyProvokingVertex].x);
1014                                 }
1015                                 else
1016                                 {
1017                                     vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
1018                                 }
1019
1020                                 // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
1021                                 simdscalari vViewPortIdx;
1022                                 if (state.gsState.emitsViewportArrayIndex)
1023                                 {
1024                                     simdvector vpiAttrib[3];
1025                                     gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);
1026
1027                                     // OOB indices => forced to zero.
1028                                     simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1029                                     simdscalari vClearMask = _simd_cmplt_epi32(_simd_castps_si(vpiAttrib[0].x), vNumViewports);
1030                                     vpiAttrib[0].x = _simd_and_ps(_simd_castsi_ps(vClearMask), vpiAttrib[0].x);
1031
1032                                     vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
1033                                 }
1034                                 else
1035                                 {
1036                                     vViewPortIdx = _simd_set1_epi32(0);
1037                                 }
1038
1039                                 pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
1040 #endif
1041                             }
1042                         }
1043                     } while (gsPa.NextPrim());
1044                 }
1045             }
1046         }
1047     }
1048
1049     // update GS pipeline stats
1050     UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
1051     UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
1052     AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
1053     AR_END(FEGeometryShader, 1);
1054 }
1055
1056 //////////////////////////////////////////////////////////////////////////
1057 /// @brief Allocate GS buffers
1058 /// @param pDC - pointer to draw context.
1059 /// @param state - API state
1060 /// @param ppGsOut - pointer to GS output buffer allocation
1061 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
1062 template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
1063 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
1064     void **ppStreamCutBuffer)
1065 {
1066     auto pArena = pDC->pArena;
1067     SWR_ASSERT(pArena != nullptr);
1068     SWR_ASSERT(state.gsState.gsEnable);
1069
1070     // allocate arena space to hold GS output verts
1071     // @todo pack attribs
1072     // @todo support multiple streams
1073
1074     const GsBufferInfo<SIMDVERTEX, SIMD_WIDTH> bufferInfo(state.gsState);
1075
1076     const uint32_t vertexBufferSize = state.gsState.instanceCount * bufferInfo.vertexInstanceStride;
1077
1078     *ppGsOut = pArena->AllocAligned(vertexBufferSize, SIMD_WIDTH * sizeof(float));
1079
1080     // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
1081     // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
1082
1083     // allocate space for temporary per-stream cut buffer if multi-stream is enabled
1084     if (state.gsState.isSingleStream)
1085     {
1086         const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
1087
1088         *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
1089         *ppStreamCutBuffer = nullptr;
1090     }
1091     else
1092     {
1093         const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
1094         const uint32_t streamCutBufferSize = state.gsState.instanceCount * bufferInfo.streamCutInstanceStride;
1095
1096         *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
1097         *ppStreamCutBuffer = pArena->AllocAligned(streamCutBufferSize, SIMD_WIDTH * sizeof(float));
1098     }
1099 }
1100
1101 //////////////////////////////////////////////////////////////////////////
1102 /// @brief Contains all data generated by the HS and passed to the
1103 /// tessellator and DS.
     ///
     /// One instance is allocated (zero-filled) by AllocateTessellationData and
     /// reused by TessellationStages, which also owns growing the pTxCtx and
     /// pDSOutput allocations.
1104 struct TessellationThreadLocalData
1105 {
1106     SWR_HS_CONTEXT hsContext;                   // context handed to the hull shader (pfnHsFunc)
1107     ScalarPatch patchData[KNOB_SIMD_WIDTH];     // HS output, one patch per SIMD lane; hsContext.pCPout points here
1108     void* pTxCtx;                               // memory block passed to TSInitCtx as tessellator context storage
1109     size_t tsCtxSize;                           // size passed to TSInitCtx and used when allocating pTxCtx
1110
1111     simdscalar* pDSOutput;                      // domain shader output buffer, reallocated when too small
1112     size_t numDSOutputVectors;                  // capacity of pDSOutput, counted in simdvector-sized entries
1113 };
1114
     /// Tessellation scratch data for the calling thread (declared THREAD).
     /// nullptr until AllocateTessellationData has run on this thread.
1115 THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
1116
1117 //////////////////////////////////////////////////////////////////////////
1118 /// @brief Allocate tessellation data for this worker thread.
1119 INLINE
1120 static void AllocateTessellationData(SWR_CONTEXT* pContext)
1121 {
1122     /// @TODO - Don't use thread local storage.  Use Worker local storage instead.
1123     if (gt_pTessellationThreadData == nullptr)
1124     {
1125         gt_pTessellationThreadData = (TessellationThreadLocalData*)
1126             AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
1127         memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
1128     }
1129 }
1130
1131 //////////////////////////////////////////////////////////////////////////
1132 /// @brief Implements Tessellation Stages.
1133 /// @param pDC - pointer to draw context.
1134 /// @param workerId - thread's worker id. Even thread has a unique id.
1135 /// @param pa - The primitive assembly object.
1136 /// @param pGsOut - output stream for GS
1137 template <
1138     typename HasGeometryShaderT,
1139     typename HasStreamOutT,
1140     typename HasRastT>
1141 static void TessellationStages(
1142     DRAW_CONTEXT *pDC,
1143     uint32_t workerId,
1144     PA_STATE& pa,
1145     void* pGsOut,
1146     void* pCutBuffer,
1147     void* pCutStreamBuffer,
1148     uint32_t* pSoPrimData,
1149 #if USE_SIMD16_FRONTEND
1150     uint32_t numPrims_simd8,
1151 #endif
1152     simdscalari primID)
1153 {
1154     SWR_CONTEXT *pContext = pDC->pContext;
1155     const API_STATE& state = GetApiState(pDC);
1156     const SWR_TS_STATE& tsState = state.tsState;
1157
1158     SWR_ASSERT(gt_pTessellationThreadData);
1159
1160     HANDLE tsCtx = TSInitCtx(
1161         tsState.domain,
1162         tsState.partitioning,
1163         tsState.tsOutputTopology,
1164         gt_pTessellationThreadData->pTxCtx,
1165         gt_pTessellationThreadData->tsCtxSize);
1166     if (tsCtx == nullptr)
1167     {
1168         gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
1169         tsCtx = TSInitCtx(
1170             tsState.domain,
1171             tsState.partitioning,
1172             tsState.tsOutputTopology,
1173             gt_pTessellationThreadData->pTxCtx,
1174             gt_pTessellationThreadData->tsCtxSize);
1175     }
1176     SWR_ASSERT(tsCtx);
1177
1178 #if USE_SIMD16_FRONTEND
1179     PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
1180     if (HasRastT::value)
1181     {
1182         switch (tsState.postDSTopology)
1183         {
1184         case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles_simd16; break;
1185         case TOP_LINE_LIST:     pfnClipFunc = ClipLines_simd16; break;
1186         case TOP_POINT_LIST:    pfnClipFunc = ClipPoints_simd16; break;
1187         default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
1188         }
1189     }
1190
1191 #else
1192     PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
1193     if (HasRastT::value)
1194     {
1195         switch (tsState.postDSTopology)
1196         {
1197         case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
1198         case TOP_LINE_LIST:     pfnClipFunc = ClipLines; break;
1199         case TOP_POINT_LIST:    pfnClipFunc = ClipPoints; break;
1200         default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
1201         }
1202     }
1203
1204 #endif
1205     SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
1206     hsContext.pCPout = gt_pTessellationThreadData->patchData;
1207     hsContext.PrimitiveID = primID;
1208
1209     uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
1210     // Max storage for one attribute for an entire simdprimitive
1211     simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];
1212
1213     // assemble all attributes for the input primitives
1214     for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
1215     {
1216         uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
1217         pa.Assemble(attribSlot, simdattrib);
1218
1219         for (uint32_t i = 0; i < numVertsPerPrim; ++i)
1220         {
1221             hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
1222         }
1223     }
1224
1225 #if defined(_DEBUG)
1226     memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
1227 #endif
1228
1229 #if USE_SIMD16_FRONTEND
1230     uint32_t numPrims = numPrims_simd8;
1231 #else
1232     uint32_t numPrims = pa.NumPrims();
1233 #endif
1234     hsContext.mask = GenerateMask(numPrims);
1235
1236     // Run the HS
1237     AR_BEGIN(FEHullShader, pDC->drawId);
1238     state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
1239     AR_END(FEHullShader, 0);
1240
1241     UPDATE_STAT_FE(HsInvocations, numPrims);
1242
1243     const uint32_t* pPrimId = (const uint32_t*)&primID;
1244
1245     for (uint32_t p = 0; p < numPrims; ++p)
1246     {
1247         // Run Tessellator
1248         SWR_TS_TESSELLATED_DATA tsData = { 0 };
1249         AR_BEGIN(FETessellation, pDC->drawId);
1250         TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
1251         AR_EVENT(TessPrimCount(1));
1252         AR_END(FETessellation, 0);
1253
1254         if (tsData.NumPrimitives == 0)
1255         {
1256             continue;
1257         }
1258         SWR_ASSERT(tsData.NumDomainPoints);
1259
1260         // Allocate DS Output memory
1261         uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
1262         size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
1263 #if USE_SIMD16_FRONTEND
1264         size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs;      // simd8 -> simd16, padding
1265 #else
1266         size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
1267 #endif
1268         if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
1269         {
1270             AlignedFree(gt_pTessellationThreadData->pDSOutput);
1271             gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
1272 #if USE_SIMD16_FRONTEND
1273             gt_pTessellationThreadData->numDSOutputVectors = RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs; // simd8 -> simd16, padding
1274 #else
1275             gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
1276 #endif
1277         }
1278         SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
1279         SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);
1280
1281 #if defined(_DEBUG)
1282         memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
1283 #endif
1284
1285         // Run Domain Shader
1286         SWR_DS_CONTEXT dsContext;
1287         dsContext.PrimitiveID = pPrimId[p];
1288         dsContext.pCpIn = &hsContext.pCPout[p];
1289         dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
1290         dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
1291         dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
1292 #if USE_SIMD16_FRONTEND
1293         dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations);      // simd8 -> simd16
1294 #else
1295         dsContext.vectorStride = requiredDSVectorInvocations;
1296 #endif
1297
1298         uint32_t dsInvocations = 0;
1299
1300         for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
1301         {
1302             dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
1303
1304             AR_BEGIN(FEDomainShader, pDC->drawId);
1305             state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
1306             AR_END(FEDomainShader, 0);
1307
1308             dsInvocations += KNOB_SIMD_WIDTH;
1309         }
1310         UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
1311
1312 #if USE_SIMD16_FRONTEND
1313         SWR_ASSERT(IsEven(dsContext.vectorStride));                             // simd8 -> simd16
1314
1315 #endif
1316         PA_TESS tessPa(
1317             pDC,
1318 #if USE_SIMD16_FRONTEND
1319             reinterpret_cast<const simd16scalar *>(dsContext.pOutputData),      // simd8 -> simd16
1320             dsContext.vectorStride / 2,                                         // simd8 -> simd16
1321 #else
1322             dsContext.pOutputData,
1323             dsContext.vectorStride,
1324 #endif
1325             tsState.numDsOutputAttribs,
1326             tsData.ppIndices,
1327             tsData.NumPrimitives,
1328             tsState.postDSTopology);
1329
1330         while (tessPa.HasWork())
1331         {
1332 #if USE_SIMD16_FRONTEND
1333             const uint32_t numPrims = tessPa.NumPrims();
1334             const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
1335             const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
1336
1337             const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
1338             const simdscalari primID_lo = _simd16_extract_si(primID, 0);
1339             const simdscalari primID_hi = _simd16_extract_si(primID, 1);
1340
1341 #endif
1342             if (HasGeometryShaderT::value)
1343             {
1344 #if USE_SIMD16_FRONTEND
1345                 tessPa.useAlternateOffset = false;
1346                 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_lo, primID_lo);
1347
1348                 if (numPrims_hi)
1349                 {
1350                     tessPa.useAlternateOffset = true;
1351                     GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_hi, primID_hi);
1352                 }
1353 #else
1354                 GeometryShaderStage<HasStreamOutT, HasRastT>(
1355                     pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
1356                     _simd_set1_epi32(dsContext.PrimitiveID));
1357 #endif
1358             }
1359             else
1360             {
1361                 if (HasStreamOutT::value)
1362                 {
1363 #if USE_SIMD16_FRONTEND
1364                     tessPa.useAlternateOffset = false;
1365                     StreamOut(pDC, tessPa, workerId, pSoPrimData, numPrims_lo, 0);
1366
1367                     if (numPrims_hi)
1368                     {
1369                         tessPa.useAlternateOffset = true;
1370                         StreamOut(pDC, tessPa, workerId, pSoPrimData, numPrims_hi, 0);
1371                     }
1372 #else
1373                     StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
1374 #endif
1375                 }
1376
1377                 if (HasRastT::value)
1378                 {
1379 #if USE_SIMD16_FRONTEND
1380                     simd16vector    prim_simd16[3]; // Only deal with triangles, lines, or points
1381 #else
1382                     simdvector      prim[3];        // Only deal with triangles, lines, or points
1383 #endif
1384                     AR_BEGIN(FEPAAssemble, pDC->drawId);
1385                     bool assemble =
1386 #if USE_SIMD16_FRONTEND
1387                         tessPa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16);
1388 #else
1389                         tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
1390 #endif
1391                     AR_END(FEPAAssemble, 1);
1392                     SWR_ASSERT(assemble);
1393
1394                     SWR_ASSERT(pfnClipFunc);
1395 #if USE_SIMD16_FRONTEND
1396                     tessPa.useAlternateOffset = false;
1397                     pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID, _simd16_set1_epi32(0));
1398 #else
1399                     pfnClipFunc(pDC, tessPa, workerId, prim,
1400                         GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
1401 #endif
1402                 }
1403             }
1404
1405             tessPa.NextPrim();
1406
1407         } // while (tessPa.HasWork())
1408     } // for (uint32_t p = 0; p < numPrims; ++p)
1409
1410 #if USE_SIMD16_FRONTEND
1411     if (gt_pTessellationThreadData->pDSOutput != nullptr)
1412     {
1413         AlignedFree(gt_pTessellationThreadData->pDSOutput);
1414         gt_pTessellationThreadData->pDSOutput = nullptr;
1415     }
1416     gt_pTessellationThreadData->numDSOutputVectors = 0;
1417
1418 #endif
1419     TSDestroyCtx(tsCtx);
1420 }
1421
1422 THREAD PA_STATE::SIMDVERTEX *pVertexStore = nullptr;   // vertex storage ProcessDraw hands to PA_FACTORY; (re)allocated there via AlignedMalloc when too small
1423 THREAD uint32_t gVertexStoreSize = 0;                  // capacity of pVertexStore in SIMDVERTEX elements; ProcessDraw grows it in fixed chunks
1424
1425 //////////////////////////////////////////////////////////////////////////
1426 /// @brief FE handler for SwrDraw.
1427 /// @tparam IsIndexedT - Is indexed drawing enabled
1428 /// @tparam HasTessellationT - Is tessellation enabled
1429 /// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
1430 /// @tparam HasStreamOutT - Is stream-out enabled
1431 /// @tparam HasRastT - Is rasterization enabled
1432 /// @param pContext - pointer to SWR context.
1433 /// @param pDC - pointer to draw context.
1434 /// @param workerId - thread's worker id.
1435 /// @param pUserData - Pointer to DRAW_WORK
1436 template <
1437     typename IsIndexedT,
1438     typename IsCutIndexEnabledT,
1439     typename HasTessellationT,
1440     typename HasGeometryShaderT,
1441     typename HasStreamOutT,
1442     typename HasRastT>
1443 void ProcessDraw(
1444     SWR_CONTEXT *pContext,
1445     DRAW_CONTEXT *pDC,
1446     uint32_t workerId,
1447     void *pUserData)
1448 {
1449
1450 #if KNOB_ENABLE_TOSS_POINTS
1451     if (KNOB_TOSS_QUEUE_FE)
1452     {
1453         return;
1454     }
1455 #endif
1456
1457     AR_BEGIN(FEProcessDraw, pDC->drawId);
1458
1459     DRAW_WORK&          work = *(DRAW_WORK*)pUserData;
1460     const API_STATE&    state = GetApiState(pDC);
1461
1462     uint32_t indexSize = 0;
1463     uint32_t endVertex = work.numVerts;
1464
1465     const int32_t* pLastRequestedIndex = nullptr;
1466     if (IsIndexedT::value)
1467     {
1468         switch (work.type)
1469         {
1470         case R32_UINT:
1471             indexSize = sizeof(uint32_t);
1472             pLastRequestedIndex = &(work.pIB[endVertex]);
1473             break;
1474         case R16_UINT:
1475             indexSize = sizeof(uint16_t);
1476             // nasty address offset to last index
1477             pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
1478             break;
1479         case R8_UINT:
1480             indexSize = sizeof(uint8_t);
1481             // nasty address offset to last index
1482             pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
1483             break;
1484         default:
1485             SWR_INVALID("Invalid work.type: %d", work.type);
1486         }
1487     }
1488     else
1489     {
1490         // No cuts, prune partial primitives.
1491         endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
1492     }
1493
1494 #if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
1495     uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
1496 #endif
1497
1498     void* pGsOut = nullptr;
1499     void* pCutBuffer = nullptr;
1500     void* pStreamCutBuffer = nullptr;
1501     if (HasGeometryShaderT::value)
1502     {
1503 #if USE_SIMD16_FRONTEND
1504         AllocateGsBuffers<simd16vertex, KNOB_SIMD16_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
1505 #else
1506         AllocateGsBuffers<simdvertex, KNOB_SIMD_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
1507 #endif
1508     }
1509
1510     if (HasTessellationT::value)
1511     {
1512         SWR_ASSERT(state.tsState.tsEnable == true);
1513         SWR_ASSERT(state.pfnHsFunc != nullptr);
1514         SWR_ASSERT(state.pfnDsFunc != nullptr);
1515
1516         AllocateTessellationData(pContext);
1517     }
1518     else
1519     {
1520         SWR_ASSERT(state.tsState.tsEnable == false);
1521         SWR_ASSERT(state.pfnHsFunc == nullptr);
1522         SWR_ASSERT(state.pfnDsFunc == nullptr);
1523     }
1524
1525     // allocate space for streamout input prim data
1526     uint32_t* pSoPrimData = nullptr;
1527     if (HasStreamOutT::value)
1528     {
1529         pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
1530     }
1531
1532     const uint32_t vertexCount = NumVertsPerPrim(state.topology, state.gsState.gsEnable);
1533
1534     SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);
1535
1536     // grow the vertex store for the PA as necessary
1537     if (gVertexStoreSize < vertexCount)
1538     {
1539         if (pVertexStore != nullptr)
1540         {
1541             AlignedFree(pVertexStore);
1542         }
1543
1544         while (gVertexStoreSize < vertexCount)
1545         {
1546 #if USE_SIMD16_FRONTEND
1547             gVertexStoreSize += 4;  // grow in chunks of 4 simd16vertex
1548 #else
1549             gVertexStoreSize += 8;  // grow in chunks of 8 simdvertex
1550 #endif
1551         }
1552
1553         SWR_ASSERT(gVertexStoreSize <= MAX_NUM_VERTS_PER_PRIM);
1554
1555         pVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(gVertexStoreSize * sizeof(pVertexStore[0]), 64));
1556
1557         SWR_ASSERT(pVertexStore != nullptr);
1558     }
1559
1560     // choose primitive assembler
1561     PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, pVertexStore, gVertexStoreSize);
1562     PA_STATE& pa = paFactory.GetPA();
1563
1564 #if USE_SIMD16_FRONTEND
1565     simdvertex          vin_lo;
1566     simdvertex          vin_hi;
1567     SWR_VS_CONTEXT      vsContext_lo;
1568     SWR_VS_CONTEXT      vsContext_hi;
1569
1570     vsContext_lo.pVin = &vin_lo;
1571     vsContext_hi.pVin = &vin_hi;
1572     vsContext_lo.AlternateOffset = 0;
1573     vsContext_hi.AlternateOffset = 1;
1574
1575     SWR_FETCH_CONTEXT   fetchInfo_lo = { 0 };
1576
1577     fetchInfo_lo.pStreams = &state.vertexBuffers[0];
1578     fetchInfo_lo.StartInstance = work.startInstance;
1579     fetchInfo_lo.StartVertex = 0;
1580
1581     if (IsIndexedT::value)
1582     {
1583         fetchInfo_lo.BaseVertex = work.baseVertex;
1584
1585         // if the entire index buffer isn't being consumed, set the last index
1586         // so that fetches < a SIMD wide will be masked off
1587         fetchInfo_lo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
1588         if (pLastRequestedIndex < fetchInfo_lo.pLastIndex)
1589         {
1590             fetchInfo_lo.pLastIndex = pLastRequestedIndex;
1591         }
1592     }
1593     else
1594     {
1595         fetchInfo_lo.StartVertex = work.startVertex;
1596     }
1597
1598     SWR_FETCH_CONTEXT   fetchInfo_hi = fetchInfo_lo;
1599
1600     const simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1601
1602     for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1603     {
1604         uint32_t  i = 0;
1605
1606         simd16scalari vIndex;
1607
1608         if (IsIndexedT::value)
1609         {
1610             fetchInfo_lo.pIndices = work.pIB;
1611             fetchInfo_hi.pIndices = (int32_t *)((uint8_t *)fetchInfo_lo.pIndices + KNOB_SIMD_WIDTH * indexSize);    // 1/2 of KNOB_SIMD16_WIDTH
1612         }
1613         else
1614         {
1615             vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);
1616
1617             fetchInfo_lo.pIndices = (const int32_t *)&vIndex;
1618             fetchInfo_hi.pIndices = (const int32_t *)&vIndex + KNOB_SIMD_WIDTH; // 1/2 of KNOB_SIMD16_WIDTH
1619         }
1620
1621         fetchInfo_lo.CurInstance = instanceNum;
1622         fetchInfo_hi.CurInstance = instanceNum;
1623
1624         vsContext_lo.InstanceID = instanceNum;
1625         vsContext_hi.InstanceID = instanceNum;
1626
1627         while (pa.HasWork())
1628         {
1629             // GetNextVsOutput currently has the side effect of updating some PA state machine state.
1630             // So we need to keep this outside of (i < endVertex) check.
1631
1632             simdmask *pvCutIndices_lo = nullptr;
1633             simdmask *pvCutIndices_hi = nullptr;
1634
1635             if (IsIndexedT::value)
1636             {
1637                 // simd16mask <=> simdmask[2]
1638
1639                 pvCutIndices_lo = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[0];
1640                 pvCutIndices_hi = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[1];
1641             }
1642
1643             simd16vertex &vout = pa.GetNextVsOutput();
1644
1645             vsContext_lo.pVout = reinterpret_cast<simdvertex *>(&vout);
1646             vsContext_hi.pVout = reinterpret_cast<simdvertex *>(&vout);
1647
1648             if (i < endVertex)
1649             {
1650                 // 1. Execute FS/VS for a single SIMD.
1651                 AR_BEGIN(FEFetchShader, pDC->drawId);
1652                 state.pfnFetchFunc(fetchInfo_lo, vin_lo);
1653
1654                 if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of KNOB_SIMD16_WIDTH
1655                 {
1656                     state.pfnFetchFunc(fetchInfo_hi, vin_hi);
1657                 }
1658                 AR_END(FEFetchShader, 0);
1659
1660                 // forward fetch generated vertex IDs to the vertex shader
1661                 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
1662                 vsContext_hi.VertexID = fetchInfo_hi.VertexID;
1663
1664                 // Setup active mask for vertex shader.
1665                 vsContext_lo.mask = GenerateMask(endVertex - i);
1666                 vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
1667
1668                 // forward cut mask to the PA
1669                 if (IsIndexedT::value)
1670                 {
1671                     *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
1672                     *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
1673                 }
1674
1675                 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
1676
1677 #if KNOB_ENABLE_TOSS_POINTS
1678                 if (!KNOB_TOSS_FETCH)
1679 #endif
1680                 {
1681                     AR_BEGIN(FEVertexShader, pDC->drawId);
1682                     state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);
1683
1684                     if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of KNOB_SIMD16_WIDTH
1685                     {
1686                         state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi);
1687                     }
1688                     AR_END(FEVertexShader, 0);
1689
1690                     UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
1691                 }
1692             }
1693
1694             // 2. Assemble primitives given the last two SIMD.
1695             do
1696             {
1697                 simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];
1698
1699                 RDTSC_START(FEPAAssemble);
1700                 bool assemble = pa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16);
1701                 RDTSC_STOP(FEPAAssemble, 1, 0);
1702
1703 #if KNOB_ENABLE_TOSS_POINTS
1704                 if (!KNOB_TOSS_FETCH)
1705 #endif
1706                 {
1707 #if KNOB_ENABLE_TOSS_POINTS
1708                     if (!KNOB_TOSS_VS)
1709 #endif
1710                     {
1711                         if (assemble)
1712                         {
1713                             UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
1714
1715                             const uint32_t numPrims = pa.NumPrims();
1716                             const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
1717                             const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
1718
1719                             const simd16scalari primID = pa.GetPrimID(work.startPrimID);
1720                             const simdscalari primID_lo = _simd16_extract_si(primID, 0);
1721                             const simdscalari primID_hi = _simd16_extract_si(primID, 1);
1722
1723                             if (HasTessellationT::value)
1724                             {
1725                                 pa.useAlternateOffset = false;
1726                                 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);
1727
1728                                 if (numPrims_hi)
1729                                 {
1730                                     pa.useAlternateOffset = true;
1731                                     TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
1732                                 }
1733                             }
1734                             else if (HasGeometryShaderT::value)
1735                             {
1736                                 pa.useAlternateOffset = false;
1737                                 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);
1738
1739                                 if (numPrims_hi)
1740                                 {
1741                                     pa.useAlternateOffset = true;
1742                                     GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
1743                                 }
1744                             }
1745                             else
1746                             {
1747                                 // If streamout is enabled then stream vertices out to memory.
1748                                 if (HasStreamOutT::value)
1749                                 {
1750 #if 1
1751                                     pa.useAlternateOffset = false;
1752                                     StreamOut(pDC, pa, workerId, pSoPrimData, numPrims_lo, 0);
1753
1754                                     if (numPrims_hi)
1755                                     {
1756                                         pa.useAlternateOffset = true;
1757                                         StreamOut(pDC, pa, workerId, pSoPrimData, numPrims_hi, 0);
1758                                     }
1759 #else
1760                                     pa.useAlternateOffset = false;
1761                                     StreamOut(pDC, pa, workerId, pSoPrimData, 0);
1762 #endif
1763                                 }
1764
1765                                 if (HasRastT::value)
1766                                 {
1767                                     SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
1768
1769                                     pa.useAlternateOffset = false;
1770                                     pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID, _simd16_setzero_si());
1771                                 }
1772                             }
1773                         }
1774                     }
1775                 }
1776             } while (pa.NextPrim());
1777
1778             if (IsIndexedT::value)
1779             {
1780                 fetchInfo_lo.pIndices = (int32_t *)((uint8_t*)fetchInfo_lo.pIndices + KNOB_SIMD16_WIDTH * indexSize);
1781                 fetchInfo_hi.pIndices = (int32_t *)((uint8_t*)fetchInfo_hi.pIndices + KNOB_SIMD16_WIDTH * indexSize);
1782             }
1783             else
1784             {
1785                 vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
1786             }
1787
1788             i += KNOB_SIMD16_WIDTH;
1789         }
1790
1791         pa.Reset();
1792     }
1793
1794 #else
1795     simdvertex          vin;
1796     SWR_VS_CONTEXT      vsContext;
1797
1798     vsContext.pVin = &vin;
1799
1800     SWR_FETCH_CONTEXT   fetchInfo = { 0 };
1801
1802     fetchInfo.pStreams = &state.vertexBuffers[0];
1803     fetchInfo.StartInstance = work.startInstance;
1804     fetchInfo.StartVertex = 0;
1805
1806     if (IsIndexedT::value)
1807     {
1808         fetchInfo.BaseVertex = work.baseVertex;
1809
1810         // if the entire index buffer isn't being consumed, set the last index
1811         // so that fetches < a SIMD wide will be masked off
1812         fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
1813         if (pLastRequestedIndex < fetchInfo.pLastIndex)
1814         {
1815             fetchInfo.pLastIndex = pLastRequestedIndex;
1816         }
1817     }
1818     else
1819     {
1820         fetchInfo.StartVertex = work.startVertex;
1821     }
1822
1823     const simdscalari   vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1824
1825     /// @todo: temporarily move instance loop in the FE to ensure SO ordering
1826     for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1827     {
1828         simdscalari vIndex;
1829         uint32_t  i = 0;
1830
1831         if (IsIndexedT::value)
1832         {
1833             fetchInfo.pIndices = work.pIB;
1834         }
1835         else
1836         {
1837             vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
1838             fetchInfo.pIndices = (const int32_t*)&vIndex;
1839         }
1840
1841         fetchInfo.CurInstance = instanceNum;
1842         vsContext.InstanceID = instanceNum;
1843
1844         while (pa.HasWork())
1845         {
1846             // GetNextVsOutput currently has the side effect of updating some PA state machine state.
1847             // So we need to keep this outside of (i < endVertex) check.
1848             simdmask* pvCutIndices = nullptr;
1849             if (IsIndexedT::value)
1850             {
1851                 pvCutIndices = &pa.GetNextVsIndices();
1852             }
1853
1854             simdvertex& vout = pa.GetNextVsOutput();
1855             vsContext.pVout = &vout;
1856
1857             if (i < endVertex)
1858             {
1859
1860                 // 1. Execute FS/VS for a single SIMD.
1861                 AR_BEGIN(FEFetchShader, pDC->drawId);
1862                 state.pfnFetchFunc(fetchInfo, vin);
1863                 AR_END(FEFetchShader, 0);
1864
1865                 // forward fetch generated vertex IDs to the vertex shader
1866                 vsContext.VertexID = fetchInfo.VertexID;
1867
1868                 // Setup active mask for vertex shader.
1869                 vsContext.mask = GenerateMask(endVertex - i);
1870
1871                 // forward cut mask to the PA
1872                 if (IsIndexedT::value)
1873                 {
1874                     *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
1875                 }
1876
1877                 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
1878
1879 #if KNOB_ENABLE_TOSS_POINTS
1880                 if (!KNOB_TOSS_FETCH)
1881 #endif
1882                 {
1883                     AR_BEGIN(FEVertexShader, pDC->drawId);
1884                     state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
1885                     AR_END(FEVertexShader, 0);
1886
1887                     UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
1888                 }
1889             }
1890
1891             // 2. Assemble primitives given the last two SIMD.
1892             do
1893             {
1894                 simdvector prim[MAX_NUM_VERTS_PER_PRIM];
1895                 // PaAssemble returns false if there is not enough verts to assemble.
1896                 AR_BEGIN(FEPAAssemble, pDC->drawId);
1897                 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
1898                 AR_END(FEPAAssemble, 1);
1899
1900 #if KNOB_ENABLE_TOSS_POINTS
1901                 if (!KNOB_TOSS_FETCH)
1902 #endif
1903                 {
1904 #if KNOB_ENABLE_TOSS_POINTS
1905                     if (!KNOB_TOSS_VS)
1906 #endif
1907                     {
1908                         if (assemble)
1909                         {
1910                             UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
1911
1912                             if (HasTessellationT::value)
1913                             {
1914                                 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
1915                                     pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
1916                             }
1917                             else if (HasGeometryShaderT::value)
1918                             {
1919                                 GeometryShaderStage<HasStreamOutT, HasRastT>(
1920                                     pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
1921                             }
1922                             else
1923                             {
1924                                 // If streamout is enabled then stream vertices out to memory.
1925                                 if (HasStreamOutT::value)
1926                                 {
1927                                     StreamOut(pDC, pa, workerId, pSoPrimData, 0);
1928                                 }
1929
1930                                 if (HasRastT::value)
1931                                 {
1932                                     SWR_ASSERT(pDC->pState->pfnProcessPrims);
1933
1934                                     pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
1935                                         GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
1936                                 }
1937                             }
1938                         }
1939                     }
1940                 }
1941             } while (pa.NextPrim());
1942
1943             if (IsIndexedT::value)
1944             {
1945                 fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
1946             }
1947             else
1948             {
1949                 vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
1950             }
1951
1952             i += KNOB_SIMD_WIDTH;
1953         }
1954         pa.Reset();
1955     }
1956
1957 #endif
1958
1959     AR_END(FEProcessDraw, numPrims * work.numInstances);
1960 }
1961
1962 struct FEDrawChooser
1963 {
1964     typedef PFN_FE_WORK_FUNC FuncType;
1965
1966     template <typename... ArgsB>
1967     static FuncType GetFunc()
1968     {
1969         return ProcessDraw<ArgsB...>;
1970     }
1971 };
1972
1973
1974 // Selector for correct templated Draw front-end function
1975 PFN_FE_WORK_FUNC GetProcessDrawFunc(
1976     bool IsIndexed,
1977     bool IsCutIndexEnabled,
1978     bool HasTessellation,
1979     bool HasGeometryShader,
1980     bool HasStreamOut,
1981     bool HasRasterization)
1982 {
1983     return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
1984 }