1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "utils.h"
36 #include "threads.h"
37 #include "pa.h"
38 #include "clip.h"
39 #include "tilemgr.h"
40 #include "tessellator.h"
41 #include <limits>
42
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Helper function to generate a bitmask
45 static INLINE uint32_t GenMask(uint32_t numBits)
46 {
47 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
48 return ((1U << numBits) - 1);
49 }
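// Illustrative note: GenMask(3) yields 0b111 and GenMask(0) yields 0. The
// assert above admits numBits == 32, for which (1U << 32) would be undefined
// behavior; in practice callers pass prim counts no larger than the SIMD width.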
50
51 //////////////////////////////////////////////////////////////////////////
52 /// @brief FE handler for SwrSync.
53 /// @param pContext - pointer to SWR context.
54 /// @param pDC - pointer to draw context.
55 /// @param workerId - thread's worker id. Each thread has a unique id.
56 /// @param pUserData - Pointer to user data passed back to sync callback.
57 /// @todo This should go away when we switch this to use compute threading.
58 void ProcessSync(
59 SWR_CONTEXT *pContext,
60 DRAW_CONTEXT *pDC,
61 uint32_t workerId,
62 void *pUserData)
63 {
64 BE_WORK work;
65 work.type = SYNC;
66 work.pfnWork = ProcessSyncBE;
67
68 MacroTileMgr *pTileMgr = pDC->pTileMgr;
69 pTileMgr->enqueue(0, 0, &work);
70 }
71
72 //////////////////////////////////////////////////////////////////////////
73 /// @brief FE handler for SwrDestroyContext.
74 /// @param pContext - pointer to SWR context.
75 /// @param pDC - pointer to draw context.
76 /// @param workerId - thread's worker id. Each thread has a unique id.
77 /// @param pUserData - Pointer to user data passed back to sync callback.
78 void ProcessShutdown(
79 SWR_CONTEXT *pContext,
80 DRAW_CONTEXT *pDC,
81 uint32_t workerId,
82 void *pUserData)
83 {
84 BE_WORK work;
85 work.type = SHUTDOWN;
86 work.pfnWork = ProcessShutdownBE;
87
88 MacroTileMgr *pTileMgr = pDC->pTileMgr;
89 // Enqueue at least 1 work item for each worker thread
90 // account for number of numa nodes
91 uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;
92
93 for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
94 {
95 for (uint32_t n = 0; n < numNumaNodes; ++n)
96 {
97 pTileMgr->enqueue(i, n, &work);
98 }
99 }
100 }
101
102 //////////////////////////////////////////////////////////////////////////
103 /// @brief FE handler for SwrClearRenderTarget.
104 /// @param pContext - pointer to SWR context.
105 /// @param pDC - pointer to draw context.
106 /// @param workerId - thread's worker id. Each thread has a unique id.
107 /// @param pUserData - Pointer to user data passed back to clear callback.
108 /// @todo This should go away when we switch this to use compute threading.
109 void ProcessClear(
110 SWR_CONTEXT *pContext,
111 DRAW_CONTEXT *pDC,
112 uint32_t workerId,
113 void *pUserData)
114 {
115 CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
116 MacroTileMgr *pTileMgr = pDC->pTileMgr;
117
118 // queue a clear to each macro tile
119 // compute macro tile bounds for the specified rect
120 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
121 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
122 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
123 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
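// Worked example (knob value assumed for illustration): with
// KNOB_MACROTILE_X_DIM == 64 and a clear rect of [xmin = 0, xmax = 130),
// macroTileXMin = 0 and macroTileXMax = 129 / 64 = 2, so three columns of
// macrotiles receive the clear work enqueued below.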
124
125 BE_WORK work;
126 work.type = CLEAR;
127 work.pfnWork = ProcessClearBE;
128 work.desc.clear = *pDesc;
129
130 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
131 {
132 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
133 {
134 pTileMgr->enqueue(x, y, &work);
135 }
136 }
137 }
138
139 //////////////////////////////////////////////////////////////////////////
140 /// @brief FE handler for SwrStoreTiles.
141 /// @param pContext - pointer to SWR context.
142 /// @param pDC - pointer to draw context.
143 /// @param workerId - thread's worker id. Each thread has a unique id.
144 /// @param pUserData - Pointer to user data passed back to callback.
145 /// @todo This should go away when we switch this to use compute threading.
146 void ProcessStoreTiles(
147 SWR_CONTEXT *pContext,
148 DRAW_CONTEXT *pDC,
149 uint32_t workerId,
150 void *pUserData)
151 {
152 AR_BEGIN(FEProcessStoreTiles, pDC->drawId);
153 MacroTileMgr *pTileMgr = pDC->pTileMgr;
154 STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
155
156 // queue a store to each macro tile
157 // compute macro tile bounds for the specified rect
158 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
159 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
160 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
161 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
162
163 // store tiles
164 BE_WORK work;
165 work.type = STORETILES;
166 work.pfnWork = ProcessStoreTilesBE;
167 work.desc.storeTiles = *pDesc;
168
169 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
170 {
171 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
172 {
173 pTileMgr->enqueue(x, y, &work);
174 }
175 }
176
177 AR_END(FEProcessStoreTiles, 0);
178 }
179
180 //////////////////////////////////////////////////////////////////////////
181 /// @brief FE handler for SwrInvalidateTiles.
182 /// @param pContext - pointer to SWR context.
183 /// @param pDC - pointer to draw context.
184 /// @param workerId - thread's worker id. Each thread has a unique id.
185 /// @param pUserData - Pointer to user data passed back to callback.
186 /// @todo This should go away when we switch this to use compute threading.
187 void ProcessDiscardInvalidateTiles(
188 SWR_CONTEXT *pContext,
189 DRAW_CONTEXT *pDC,
190 uint32_t workerId,
191 void *pUserData)
192 {
193 AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
194 DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
195 MacroTileMgr *pTileMgr = pDC->pTileMgr;
196
197 // compute macro tile bounds for the specified rect
198 uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
199 uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
200 uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
201 uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;
202
203 if (pDesc->fullTilesOnly == false)
204 {
205 // include partial tiles
206 macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
207 macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
208 macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
209 macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
210 }
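// Illustrative example (knob value assumed): for a rect of [xmin = 10, xmax = 100)
// and KNOB_MACROTILE_X_DIM == 64, the fullTilesOnly math gives
// macroTileXMin = 1 and macroTileXMax = 0 (no column is fully covered, so the
// loops below do nothing), while the partial-tile path gives 0 and 1.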
211
212 SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
213 SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);
214
215 macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
216 macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);
217
218 // load tiles
219 BE_WORK work;
220 work.type = DISCARDINVALIDATETILES;
221 work.pfnWork = ProcessDiscardInvalidateTilesBE;
222 work.desc.discardInvalidateTiles = *pDesc;
223
224 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
225 {
226 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
227 {
228 pTileMgr->enqueue(x, y, &work);
229 }
230 }
231
232 AR_END(FEProcessInvalidateTiles, 0);
233 }
234
235 //////////////////////////////////////////////////////////////////////////
236 /// @brief Computes the number of primitives given the number of verts.
237 /// @param mode - primitive topology for draw operation.
238 /// @param numPrims - number of vertices or indices for draw.
239 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
240 uint32_t GetNumPrims(
241 PRIMITIVE_TOPOLOGY mode,
242 uint32_t numPrims)
243 {
244 switch (mode)
245 {
246 case TOP_POINT_LIST: return numPrims;
247 case TOP_TRIANGLE_LIST: return numPrims / 3;
248 case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
249 case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
250 case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
251 case TOP_QUAD_LIST: return numPrims / 4;
252 case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
253 case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
254 case TOP_LINE_LIST: return numPrims / 2;
255 case TOP_LINE_LOOP: return numPrims;
256 case TOP_RECT_LIST: return numPrims / 3;
257 case TOP_LINE_LIST_ADJ: return numPrims / 4;
258 case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
259 case TOP_TRI_LIST_ADJ: return numPrims / 6;
260 case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
261
262 case TOP_PATCHLIST_1:
263 case TOP_PATCHLIST_2:
264 case TOP_PATCHLIST_3:
265 case TOP_PATCHLIST_4:
266 case TOP_PATCHLIST_5:
267 case TOP_PATCHLIST_6:
268 case TOP_PATCHLIST_7:
269 case TOP_PATCHLIST_8:
270 case TOP_PATCHLIST_9:
271 case TOP_PATCHLIST_10:
272 case TOP_PATCHLIST_11:
273 case TOP_PATCHLIST_12:
274 case TOP_PATCHLIST_13:
275 case TOP_PATCHLIST_14:
276 case TOP_PATCHLIST_15:
277 case TOP_PATCHLIST_16:
278 case TOP_PATCHLIST_17:
279 case TOP_PATCHLIST_18:
280 case TOP_PATCHLIST_19:
281 case TOP_PATCHLIST_20:
282 case TOP_PATCHLIST_21:
283 case TOP_PATCHLIST_22:
284 case TOP_PATCHLIST_23:
285 case TOP_PATCHLIST_24:
286 case TOP_PATCHLIST_25:
287 case TOP_PATCHLIST_26:
288 case TOP_PATCHLIST_27:
289 case TOP_PATCHLIST_28:
290 case TOP_PATCHLIST_29:
291 case TOP_PATCHLIST_30:
292 case TOP_PATCHLIST_31:
293 case TOP_PATCHLIST_32:
294 return numPrims / (mode - TOP_PATCHLIST_BASE);
295
296 case TOP_POLYGON:
297 case TOP_POINT_LIST_BF:
298 case TOP_LINE_STRIP_CONT:
299 case TOP_LINE_STRIP_BF:
300 case TOP_LINE_STRIP_CONT_BF:
301 case TOP_TRIANGLE_FAN_NOSTIPPLE:
302 case TOP_TRI_STRIP_REVERSE:
303 case TOP_PATCHLIST_BASE:
304 case TOP_UNKNOWN:
305 SWR_INVALID("Unsupported topology: %d", mode);
306 return 0;
307 }
308
309 return 0;
310 }
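// Illustrative examples: GetNumPrims(TOP_TRIANGLE_LIST, 9) returns 3,
// GetNumPrims(TOP_TRIANGLE_STRIP, 9) returns 7, and GetNumPrims(TOP_PATCHLIST_4, 8)
// returns 2 (8 verts / 4 control points per patch). GetNumVerts below is the
// inverse mapping from primitive count back to vertex count.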
311
312 //////////////////////////////////////////////////////////////////////////
313 /// @brief Computes the number of verts given the number of primitives.
314 /// @param mode - primitive topology for draw operation.
315 /// @param numPrims - number of primitives for draw.
316 uint32_t GetNumVerts(
317 PRIMITIVE_TOPOLOGY mode,
318 uint32_t numPrims)
319 {
320 switch (mode)
321 {
322 case TOP_POINT_LIST: return numPrims;
323 case TOP_TRIANGLE_LIST: return numPrims * 3;
324 case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
325 case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
326 case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
327 case TOP_QUAD_LIST: return numPrims * 4;
328 case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
329 case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
330 case TOP_LINE_LIST: return numPrims * 2;
331 case TOP_LINE_LOOP: return numPrims;
332 case TOP_RECT_LIST: return numPrims * 3;
333 case TOP_LINE_LIST_ADJ: return numPrims * 4;
334 case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
335 case TOP_TRI_LIST_ADJ: return numPrims * 6;
336 case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
337
338 case TOP_PATCHLIST_1:
339 case TOP_PATCHLIST_2:
340 case TOP_PATCHLIST_3:
341 case TOP_PATCHLIST_4:
342 case TOP_PATCHLIST_5:
343 case TOP_PATCHLIST_6:
344 case TOP_PATCHLIST_7:
345 case TOP_PATCHLIST_8:
346 case TOP_PATCHLIST_9:
347 case TOP_PATCHLIST_10:
348 case TOP_PATCHLIST_11:
349 case TOP_PATCHLIST_12:
350 case TOP_PATCHLIST_13:
351 case TOP_PATCHLIST_14:
352 case TOP_PATCHLIST_15:
353 case TOP_PATCHLIST_16:
354 case TOP_PATCHLIST_17:
355 case TOP_PATCHLIST_18:
356 case TOP_PATCHLIST_19:
357 case TOP_PATCHLIST_20:
358 case TOP_PATCHLIST_21:
359 case TOP_PATCHLIST_22:
360 case TOP_PATCHLIST_23:
361 case TOP_PATCHLIST_24:
362 case TOP_PATCHLIST_25:
363 case TOP_PATCHLIST_26:
364 case TOP_PATCHLIST_27:
365 case TOP_PATCHLIST_28:
366 case TOP_PATCHLIST_29:
367 case TOP_PATCHLIST_30:
368 case TOP_PATCHLIST_31:
369 case TOP_PATCHLIST_32:
370 return numPrims * (mode - TOP_PATCHLIST_BASE);
371
372 case TOP_POLYGON:
373 case TOP_POINT_LIST_BF:
374 case TOP_LINE_STRIP_CONT:
375 case TOP_LINE_STRIP_BF:
376 case TOP_LINE_STRIP_CONT_BF:
377 case TOP_TRIANGLE_FAN_NOSTIPPLE:
378 case TOP_TRI_STRIP_REVERSE:
379 case TOP_PATCHLIST_BASE:
380 case TOP_UNKNOWN:
381 SWR_INVALID("Unsupported topology: %d", mode);
382 return 0;
383 }
384
385 return 0;
386 }
387
388 //////////////////////////////////////////////////////////////////////////
389 /// @brief Return number of verts per primitive.
390 /// @param topology - topology
391 /// @param includeAdjVerts - include adjacent verts in primitive vertices
392 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
393 {
394 uint32_t numVerts = 0;
395 switch (topology)
396 {
397 case TOP_POINT_LIST:
398 case TOP_POINT_LIST_BF:
399 numVerts = 1;
400 break;
401 case TOP_LINE_LIST:
402 case TOP_LINE_STRIP:
403 case TOP_LINE_LIST_ADJ:
404 case TOP_LINE_LOOP:
405 case TOP_LINE_STRIP_CONT:
406 case TOP_LINE_STRIP_BF:
407 case TOP_LISTSTRIP_ADJ:
408 numVerts = 2;
409 break;
410 case TOP_TRIANGLE_LIST:
411 case TOP_TRIANGLE_STRIP:
412 case TOP_TRIANGLE_FAN:
413 case TOP_TRI_LIST_ADJ:
414 case TOP_TRI_STRIP_ADJ:
415 case TOP_TRI_STRIP_REVERSE:
416 case TOP_RECT_LIST:
417 numVerts = 3;
418 break;
419 case TOP_QUAD_LIST:
420 case TOP_QUAD_STRIP:
421 numVerts = 4;
422 break;
423 case TOP_PATCHLIST_1:
424 case TOP_PATCHLIST_2:
425 case TOP_PATCHLIST_3:
426 case TOP_PATCHLIST_4:
427 case TOP_PATCHLIST_5:
428 case TOP_PATCHLIST_6:
429 case TOP_PATCHLIST_7:
430 case TOP_PATCHLIST_8:
431 case TOP_PATCHLIST_9:
432 case TOP_PATCHLIST_10:
433 case TOP_PATCHLIST_11:
434 case TOP_PATCHLIST_12:
435 case TOP_PATCHLIST_13:
436 case TOP_PATCHLIST_14:
437 case TOP_PATCHLIST_15:
438 case TOP_PATCHLIST_16:
439 case TOP_PATCHLIST_17:
440 case TOP_PATCHLIST_18:
441 case TOP_PATCHLIST_19:
442 case TOP_PATCHLIST_20:
443 case TOP_PATCHLIST_21:
444 case TOP_PATCHLIST_22:
445 case TOP_PATCHLIST_23:
446 case TOP_PATCHLIST_24:
447 case TOP_PATCHLIST_25:
448 case TOP_PATCHLIST_26:
449 case TOP_PATCHLIST_27:
450 case TOP_PATCHLIST_28:
451 case TOP_PATCHLIST_29:
452 case TOP_PATCHLIST_30:
453 case TOP_PATCHLIST_31:
454 case TOP_PATCHLIST_32:
455 numVerts = topology - TOP_PATCHLIST_BASE;
456 break;
457 default:
458 SWR_INVALID("Unsupported topology: %d", topology);
459 break;
460 }
461
462 if (includeAdjVerts)
463 {
464 switch (topology)
465 {
466 case TOP_LISTSTRIP_ADJ:
467 case TOP_LINE_LIST_ADJ: numVerts = 4; break;
468 case TOP_TRI_STRIP_ADJ:
469 case TOP_TRI_LIST_ADJ: numVerts = 6; break;
470 default: break;
471 }
472 }
473
474 return numVerts;
475 }
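// Illustrative examples: NumVertsPerPrim(TOP_TRIANGLE_LIST, false) returns 3,
// while NumVertsPerPrim(TOP_TRI_LIST_ADJ, true) returns 6 because the
// adjacency override above widens the count to include the adjacent vertices.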
476
477 //////////////////////////////////////////////////////////////////////////
478 /// @brief Generate mask from remaining work.
479 /// @param numItemsRemaining - Number of work items remaining to be processed by the SIMD.
480 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
481 {
482 uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
483 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
484 return _simd_castps_si(vMask(mask));
485 }
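// Illustrative (assuming KNOB_SIMD_WIDTH == 8): GenerateMask(3) builds the
// integer mask 0b00000111, so only the first three SIMD lanes are active for
// the final partial iteration of a fetch/shade loop.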
486
487 //////////////////////////////////////////////////////////////////////////
488 /// @brief StreamOut - Streams vertex data out to SO buffers.
489 /// Generally, we are only streaming out a SIMD's worth of primitives.
490 /// @param pDC - pointer to draw context.
491 /// @param workerId - thread's worker id. Each thread has a unique id.
492 /// @param streamIndex - index of the SO stream to write out; pa supplies the assembled prims and pPrimData is scratch space for their attributes.
493 static void StreamOut(
494 DRAW_CONTEXT* pDC,
495 PA_STATE& pa,
496 uint32_t workerId,
497 uint32_t* pPrimData,
498 uint32_t streamIndex)
499 {
500 SWR_CONTEXT *pContext = pDC->pContext;
501
502 AR_BEGIN(FEStreamout, pDC->drawId);
503
504 const API_STATE& state = GetApiState(pDC);
505 const SWR_STREAMOUT_STATE &soState = state.soState;
506
507 uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
508
509 // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
510 uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);
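// For example (illustrative): with SWR_VTX_NUM_SLOTS == 32 this works out to
// 32 slots * 4 floats == 128 dwords per vertex, independent of how many
// attributes the streamout state actually enables.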
511
512 SWR_STREAMOUT_CONTEXT soContext = { 0 };
513
514 // Setup buffer state pointers.
515 for (uint32_t i = 0; i < 4; ++i)
516 {
517 soContext.pBuffer[i] = &state.soBuffer[i];
518 }
519
520 uint32_t numPrims = pa.NumPrims();
521
522 for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
523 {
524 DWORD slot = 0;
525 uint32_t soMask = soState.streamMasks[streamIndex];
526
527 // Write all entries into primitive data buffer for SOS.
528 while (_BitScanForward(&slot, soMask))
529 {
530 __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
531 uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
532 pa.AssembleSingle(paSlot, primIndex, attrib);
533
534 // Attribute offset is relative offset from start of vertex.
535 // Note that for DX, attributes start at slot 1 in the PA buffer and are written
536 // to prim data starting at slot 0, i.e. at (slot - 1). GL works slightly
537 // differently and needs slot 0, which is why the offset below uses slot directly.
538 uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);
539
540 // Store each vertex's attrib at appropriate locations in pPrimData buffer.
541 for (uint32_t v = 0; v < soVertsPerPrim; ++v)
542 {
543 uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
544
545 _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
546 }
547
548 soMask &= ~(1 << slot);
549 }
550
551 // Update pPrimData pointer
552 soContext.pPrimData = pPrimData;
553
554 // Call SOS
555 SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
556 state.pfnSoFunc[streamIndex](soContext);
557 }
558
559 // Update SO write offset. The driver provides memory for the update.
560 for (uint32_t i = 0; i < 4; ++i)
561 {
562 if (state.soBuffer[i].pWriteOffset)
563 {
564 *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
565 }
566
567 if (state.soBuffer[i].soWriteEnable)
568 {
569 pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
570 pDC->dynState.SoWriteOffsetDirty[i] = true;
571 }
572 }
573
574 UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
575 UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
576
577 AR_END(FEStreamout, 1);
578 }
579
580 #if USE_SIMD16_FRONTEND
581 //////////////////////////////////////////////////////////////////////////
582 /// Is value an even number (a multiple of two)
583 ///
584 template <typename T>
585 INLINE static bool IsEven(T value)
586 {
587 return (value & 1) == 0;
588 }
589
590 //////////////////////////////////////////////////////////////////////////
591 /// Round up value to an even number (a multiple of two)
592 ///
593 template <typename T>
594 INLINE static T RoundUpEven(T value)
595 {
596 return (value + 1) & ~1;
597 }
598
599 //////////////////////////////////////////////////////////////////////////
600 /// Round down value to an even number (a multiple of two)
601 ///
602 template <typename T>
603 INLINE static T RoundDownEven(T value)
604 {
605 return value & ~1;
606 }
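// Illustrative: IsEven(6) is true, RoundUpEven(5) and RoundUpEven(6) both
// return 6, and RoundDownEven(5) returns 4. These helpers pad simd8 vertex
// counts up to simd16-friendly sizes further below.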
607
608 //////////////////////////////////////////////////////////////////////////
609 /// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
610 ///
611 /// vertexCount is in terms of the source simdvertexes and must be even
612 ///
613 /// attribCount will limit the vector copies to those attribs specified
614 ///
615 /// note: the stride between vertexes is determined by SWR_VTX_NUM_SLOTS
616 ///
617 void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount)
618 {
619 SWR_ASSERT(vertex);
620 SWR_ASSERT(vertex_simd16);
621 SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);
622
623 simd16vertex temp;
624
625 for (uint32_t i = 0; i < vertexCount; i += 2)
626 {
627 for (uint32_t j = 0; j < attribCount; j += 1)
628 {
629 for (uint32_t k = 0; k < 4; k += 1)
630 {
631 temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);
632
633 if ((i + 1) < vertexCount)
634 {
635 temp.attrib[j][k] = _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
636 }
637 }
638 }
639
640 for (uint32_t j = 0; j < attribCount; j += 1)
641 {
642 vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
643 }
644 }
645 }
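// Usage sketch (illustrative): packing four simdvertexes produces two
// simd16vertexes, with source vertex i landing in the low half of
// vertex_simd16[i >> 1] when i is even and in the high half when i is odd.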
646
647 #endif
648 //////////////////////////////////////////////////////////////////////////
649 /// @brief Computes number of invocations. The current index represents
650 /// the start of the SIMD. The max index represents how many work
651 /// items are remaining. If there is less than a SIMD's worth of work,
652 /// then return the remaining amount of work.
653 /// @param curIndex - The start index for the SIMD.
654 /// @param maxIndex - The last index for all work items.
655 static INLINE uint32_t GetNumInvocations(
656 uint32_t curIndex,
657 uint32_t maxIndex)
658 {
659 uint32_t remainder = (maxIndex - curIndex);
660 #if USE_SIMD16_FRONTEND
661 return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
662 #else
663 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
664 #endif
665 }
666
667 //////////////////////////////////////////////////////////////////////////
668 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
669 /// The geometry shader will loop over each active streamout buffer, assembling
670 /// primitives for the downstream stages. When multistream output is enabled,
671 /// the generated stream ID buffer from the GS needs to be converted to a cut
672 /// buffer for the primitive assembler.
673 /// @param stream - stream id to generate the cut buffer for
674 /// @param pStreamIdBase - pointer to the stream ID buffer
675 /// @param numEmittedVerts - Number of total verts emitted by the GS
676 /// @param pCutBuffer - output buffer to write cuts to
677 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
678 {
679 SWR_ASSERT(stream < MAX_SO_STREAMS);
680
681 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
682 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
683
684 for (uint32_t b = 0; b < numOutputBytes; ++b)
685 {
686 uint8_t curInputByte = pStreamIdBase[2*b];
687 uint8_t outByte = 0;
688 for (uint32_t i = 0; i < 4; ++i)
689 {
690 if ((curInputByte & 0x3) != stream)
691 {
692 outByte |= (1 << i);
693 }
694 curInputByte >>= 2;
695 }
696
697 curInputByte = pStreamIdBase[2 * b + 1];
698 for (uint32_t i = 0; i < 4; ++i)
699 {
700 if ((curInputByte & 0x3) != stream)
701 {
702 outByte |= (1 << (i + 4));
703 }
704 curInputByte >>= 2;
705 }
706
707 *pCutBuffer++ = outByte;
708 }
709 }
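// Worked example (illustrative): the stream ID buffer stores two bits per
// emitted vertex, low bits first. For stream == 1 and an input byte of
// 0b00010100 (vertex stream IDs 0, 1, 1, 0 from vertex 0 upward), the cut
// bits for those four vertices become 1, 0, 0, 1; vertices routed to other
// streams are marked as cuts so the primitive assembler skips them.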
710
711 THREAD SWR_GS_CONTEXT tlsGsContext;
712
713 template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
714 struct GsBufferInfo
715 {
716 GsBufferInfo(const SWR_GS_STATE &gsState)
717 {
718 const uint32_t vertexCount = gsState.maxNumVerts;
719 const uint32_t vertexStride = sizeof(SIMDVERTEX);
720 const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / SIMD_WIDTH;
721
722 vertexPrimitiveStride = vertexStride * numSimdBatches;
723 vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH;
724
725 if (gsState.isSingleStream)
726 {
727 cutPrimitiveStride = (vertexCount + 7) / 8;
728 cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
729
730 streamCutPrimitiveStride = 0;
731 streamCutInstanceStride = 0;
732 }
733 else
734 {
735 cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4);
736 cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
737
738 streamCutPrimitiveStride = (vertexCount + 7) / 8;
739 streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH;
740 }
741 }
742
743 uint32_t vertexPrimitiveStride;
744 uint32_t vertexInstanceStride;
745
746 uint32_t cutPrimitiveStride;
747 uint32_t cutInstanceStride;
748
749 uint32_t streamCutPrimitiveStride;
750 uint32_t streamCutInstanceStride;
751 };
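// Worked example (illustrative): for a single-stream GS with maxNumVerts == 20
// and SIMD_WIDTH == 8, numSimdBatches == 3, so vertexPrimitiveStride is
// 3 * sizeof(SIMDVERTEX), vertexInstanceStride is 24 * sizeof(SIMDVERTEX), and
// cutPrimitiveStride is (20 + 7) / 8 == 3 bytes of cut bits per input primitive.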
752
753 //////////////////////////////////////////////////////////////////////////
754 /// @brief Implements GS stage.
755 /// @param pDC - pointer to draw context.
756 /// @param workerId - thread's worker id. Each thread has a unique id.
757 /// @param pa - The primitive assembly object.
758 /// @param pGsOut - output stream for GS
759 template <
760 typename HasStreamOutT,
761 typename HasRastT>
762 static void GeometryShaderStage(
763 DRAW_CONTEXT *pDC,
764 uint32_t workerId,
765 PA_STATE& pa,
766 void* pGsOut,
767 void* pCutBuffer,
768 void* pStreamCutBuffer,
769 uint32_t* pSoPrimData,
770 #if USE_SIMD16_FRONTEND
771 uint32_t numPrims_simd8,
772 #endif
773 simdscalari primID)
774 {
775 SWR_CONTEXT *pContext = pDC->pContext;
776
777 AR_BEGIN(FEGeometryShader, pDC->drawId);
778
779 const API_STATE& state = GetApiState(pDC);
780 const SWR_GS_STATE* pState = &state.gsState;
781
782 SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
783 SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");
784
785 tlsGsContext.pStream = (uint8_t*)pGsOut;
786 tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
787 tlsGsContext.PrimitiveID = primID;
788
789 uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
790 simdvector attrib[MAX_ATTRIBUTES];
791
792 // assemble all attributes for the input primitive
793 for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
794 {
795 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
796 pa.Assemble(attribSlot, attrib);
797
798 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
799 {
800 tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
801 }
802 }
803
804 // assemble position
805 pa.Assemble(VERTEX_POSITION_SLOT, attrib);
806 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
807 {
808 tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
809 }
810
811 #if USE_SIMD16_FRONTEND
812 const GsBufferInfo<simd16vertex, KNOB_SIMD16_WIDTH> bufferInfo(state.gsState);
813 #else
814 const GsBufferInfo<simdvertex, KNOB_SIMD_WIDTH> bufferInfo(state.gsState);
815 #endif
816
817 // record valid prims from the frontend to avoid over binning the newly generated
818 // prims from the GS
819 #if USE_SIMD16_FRONTEND
820 uint32_t numInputPrims = numPrims_simd8;
821 #else
822 uint32_t numInputPrims = pa.NumPrims();
823 #endif
824
825 for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
826 {
827 tlsGsContext.InstanceID = instance;
828 tlsGsContext.mask = GenerateMask(numInputPrims);
829
830 // execute the geometry shader
831 state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
832
833 tlsGsContext.pStream += bufferInfo.vertexInstanceStride;
834 tlsGsContext.pCutOrStreamIdBuffer += bufferInfo.cutInstanceStride;
835 }
836
837 // set up new binner and state for the GS output topology
838 #if USE_SIMD16_FRONTEND
839 PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
840 if (HasRastT::value)
841 {
842 switch (pState->outputTopology)
843 {
844 case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles_simd16; break;
845 case TOP_LINE_STRIP: pfnClipFunc = ClipLines_simd16; break;
846 case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break;
847 default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
848 }
849 }
850
851 #else
852 PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
853 if (HasRastT::value)
854 {
855 switch (pState->outputTopology)
856 {
857 case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
858 case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
859 case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
860 default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
861 }
862 }
863
864 #endif
865 // foreach input prim:
866 // - setup a new PA based on the emitted verts for that prim
867 // - loop over the new verts, calling PA to assemble each prim
868 uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
869 uint32_t* pPrimitiveId = (uint32_t*)&primID;
870
871 uint32_t totalPrimsGenerated = 0;
872 for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
873 {
874 uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * bufferInfo.vertexPrimitiveStride;
875 uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * bufferInfo.cutPrimitiveStride;
876
877 for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
878 {
879 uint32_t numEmittedVerts = pVertexCount[inputPrim];
880 if (numEmittedVerts == 0)
881 {
882 continue;
883 }
884
885 uint8_t* pBase = pInstanceBase + instance * bufferInfo.vertexInstanceStride;
886 uint8_t* pCutBase = pCutBufferBase + instance * bufferInfo.cutInstanceStride;
887
888 uint32_t numAttribs = state.feNumAttributes;
889
890 for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
891 {
892 bool processCutVerts = false;
893
894 uint8_t* pCutBuffer = pCutBase;
895
896 // assign default stream ID, only relevant when GS is outputting a single stream
897 uint32_t streamID = 0;
898 if (pState->isSingleStream)
899 {
900 processCutVerts = true;
901 streamID = pState->singleStreamID;
902 if (streamID != stream) continue;
903 }
904 else
905 {
906 // early exit if this stream is not enabled for streamout
907 if (HasStreamOutT::value && !state.soState.streamEnable[stream])
908 {
909 continue;
910 }
911
912 // multi-stream output, need to translate StreamID buffer to a cut buffer
913 ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
914 pCutBuffer = (uint8_t*)pStreamCutBuffer;
915 processCutVerts = false;
916 }
917
918 #if USE_SIMD16_FRONTEND
919 PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
920
921 #else
922 PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
923
924 #endif
925 while (gsPa.GetNextStreamOutput())
926 {
927 do
928 {
929 #if USE_SIMD16_FRONTEND
930 simd16vector attrib_simd16[3];
931
932 bool assemble = gsPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib_simd16);
933
934 #else
935 bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
936
937 #endif
938 if (assemble)
939 {
940 totalPrimsGenerated += gsPa.NumPrims();
941
942 if (HasStreamOutT::value)
943 {
944 gsPa.useAlternateOffset = false;
945 StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
946 }
947
948 if (HasRastT::value && state.soState.streamToRasterizer == stream)
949 {
950 #if USE_SIMD16_FRONTEND
951 simd16scalari vPrimId;
952 // pull primitiveID from the GS output if available
953 if (state.gsState.emitsPrimitiveID)
954 {
955 simd16vector primIdAttrib[3];
956 gsPa.Assemble_simd16(VERTEX_PRIMID_SLOT, primIdAttrib);
957 vPrimId = _simd16_castps_si(primIdAttrib[state.frontendState.topologyProvokingVertex].x);
958 }
959 else
960 {
961 vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);
962 }
963
964 // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
965 simd16scalari vViewPortIdx;
966 if (state.gsState.emitsViewportArrayIndex)
967 {
968 simd16vector vpiAttrib[3];
969 gsPa.Assemble_simd16(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);
970
971 // OOB indices => forced to zero.
972 simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
973 simd16scalari vClearMask = _simd16_cmplt_epi32(_simd16_castps_si(vpiAttrib[0].x), vNumViewports);
974 vpiAttrib[0].x = _simd16_and_ps(_simd16_castsi_ps(vClearMask), vpiAttrib[0].x);
975
976 vViewPortIdx = _simd16_castps_si(vpiAttrib[0].x);
977 }
978 else
979 {
980 vViewPortIdx = _simd16_set1_epi32(0);
981 }
982
983 gsPa.useAlternateOffset = false;
984 pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
985 #else
986 simdscalari vPrimId;
987 // pull primitiveID from the GS output if available
988 if (state.gsState.emitsPrimitiveID)
989 {
990 simdvector primIdAttrib[3];
991 gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
992 vPrimId = _simd_castps_si(primIdAttrib[state.frontendState.topologyProvokingVertex].x);
993 }
994 else
995 {
996 vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
997 }
998
999 // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
1000 simdscalari vViewPortIdx;
1001 if (state.gsState.emitsViewportArrayIndex)
1002 {
1003 simdvector vpiAttrib[3];
1004 gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);
1005
1006 // OOB indices => forced to zero.
1007 simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1008 simdscalari vClearMask = _simd_cmplt_epi32(_simd_castps_si(vpiAttrib[0].x), vNumViewports);
1009 vpiAttrib[0].x = _simd_and_ps(_simd_castsi_ps(vClearMask), vpiAttrib[0].x);
1010
1011 vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
1012 }
1013 else
1014 {
1015 vViewPortIdx = _simd_set1_epi32(0);
1016 }
1017
1018 pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
1019 #endif
1020 }
1021 }
1022 } while (gsPa.NextPrim());
1023 }
1024 }
1025 }
1026 }
1027
1028 // update GS pipeline stats
1029 UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
1030 UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
1031 AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
1032 AR_END(FEGeometryShader, 1);
1033 }
1034
1035 //////////////////////////////////////////////////////////////////////////
1036 /// @brief Allocate GS buffers
1037 /// @param pDC - pointer to draw context.
1038 /// @param state - API state
1039 /// @param ppGsOut - pointer to GS output buffer allocation
1040 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
1041 template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
1042 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
1043 void **ppStreamCutBuffer)
1044 {
1045 auto pArena = pDC->pArena;
1046 SWR_ASSERT(pArena != nullptr);
1047 SWR_ASSERT(state.gsState.gsEnable);
1048
1049 // allocate arena space to hold GS output verts
1050 // @todo pack attribs
1051 // @todo support multiple streams
1052
1053 const GsBufferInfo<SIMDVERTEX, SIMD_WIDTH> bufferInfo(state.gsState);
1054
1055 const uint32_t vertexBufferSize = state.gsState.instanceCount * bufferInfo.vertexInstanceStride;
1056
1057 *ppGsOut = pArena->AllocAligned(vertexBufferSize, SIMD_WIDTH * sizeof(float));
1058
1059 // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
1060 // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
1061
1062 // allocate space for temporary per-stream cut buffer if multi-stream is enabled
1063 if (state.gsState.isSingleStream)
1064 {
1065 const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
1066
1067 *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
1068 *ppStreamCutBuffer = nullptr;
1069 }
1070 else
1071 {
1072 const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
1073 const uint32_t streamCutBufferSize = state.gsState.instanceCount * bufferInfo.streamCutInstanceStride;
1074
1075 *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
1076 *ppStreamCutBuffer = pArena->AllocAligned(streamCutBufferSize, SIMD_WIDTH * sizeof(float));
1077 }
1078 }
1079
1080 //////////////////////////////////////////////////////////////////////////
1081 /// @brief Contains all data generated by the HS and passed to the
1082 /// tessellator and DS.
1083 struct TessellationThreadLocalData
1084 {
1085 SWR_HS_CONTEXT hsContext;
1086 ScalarPatch patchData[KNOB_SIMD_WIDTH];
1087 void* pTxCtx;
1088 size_t tsCtxSize;
1089
1090 simdscalar* pDSOutput;
1091 size_t numDSOutputVectors;
1092 };
1093
1094 THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
1095
1096 //////////////////////////////////////////////////////////////////////////
1097 /// @brief Allocate tessellation data for this worker thread.
1098 INLINE
1099 static void AllocateTessellationData(SWR_CONTEXT* pContext)
1100 {
1101 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
1102 if (gt_pTessellationThreadData == nullptr)
1103 {
1104 gt_pTessellationThreadData = (TessellationThreadLocalData*)
1105 AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
1106 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
1107 }
1108 }
1109
1110 //////////////////////////////////////////////////////////////////////////
1111 /// @brief Implements Tessellation Stages.
1112 /// @param pDC - pointer to draw context.
1113 /// @param workerId - thread's worker id. Each thread has a unique id.
1114 /// @param pa - The primitive assembly object.
1115 /// @param pGsOut - output stream for GS
1116 template <
1117 typename HasGeometryShaderT,
1118 typename HasStreamOutT,
1119 typename HasRastT>
1120 static void TessellationStages(
1121 DRAW_CONTEXT *pDC,
1122 uint32_t workerId,
1123 PA_STATE& pa,
1124 void* pGsOut,
1125 void* pCutBuffer,
1126 void* pCutStreamBuffer,
1127 uint32_t* pSoPrimData,
1128 #if USE_SIMD16_FRONTEND
1129 uint32_t numPrims_simd8,
1130 #endif
1131 simdscalari primID)
1132 {
1133 SWR_CONTEXT *pContext = pDC->pContext;
1134 const API_STATE& state = GetApiState(pDC);
1135 const SWR_TS_STATE& tsState = state.tsState;
1136
1137 SWR_ASSERT(gt_pTessellationThreadData);
1138
1139 HANDLE tsCtx = TSInitCtx(
1140 tsState.domain,
1141 tsState.partitioning,
1142 tsState.tsOutputTopology,
1143 gt_pTessellationThreadData->pTxCtx,
1144 gt_pTessellationThreadData->tsCtxSize);
1145 if (tsCtx == nullptr)
1146 {
1147 gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
1148 tsCtx = TSInitCtx(
1149 tsState.domain,
1150 tsState.partitioning,
1151 tsState.tsOutputTopology,
1152 gt_pTessellationThreadData->pTxCtx,
1153 gt_pTessellationThreadData->tsCtxSize);
1154 }
1155 SWR_ASSERT(tsCtx);
1156
1157 #if USE_SIMD16_FRONTEND
1158 PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
1159 if (HasRastT::value)
1160 {
1161 switch (tsState.postDSTopology)
1162 {
1163 case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles_simd16; break;
1164 case TOP_LINE_LIST: pfnClipFunc = ClipLines_simd16; break;
1165 case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break;
1166 default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
1167 }
1168 }
1169
1170 #else
1171 PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
1172 if (HasRastT::value)
1173 {
1174 switch (tsState.postDSTopology)
1175 {
1176 case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
1177 case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
1178 case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
1179 default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
1180 }
1181 }
1182
1183 #endif
1184 SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
1185 hsContext.pCPout = gt_pTessellationThreadData->patchData;
1186 hsContext.PrimitiveID = primID;
1187
1188 uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
1189 // Max storage for one attribute for an entire simdprimitive
1190 simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];
1191
1192 // assemble all attributes for the input primitives
1193 for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
1194 {
1195 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
1196 pa.Assemble(attribSlot, simdattrib);
1197
1198 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
1199 {
1200 hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
1201 }
1202 }
1203
1204 #if defined(_DEBUG)
1205 memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
1206 #endif
1207
1208 #if USE_SIMD16_FRONTEND
1209 uint32_t numPrims = numPrims_simd8;
1210 #else
1211 uint32_t numPrims = pa.NumPrims();
1212 #endif
1213 hsContext.mask = GenerateMask(numPrims);
1214
1215 // Run the HS
1216 AR_BEGIN(FEHullShader, pDC->drawId);
1217 state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
1218 AR_END(FEHullShader, 0);
1219
1220 UPDATE_STAT_FE(HsInvocations, numPrims);
1221
1222 const uint32_t* pPrimId = (const uint32_t*)&primID;
1223
1224 for (uint32_t p = 0; p < numPrims; ++p)
1225 {
1226 // Run Tessellator
1227 SWR_TS_TESSELLATED_DATA tsData = { 0 };
1228 AR_BEGIN(FETessellation, pDC->drawId);
1229 TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
1230 AR_EVENT(TessPrimCount(1));
1231 AR_END(FETessellation, 0);
1232
1233 if (tsData.NumPrimitives == 0)
1234 {
1235 continue;
1236 }
1237 SWR_ASSERT(tsData.NumDomainPoints);
1238
1239 // Allocate DS Output memory
1240 uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
1241 size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
1242 #if USE_SIMD16_FRONTEND
1243 size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs; // simd8 -> simd16, padding
1244 #else
1245 size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
1246 #endif
1247 if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
1248 {
1249 AlignedFree(gt_pTessellationThreadData->pDSOutput);
1250 gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
1251 #if USE_SIMD16_FRONTEND
1252 gt_pTessellationThreadData->numDSOutputVectors = RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs; // simd8 -> simd16, padding
1253 #else
1254 gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
1255 #endif
1256 }
1257 SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
1258 SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);
1259
1260 #if defined(_DEBUG)
1261 memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
1262 #endif
1263
1264 // Run Domain Shader
1265 SWR_DS_CONTEXT dsContext;
1266 dsContext.PrimitiveID = pPrimId[p];
1267 dsContext.pCpIn = &hsContext.pCPout[p];
1268 dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
1269 dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
1270 dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
1271 #if USE_SIMD16_FRONTEND
1272 dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16
1273 #else
1274 dsContext.vectorStride = requiredDSVectorInvocations;
1275 #endif
1276
1277 uint32_t dsInvocations = 0;
1278
1279 for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
1280 {
1281 dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
1282
1283 AR_BEGIN(FEDomainShader, pDC->drawId);
1284 state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
1285 AR_END(FEDomainShader, 0);
1286
1287 dsInvocations += KNOB_SIMD_WIDTH;
1288 }
1289 UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
1290
1291 #if USE_SIMD16_FRONTEND
1292 SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16
1293
1294 #endif
1295 PA_TESS tessPa(
1296 pDC,
1297 #if USE_SIMD16_FRONTEND
1298 reinterpret_cast<const simd16scalar *>(dsContext.pOutputData), // simd8 -> simd16
1299 dsContext.vectorStride / 2, // simd8 -> simd16
1300 #else
1301 dsContext.pOutputData,
1302 dsContext.vectorStride,
1303 #endif
1304 tsState.numDsOutputAttribs,
1305 tsData.ppIndices,
1306 tsData.NumPrimitives,
1307 tsState.postDSTopology);
1308
1309 while (tessPa.HasWork())
1310 {
1311 #if USE_SIMD16_FRONTEND
1312 const uint32_t numPrims = tessPa.NumPrims();
1313 const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
1314 const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
1315
1316 const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
1317 const simdscalari primID_lo = _simd16_extract_si(primID, 0);
1318 const simdscalari primID_hi = _simd16_extract_si(primID, 1);
1319
1320 #endif
1321 if (HasGeometryShaderT::value)
1322 {
1323 #if USE_SIMD16_FRONTEND
1324 tessPa.useAlternateOffset = false;
1325 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_lo, primID_lo);
1326
1327 if (numPrims_hi)
1328 {
1329 tessPa.useAlternateOffset = true;
1330 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_hi, primID_hi);
1331 }
1332 #else
1333 GeometryShaderStage<HasStreamOutT, HasRastT>(
1334 pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
1335 _simd_set1_epi32(dsContext.PrimitiveID));
1336 #endif
1337 }
1338 else
1339 {
1340 if (HasStreamOutT::value)
1341 {
1342 tessPa.useAlternateOffset = false;
1343 StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
1344 }
1345
1346 if (HasRastT::value)
1347 {
1348 #if USE_SIMD16_FRONTEND
1349 simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points
1350 #else
1351 simdvector prim[3]; // Only deal with triangles, lines, or points
1352 #endif
1353 AR_BEGIN(FEPAAssemble, pDC->drawId);
1354 bool assemble =
1355 #if USE_SIMD16_FRONTEND
1356 tessPa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16);
1357 #else
1358 tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
1359 #endif
1360 AR_END(FEPAAssemble, 1);
1361 SWR_ASSERT(assemble);
1362
1363 SWR_ASSERT(pfnClipFunc);
1364 #if USE_SIMD16_FRONTEND
1365 tessPa.useAlternateOffset = false;
1366 pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID, _simd16_set1_epi32(0));
1367 #else
1368 pfnClipFunc(pDC, tessPa, workerId, prim,
1369 GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
1370 #endif
1371 }
1372 }
1373
1374 tessPa.NextPrim();
1375
1376 } // while (tessPa.HasWork())
1377 } // for (uint32_t p = 0; p < numPrims; ++p)
1378
1379 #if USE_SIMD16_FRONTEND
1380 if (gt_pTessellationThreadData->pDSOutput != nullptr)
1381 {
1382 AlignedFree(gt_pTessellationThreadData->pDSOutput);
1383 gt_pTessellationThreadData->pDSOutput = nullptr;
1384 }
1385 gt_pTessellationThreadData->numDSOutputVectors = 0;
1386
1387 #endif
1388 TSDestroyCtx(tsCtx);
1389 }
1390
1391 THREAD PA_STATE::SIMDVERTEX *pVertexStore = nullptr;
1392 THREAD uint32_t gVertexStoreSize = 0;
1393
1394 //////////////////////////////////////////////////////////////////////////
1395 /// @brief FE handler for SwrDraw.
1396 /// @tparam IsIndexedT - Is indexed drawing enabled
1397 /// @tparam HasTessellationT - Is tessellation enabled
1398 /// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
1399 /// @tparam HasStreamOutT - Is stream-out enabled
1400 /// @tparam HasRastT - Is rasterization enabled
1401 /// @param pContext - pointer to SWR context.
1402 /// @param pDC - pointer to draw context.
1403 /// @param workerId - thread's worker id.
1404 /// @param pUserData - Pointer to DRAW_WORK
1405 template <
1406 typename IsIndexedT,
1407 typename IsCutIndexEnabledT,
1408 typename HasTessellationT,
1409 typename HasGeometryShaderT,
1410 typename HasStreamOutT,
1411 typename HasRastT>
1412 void ProcessDraw(
1413 SWR_CONTEXT *pContext,
1414 DRAW_CONTEXT *pDC,
1415 uint32_t workerId,
1416 void *pUserData)
1417 {
1418
1419 #if KNOB_ENABLE_TOSS_POINTS
1420 if (KNOB_TOSS_QUEUE_FE)
1421 {
1422 return;
1423 }
1424 #endif
1425
1426 AR_BEGIN(FEProcessDraw, pDC->drawId);
1427
1428 DRAW_WORK& work = *(DRAW_WORK*)pUserData;
1429 const API_STATE& state = GetApiState(pDC);
1430
1431 uint32_t indexSize = 0;
1432 uint32_t endVertex = work.numVerts;
1433
1434 const int32_t* pLastRequestedIndex = nullptr;
1435 if (IsIndexedT::value)
1436 {
1437 switch (work.type)
1438 {
1439 case R32_UINT:
1440 indexSize = sizeof(uint32_t);
1441 pLastRequestedIndex = &(work.pIB[endVertex]);
1442 break;
1443 case R16_UINT:
1444 indexSize = sizeof(uint16_t);
1445 // nasty address offset to last index
1446 pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
1447 break;
1448 case R8_UINT:
1449 indexSize = sizeof(uint8_t);
1450 // nasty address offset to last index
1451 pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
1452 break;
1453 default:
1454 SWR_INVALID("Invalid work.type: %d", work.type);
1455 }
1456 }
1457 else
1458 {
1459 // No cuts, prune partial primitives.
1460 endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
1461 }
1462
1463 #if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
1464 uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
1465 #endif
1466
1467 void* pGsOut = nullptr;
1468 void* pCutBuffer = nullptr;
1469 void* pStreamCutBuffer = nullptr;
1470 if (HasGeometryShaderT::value)
1471 {
1472 #if USE_SIMD16_FRONTEND
1473 AllocateGsBuffers<simd16vertex, KNOB_SIMD16_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
1474 #else
1475 AllocateGsBuffers<simdvertex, KNOB_SIMD_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
1476 #endif
1477 }
1478
1479 if (HasTessellationT::value)
1480 {
1481 SWR_ASSERT(state.tsState.tsEnable == true);
1482 SWR_ASSERT(state.pfnHsFunc != nullptr);
1483 SWR_ASSERT(state.pfnDsFunc != nullptr);
1484
1485 AllocateTessellationData(pContext);
1486 }
1487 else
1488 {
1489 SWR_ASSERT(state.tsState.tsEnable == false);
1490 SWR_ASSERT(state.pfnHsFunc == nullptr);
1491 SWR_ASSERT(state.pfnDsFunc == nullptr);
1492 }
1493
1494 // allocate space for streamout input prim data
1495 uint32_t* pSoPrimData = nullptr;
1496 if (HasStreamOutT::value)
1497 {
1498 pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
1499 }
1500
1501 const uint32_t vertexCount = NumVertsPerPrim(state.topology, state.gsState.gsEnable);
1502
1503 SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);
1504
1505 // grow the vertex store for the PA as necessary
1506 if (gVertexStoreSize < vertexCount)
1507 {
1508 if (pVertexStore != nullptr)
1509 {
1510 AlignedFree(pVertexStore);
1511 }
1512
1513 while (gVertexStoreSize < vertexCount)
1514 {
1515 #if USE_SIMD16_FRONTEND
1516 gVertexStoreSize += 4; // grow in chunks of 4 simd16vertex
1517 #else
1518 gVertexStoreSize += 8; // grow in chunks of 8 simdvertex
1519 #endif
1520 }
1521
1522 SWR_ASSERT(gVertexStoreSize <= MAX_NUM_VERTS_PER_PRIM);
1523
1524 pVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(gVertexStoreSize * sizeof(pVertexStore[0]), 64));
1525
1526 SWR_ASSERT(pVertexStore != nullptr);
1527 }
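// Illustrative example: with adjacency a primitive can need 6 vertices, so a
// store grown in chunks of 4 (simd16 path) ends up with 8 entries; the
// THREAD-local allocation above persists and is reused by later draws on this worker.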
1528
1529 // choose primitive assembler
1530 PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, pVertexStore, gVertexStoreSize);
1531 PA_STATE& pa = paFactory.GetPA();
1532
1533 #if USE_SIMD16_FRONTEND
1534 simdvertex vin_lo;
1535 simdvertex vin_hi;
1536 SWR_VS_CONTEXT vsContext_lo;
1537 SWR_VS_CONTEXT vsContext_hi;
1538
1539 vsContext_lo.pVin = &vin_lo;
1540 vsContext_hi.pVin = &vin_hi;
1541 vsContext_lo.AlternateOffset = 0;
1542 vsContext_hi.AlternateOffset = 1;
1543
1544 SWR_FETCH_CONTEXT fetchInfo_lo = { 0 };
1545
1546 fetchInfo_lo.pStreams = &state.vertexBuffers[0];
1547 fetchInfo_lo.StartInstance = work.startInstance;
1548 fetchInfo_lo.StartVertex = 0;
1549
1550 if (IsIndexedT::value)
1551 {
1552 fetchInfo_lo.BaseVertex = work.baseVertex;
1553
1554 // if the entire index buffer isn't being consumed, set the last index
1555 // so that fetches of less than a full SIMD width will be masked off
1556 fetchInfo_lo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
1557 if (pLastRequestedIndex < fetchInfo_lo.pLastIndex)
1558 {
1559 fetchInfo_lo.pLastIndex = pLastRequestedIndex;
1560 }
1561 }
1562 else
1563 {
1564 fetchInfo_lo.StartVertex = work.startVertex;
1565 }
1566
1567 SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;
1568
1569 const simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1570
1571 for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1572 {
1573 uint32_t i = 0;
1574
1575 simd16scalari vIndex;
1576
1577 if (IsIndexedT::value)
1578 {
1579 fetchInfo_lo.pIndices = work.pIB;
1580 fetchInfo_hi.pIndices = (int32_t *)((uint8_t *)fetchInfo_lo.pIndices + KNOB_SIMD_WIDTH * indexSize); // 1/2 of KNOB_SIMD16_WIDTH
1581 }
1582 else
1583 {
1584 vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);
1585
1586 fetchInfo_lo.pIndices = (const int32_t *)&vIndex;
1587 fetchInfo_hi.pIndices = (const int32_t *)&vIndex + KNOB_SIMD_WIDTH; // 1/2 of KNOB_SIMD16_WIDTH
1588 }
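// Note (illustrative): the simd16 frontend fetches two simd8 halves per
// iteration; fetchInfo_lo covers lanes 0..7 and fetchInfo_hi covers lanes
// 8..15, whether the indices come from the index buffer or from the
// sequential vIndex vector built above.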
1589
1590 fetchInfo_lo.CurInstance = instanceNum;
1591 fetchInfo_hi.CurInstance = instanceNum;
1592
1593 vsContext_lo.InstanceID = instanceNum;
1594 vsContext_hi.InstanceID = instanceNum;
1595
1596 while (pa.HasWork())
1597 {
1598 // GetNextVsOutput currently has the side effect of updating some PA state machine state.
1599 // So we need to keep this outside of (i < endVertex) check.
1600
1601 simdmask *pvCutIndices_lo = nullptr;
1602 simdmask *pvCutIndices_hi = nullptr;
1603
1604 if (IsIndexedT::value)
1605 {
1606 // simd16mask <=> simdmask[2]
1607
1608 pvCutIndices_lo = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[0];
1609 pvCutIndices_hi = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[1];
1610 }
1611
1612 simd16vertex &vout = pa.GetNextVsOutput();
1613
1614 vsContext_lo.pVout = reinterpret_cast<simdvertex *>(&vout);
1615 vsContext_hi.pVout = reinterpret_cast<simdvertex *>(&vout);
1616
1617 if (i < endVertex)
1618 {
1619 // 1. Execute FS/VS for a single SIMD.
1620 AR_BEGIN(FEFetchShader, pDC->drawId);
1621 state.pfnFetchFunc(fetchInfo_lo, vin_lo);
1622
1623 if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
1624 {
1625 state.pfnFetchFunc(fetchInfo_hi, vin_hi);
1626 }
1627 AR_END(FEFetchShader, 0);
1628
1629 // forward fetch generated vertex IDs to the vertex shader
1630 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
1631 vsContext_hi.VertexID = fetchInfo_hi.VertexID;
1632
1633 // Setup active mask for vertex shader.
1634 vsContext_lo.mask = GenerateMask(endVertex - i);
1635 vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
1636
1637 // forward cut mask to the PA
1638 if (IsIndexedT::value)
1639 {
1640 *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
1641 *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
1642 }
1643
1644 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
1645
1646 #if KNOB_ENABLE_TOSS_POINTS
1647 if (!KNOB_TOSS_FETCH)
1648 #endif
1649 {
1650 AR_BEGIN(FEVertexShader, pDC->drawId);
1651 state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);
1652
1653 if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
1654 {
1655 state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi);
1656 }
1657 AR_END(FEVertexShader, 0);
1658
1659 UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
1660 }
1661 }
1662
1663             // 2. Assemble primitives from the last two SIMDs of vertices held by the PA.
1664 do
1665 {
1666 simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];
1667
1668 RDTSC_START(FEPAAssemble);
1669 bool assemble = pa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16);
1670 RDTSC_STOP(FEPAAssemble, 1, 0);
1671
1672 #if KNOB_ENABLE_TOSS_POINTS
1673 if (!KNOB_TOSS_FETCH)
1674 #endif
1675 {
1676 #if KNOB_ENABLE_TOSS_POINTS
1677 if (!KNOB_TOSS_VS)
1678 #endif
1679 {
1680 if (assemble)
1681 {
1682 UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
1683
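                                          // split the assembled prims between the two SIMD8 halves: lo takes up
                                          // to KNOB_SIMD_WIDTH prims, hi takes the remainder (zero if none left)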
1684 const uint32_t numPrims = pa.NumPrims();
1685 const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
1686 const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
1687
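                                          // the PA produces a simd16 primitive ID vector; extract the simd8
                                          // halves consumed by the lo/hi passes below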
1688 const simd16scalari primID = pa.GetPrimID(work.startPrimID);
1689 const simdscalari primID_lo = _simd16_extract_si(primID, 0);
1690 const simdscalari primID_hi = _simd16_extract_si(primID, 1);
1691
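                                          // Downstream stages run at SIMD8 width; pa.useAlternateOffset selects
                                          // whether they read the low or high half of the simd16 PA output.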
1692 if (HasTessellationT::value)
1693 {
1694 pa.useAlternateOffset = false;
1695 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);
1696
1697 if (numPrims_hi)
1698 {
1699 pa.useAlternateOffset = true;
1700 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
1701 }
1702 }
1703 else if (HasGeometryShaderT::value)
1704 {
1705 pa.useAlternateOffset = false;
1706 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);
1707
1708 if (numPrims_hi)
1709 {
1710 pa.useAlternateOffset = true;
1711 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
1712 }
1713 }
1714 else
1715 {
1716 // If streamout is enabled then stream vertices out to memory.
1717 if (HasStreamOutT::value)
1718 {
1719 pa.useAlternateOffset = false;
1720 StreamOut(pDC, pa, workerId, pSoPrimData, 0);
1721 }
1722
1723 if (HasRastT::value)
1724 {
1725 SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
1726
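                                                  // hand all assembled prims to pfnProcessPrims_simd16 in a single
                                                  // simd16 call; GenMask(numPrims) marks the valid prim lanes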
1727 pa.useAlternateOffset = false;
1728 pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID, _simd16_setzero_si());
1729 }
1730 }
1731 }
1732 }
1733 }
1734 } while (pa.NextPrim());
1735
1736 if (IsIndexedT::value)
1737 {
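                      // both halves consumed a full SIMD16 of indices; advance the lo and hi index
                      // pointers by one SIMD16 stride (hi stays half a SIMD16 ahead of lo)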
1738 fetchInfo_lo.pIndices = (int32_t *)((uint8_t*)fetchInfo_lo.pIndices + KNOB_SIMD16_WIDTH * indexSize);
1739 fetchInfo_hi.pIndices = (int32_t *)((uint8_t*)fetchInfo_hi.pIndices + KNOB_SIMD16_WIDTH * indexSize);
1740 }
1741 else
1742 {
1743 vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
1744 }
1745
1746 i += KNOB_SIMD16_WIDTH;
1747 }
1748
1749 pa.Reset();
1750 }
1751
1752 #else
1753 simdvertex vin;
1754 SWR_VS_CONTEXT vsContext;
1755
1756 vsContext.pVin = &vin;
1757
1758 SWR_FETCH_CONTEXT fetchInfo = { 0 };
1759
1760 fetchInfo.pStreams = &state.vertexBuffers[0];
1761 fetchInfo.StartInstance = work.startInstance;
1762 fetchInfo.StartVertex = 0;
1763
1764 if (IsIndexedT::value)
1765 {
1766 fetchInfo.BaseVertex = work.baseVertex;
1767
1768 // if the entire index buffer isn't being consumed, set the last index
1769             // so that fetches of less than a full SIMD width are masked off
1770 fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
1771 if (pLastRequestedIndex < fetchInfo.pLastIndex)
1772 {
1773 fetchInfo.pLastIndex = pLastRequestedIndex;
1774 }
1775 }
1776 else
1777 {
1778 fetchInfo.StartVertex = work.startVertex;
1779 }
1780
1781 const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1782
1783         /// @todo: temporarily move the instance loop into the FE to ensure SO ordering
1784 for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1785 {
1786 simdscalari vIndex;
1787 uint32_t i = 0;
1788
1789 if (IsIndexedT::value)
1790 {
1791 fetchInfo.pIndices = work.pIB;
1792 }
1793 else
1794 {
1795 vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
1796 fetchInfo.pIndices = (const int32_t*)&vIndex;
1797 }
1798
1799 fetchInfo.CurInstance = instanceNum;
1800 vsContext.InstanceID = instanceNum;
1801
1802 while (pa.HasWork())
1803 {
1804 // GetNextVsOutput currently has the side effect of updating some PA state machine state.
1805             // So we need to keep this call outside of the (i < endVertex) check.
1806 simdmask* pvCutIndices = nullptr;
1807 if (IsIndexedT::value)
1808 {
1809 pvCutIndices = &pa.GetNextVsIndices();
1810 }
1811
1812 simdvertex& vout = pa.GetNextVsOutput();
1813 vsContext.pVout = &vout;
1814
1815 if (i < endVertex)
1816 {
1817
1818                     // 1. Execute the fetch shader (FS) and vertex shader (VS) for a single SIMD.
1819 AR_BEGIN(FEFetchShader, pDC->drawId);
1820 state.pfnFetchFunc(fetchInfo, vin);
1821 AR_END(FEFetchShader, 0);
1822
1823 // forward fetch generated vertex IDs to the vertex shader
1824 vsContext.VertexID = fetchInfo.VertexID;
1825
1826 // Setup active mask for vertex shader.
1827 vsContext.mask = GenerateMask(endVertex - i);
1828
1829 // forward cut mask to the PA
1830 if (IsIndexedT::value)
1831 {
1832 *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
1833 }
1834
1835 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
1836
1837 #if KNOB_ENABLE_TOSS_POINTS
1838 if (!KNOB_TOSS_FETCH)
1839 #endif
1840 {
1841 AR_BEGIN(FEVertexShader, pDC->drawId);
1842 state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
1843 AR_END(FEVertexShader, 0);
1844
1845 UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
1846 }
1847 }
1848
1849                 // 2. Assemble primitives from the last two SIMDs of vertices held by the PA.
1850 do
1851 {
1852 simdvector prim[MAX_NUM_VERTS_PER_PRIM];
1853                 // pa.Assemble returns false if there are not enough vertices to assemble a primitive.
1854 AR_BEGIN(FEPAAssemble, pDC->drawId);
1855 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
1856 AR_END(FEPAAssemble, 1);
1857
1858 #if KNOB_ENABLE_TOSS_POINTS
1859 if (!KNOB_TOSS_FETCH)
1860 #endif
1861 {
1862 #if KNOB_ENABLE_TOSS_POINTS
1863 if (!KNOB_TOSS_VS)
1864 #endif
1865 {
1866 if (assemble)
1867 {
1868 UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
1869
1870 if (HasTessellationT::value)
1871 {
1872 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
1873 pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
1874 }
1875 else if (HasGeometryShaderT::value)
1876 {
1877 GeometryShaderStage<HasStreamOutT, HasRastT>(
1878 pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
1879 }
1880 else
1881 {
1882 // If streamout is enabled then stream vertices out to memory.
1883 if (HasStreamOutT::value)
1884 {
1885 StreamOut(pDC, pa, workerId, pSoPrimData, 0);
1886 }
1887
1888 if (HasRastT::value)
1889 {
1890 SWR_ASSERT(pDC->pState->pfnProcessPrims);
1891
1892 pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
1893 GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
1894 }
1895 }
1896 }
1897 }
1898 }
1899 } while (pa.NextPrim());
1900
1901 if (IsIndexedT::value)
1902 {
1903                 fetchInfo.pIndices = (int32_t*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
1904 }
1905 else
1906 {
1907 vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
1908 }
1909
1910 i += KNOB_SIMD_WIDTH;
1911 }
1912 pa.Reset();
1913 }
1914
1915 #endif
1916
1917 AR_END(FEProcessDraw, numPrims * work.numInstances);
1918 }
1919
1920 struct FEDrawChooser
1921 {
1922 typedef PFN_FE_WORK_FUNC FuncType;
1923
1924 template <typename... ArgsB>
1925 static FuncType GetFunc()
1926 {
1927 return ProcessDraw<ArgsB...>;
1928 }
1929 };
1930
1931
1932 // Selector for correct templated Draw front-end function
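     // TemplateArgUnroller turns the runtime bools into the matching ProcessDraw template
     // arguments, so each draw dispatches to a fully specialized front-end instantiation.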
1933 PFN_FE_WORK_FUNC GetProcessDrawFunc(
1934 bool IsIndexed,
1935 bool IsCutIndexEnabled,
1936 bool HasTessellation,
1937 bool HasGeometryShader,
1938 bool HasStreamOut,
1939 bool HasRasterization)
1940 {
1941 return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
1942 }