swr/rast: SIMD16 FE - interleaved simdvertex output in GS

[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.cpp
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp

index ef90a24fe75d21af7ee1b9b1804f05afe5014e6d..e88246f478fa27334dc2271090f0f72f73e2593e 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -32,7 +32,6 @@
  #include "backend.h"
  #include "context.h"
  #include "rdtsc_core.h"
-#include "rasterizer.h"
  #include "utils.h"
  #include "threads.h"
  #include "pa.h"
@@ -49,15 +48,6 @@ static INLINE uint32_t GenMask(uint32_t numBits)
      return ((1U << numBits) - 1);
  }
  
-//////////////////////////////////////////////////////////////////////////
-/// @brief Offsets added to post-viewport vertex positions based on
-/// raster state.
-static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
-{
-    _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
-    _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
-};
-
  //////////////////////////////////////////////////////////////////////////
  /// @brief FE handler for SwrSync.
  /// @param pContext - pointer to SWR context.
@@ -71,37 +61,42 @@ void ProcessSync(
      uint32_t workerId,
      void *pUserData)
  {
-    SYNC_DESC *pSync = (SYNC_DESC*)pUserData;
      BE_WORK work;
      work.type = SYNC;
      work.pfnWork = ProcessSyncBE;
-    work.desc.sync = *pSync;
  
      MacroTileMgr *pTileMgr = pDC->pTileMgr;
      pTileMgr->enqueue(0, 0, &work);
  }
  
  //////////////////////////////////////////////////////////////////////////
-/// @brief FE handler for SwrGetStats.
+/// @brief FE handler for SwrDestroyContext.
  /// @param pContext - pointer to SWR context.
  /// @param pDC - pointer to draw context.
  /// @param workerId - thread's worker id. Even thread has a unique id.
-/// @param pUserData - Pointer to user data passed back to stats callback.
-/// @todo This should go away when we switch this to use compute threading.
-void ProcessQueryStats(
+/// @param pUserData - Pointer to user data passed back to sync callback.
+void ProcessShutdown(
      SWR_CONTEXT *pContext,
      DRAW_CONTEXT *pDC,
      uint32_t workerId,
      void *pUserData)
  {
-    QUERY_DESC *pQueryStats = (QUERY_DESC*)pUserData;
      BE_WORK work;
-    work.type = QUERYSTATS;
-    work.pfnWork = ProcessQueryStatsBE;
-    work.desc.queryStats = *pQueryStats;
+    work.type = SHUTDOWN;
+    work.pfnWork = ProcessShutdownBE;
  
      MacroTileMgr *pTileMgr = pDC->pTileMgr;
-    pTileMgr->enqueue(0, 0, &work);
+    // Enqueue at least 1 work item for each worker thread
+    // account for number of numa nodes
+    uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;
+
+    for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
+    {
+        for (uint32_t n = 0; n < numNumaNodes; ++n)
+        {
+            pTileMgr->enqueue(i, n, &work);
+        }
+    }
  }
  
  //////////////////////////////////////////////////////////////////////////
@@ -117,26 +112,24 @@ void ProcessClear(
      uint32_t workerId,
      void *pUserData)
  {
-    CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
+    CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
      MacroTileMgr *pTileMgr = pDC->pTileMgr;
  
-    const API_STATE& state = GetApiState(pDC);
-
      // queue a clear to each macro tile
-    // compute macro tile bounds for the current scissor/viewport
-    uint32_t macroTileLeft = state.scissorInFixedPoint.left / KNOB_MACROTILE_X_DIM_FIXED;
-    uint32_t macroTileRight = state.scissorInFixedPoint.right / KNOB_MACROTILE_X_DIM_FIXED;
-    uint32_t macroTileTop = state.scissorInFixedPoint.top / KNOB_MACROTILE_Y_DIM_FIXED;
-    uint32_t macroTileBottom = state.scissorInFixedPoint.bottom / KNOB_MACROTILE_Y_DIM_FIXED;
+    // compute macro tile bounds for the specified rect
+    uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
+    uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
+    uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
+    uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
  
      BE_WORK work;
      work.type = CLEAR;
      work.pfnWork = ProcessClearBE;
-    work.desc.clear = *pClear;
+    work.desc.clear = *pDesc;
  
-    for (uint32_t y = macroTileTop; y <= macroTileBottom; ++y)
+    for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
      {
-        for (uint32_t x = macroTileLeft; x <= macroTileRight; ++x)
+        for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
          {
              pTileMgr->enqueue(x, y, &work);
          }
@@ -156,35 +149,32 @@ void ProcessStoreTiles(
      uint32_t workerId,
      void *pUserData)
  {
-    RDTSC_START(FEProcessStoreTiles);
-    STORE_TILES_DESC *pStore = (STORE_TILES_DESC*)pUserData;
+    AR_BEGIN(FEProcessStoreTiles, pDC->drawId);
      MacroTileMgr *pTileMgr = pDC->pTileMgr;
-
-    const API_STATE& state = GetApiState(pDC);
+    STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
  
      // queue a store to each macro tile
-    // compute macro tile bounds for the current render target
-    const uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
-    const uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
-
-    uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth;
-    uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight;
+    // compute macro tile bounds for the specified rect
+    uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
+    uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
+    uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
+    uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
  
      // store tiles
      BE_WORK work;
      work.type = STORETILES;
-    work.pfnWork = ProcessStoreTileBE;
-    work.desc.storeTiles = *pStore;
+    work.pfnWork = ProcessStoreTilesBE;
+    work.desc.storeTiles = *pDesc;
  
-    for (uint32_t x = 0; x < numMacroTilesX; ++x)
+    for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
      {
-        for (uint32_t y = 0; y < numMacroTilesY; ++y)
+        for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
          {
              pTileMgr->enqueue(x, y, &work);
          }
      }
  
-    RDTSC_STOP(FEProcessStoreTiles, 0, pDC->drawId);
+    AR_END(FEProcessStoreTiles, 0);
  }
  
  //////////////////////////////////////////////////////////////////////////
@@ -200,71 +190,46 @@ void ProcessDiscardInvalidateTiles(
      uint32_t workerId,
      void *pUserData)
  {
-    RDTSC_START(FEProcessInvalidateTiles);
-    DISCARD_INVALIDATE_TILES_DESC *pInv = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
+    AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
+    DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
      MacroTileMgr *pTileMgr = pDC->pTileMgr;
  
-    SWR_RECT rect;
-
-    if (pInv->rect.top | pInv->rect.bottom | pInv->rect.right | pInv->rect.left)
-    {
-        // Valid rect
-        rect = pInv->rect;
-    }
-    else
-    {
-        // Use viewport dimensions
-        const API_STATE& state = GetApiState(pDC);
-
-        rect.left   = (uint32_t)state.vp[0].x;
-        rect.right  = (uint32_t)(state.vp[0].x + state.vp[0].width);
-        rect.top    = (uint32_t)state.vp[0].y;
-        rect.bottom = (uint32_t)(state.vp[0].y + state.vp[0].height);
-    }
-
-    // queue a store to each macro tile
-    // compute macro tile bounds for the current render target
-    uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
-    uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
-
-    // Setup region assuming full tiles
-    uint32_t macroTileStartX = (rect.left + (macroWidth - 1)) / macroWidth;
-    uint32_t macroTileStartY = (rect.top + (macroHeight - 1)) / macroHeight;
-
-    uint32_t macroTileEndX = rect.right / macroWidth;
-    uint32_t macroTileEndY = rect.bottom / macroHeight;
+    // compute macro tile bounds for the specified rect
+    uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
+    uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
+    uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
+    uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;
  
-    if (pInv->fullTilesOnly == false)
+    if (pDesc->fullTilesOnly == false)
      {
          // include partial tiles
-        macroTileStartX = rect.left / macroWidth;
-        macroTileStartY = rect.top / macroHeight;
-
-        macroTileEndX = (rect.right + macroWidth - 1) / macroWidth;
-        macroTileEndY = (rect.bottom + macroHeight - 1) / macroHeight;
+        macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
+        macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
+        macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
+        macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
      }
  
-    SWR_ASSERT(macroTileEndX <= KNOB_NUM_HOT_TILES_X);
-    SWR_ASSERT(macroTileEndY <= KNOB_NUM_HOT_TILES_Y);
+    SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
+    SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);
  
-    macroTileEndX = std::min<uint32_t>(macroTileEndX, KNOB_NUM_HOT_TILES_X);
-    macroTileEndY = std::min<uint32_t>(macroTileEndY, KNOB_NUM_HOT_TILES_Y);
+    macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
+    macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);
  
      // load tiles
      BE_WORK work;
      work.type = DISCARDINVALIDATETILES;
      work.pfnWork = ProcessDiscardInvalidateTilesBE;
-    work.desc.discardInvalidateTiles = *pInv;
+    work.desc.discardInvalidateTiles = *pDesc;
  
-    for (uint32_t x = macroTileStartX; x < macroTileEndX; ++x)
+    for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
      {
-        for (uint32_t y = macroTileStartY; y < macroTileEndY; ++y)
+        for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
          {
              pTileMgr->enqueue(x, y, &work);
          }
      }
  
-    RDTSC_STOP(FEProcessInvalidateTiles, 0, pDC->drawId);
+    AR_END(FEProcessInvalidateTiles, 0);
  }
  
  //////////////////////////////////////////////////////////////////////////
@@ -337,7 +302,7 @@ uint32_t GetNumPrims(
      case TOP_TRI_STRIP_REVERSE:
      case TOP_PATCHLIST_BASE:
      case TOP_UNKNOWN:
-        SWR_ASSERT(false, "Unsupported topology: %d", mode);
+        SWR_INVALID("Unsupported topology: %d", mode);
          return 0;
      }
  
@@ -413,7 +378,7 @@ uint32_t GetNumVerts(
      case TOP_TRI_STRIP_REVERSE:
      case TOP_PATCHLIST_BASE:
      case TOP_UNKNOWN:
-        SWR_ASSERT(false, "Unsupported topology: %d", mode);
+        SWR_INVALID("Unsupported topology: %d", mode);
          return 0;
      }
  
@@ -490,7 +455,7 @@ INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVert
          numVerts = topology - TOP_PATCHLIST_BASE;
          break;
      default:
-        SWR_ASSERT(false, "Unsupported topology: %d", topology);
+        SWR_INVALID("Unsupported topology: %d", topology);
          break;
      }
  
@@ -530,11 +495,14 @@ static void StreamOut(
      PA_STATE& pa,
      uint32_t workerId,
      uint32_t* pPrimData,
+#if USE_SIMD16_FRONTEND
+    uint32_t numPrims_simd8,
+#endif
      uint32_t streamIndex)
  {
-    RDTSC_START(FEStreamout);
+    SWR_CONTEXT *pContext = pDC->pContext;
  
-    SWR_CONTEXT* pContext = pDC->pContext;
+    AR_BEGIN(FEStreamout, pDC->drawId);
  
      const API_STATE& state = GetApiState(pDC);
      const SWR_STREAMOUT_STATE &soState = state.soState;
@@ -542,7 +510,7 @@ static void StreamOut(
      uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
  
      // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
-    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);
+    uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);
  
      SWR_STREAMOUT_CONTEXT soContext = { 0 };
  
@@ -552,7 +520,12 @@ static void StreamOut(
          soContext.pBuffer[i] = &state.soBuffer[i];
      }
  
+#if USE_SIMD16_FRONTEND
+    uint32_t numPrims = numPrims_simd8;
+#else
      uint32_t numPrims = pa.NumPrims();
+#endif
+
      for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
      {
          DWORD slot = 0;
@@ -578,6 +551,7 @@ static void StreamOut(
  
                  _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
              }
+
              soMask &= ~(1 << slot);
          }
  
@@ -595,23 +569,93 @@ static void StreamOut(
          if (state.soBuffer[i].pWriteOffset)
          {
              *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
+        }
  
-            // The SOS increments the existing write offset. So we don't want to increment
-            // the SoWriteOffset stat using an absolute offset instead of relative.
-            SET_STAT(SoWriteOffset[i], soContext.pBuffer[i]->streamOffset);
+        if (state.soBuffer[i].soWriteEnable)
+        {
+            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
+            pDC->dynState.SoWriteOffsetDirty[i] = true;
          }
      }
  
-    UPDATE_STAT(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
-    UPDATE_STAT(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
+    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
+    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
+
+    AR_END(FEStreamout, 1);
+}
+
+#if USE_SIMD16_FRONTEND
+//////////////////////////////////////////////////////////////////////////
+/// Is value an even number (a multiple of two)
+///
+template <typename T>
+INLINE static bool IsEven(T value)
+{
+    return (value & 1) == 0;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// Round up value to an even number (a multiple of two)
+///
+template <typename T>
+INLINE static T RoundUpEven(T value)
+{
+    return (value + 1) & ~1;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// Round down value to an even number (a multiple of two)
+///
+template <typename T>
+INLINE static T RoundDownEven(T value)
+{
+    return value & ~1;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
+///
+/// vertexCount is in terms of the source simdvertexes and must be even
+///
+/// attribCount will limit the vector copies to those attribs specified
+///
+/// note: the stride between vertexes is determinded by SWR_VTX_NUM_SLOTS
+///
+void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount)
+{
+    SWR_ASSERT(vertex);
+    SWR_ASSERT(vertex_simd16);
+    SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);
+
+    simd16vertex temp;
+
+    for (uint32_t i = 0; i < vertexCount; i += 2)
+    {
+        for (uint32_t j = 0; j < attribCount; j += 1)
+        {
+            for (uint32_t k = 0; k < 4; k += 1)
+            {
+                temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);
+
+                if ((i + 1) < vertexCount)
+                {
+                    temp.attrib[j][k] = _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
+                }
+            }
+        }
  
-    RDTSC_STOP(FEStreamout, 1, 0);
+        for (uint32_t j = 0; j < attribCount; j += 1)
+        {
+            vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
+        }
+    }
  }
  
+#endif
  //////////////////////////////////////////////////////////////////////////
  /// @brief Computes number of invocations. The current index represents
  ///        the start of the SIMD. The max index represents how much work
-///        items are remaining. If there is less then a SIMD's left of work
+///        items are remaining. If there is less then a SIMD's xmin of work
  ///        then return the remaining amount of work.
  /// @param curIndex - The start index for the SIMD.
  /// @param maxIndex - The last index for all work items.
@@ -620,7 +664,11 @@ static INLINE uint32_t GetNumInvocations(
      uint32_t maxIndex)
  {
      uint32_t remainder = (maxIndex - curIndex);
+#if USE_SIMD16_FRONTEND
+    return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
+#else
      return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
+#endif
  }
  
  //////////////////////////////////////////////////////////////////////////
@@ -662,13 +710,53 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t num
              }
              curInputByte >>= 2;
          }
-        
+
          *pCutBuffer++ = outByte;
      }
  }
  
  THREAD SWR_GS_CONTEXT tlsGsContext;
  
+template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
+struct GsBufferInfo
+{
+    GsBufferInfo(const SWR_GS_STATE &gsState)
+    {
+        const uint32_t vertexCount = gsState.maxNumVerts;
+        const uint32_t vertexStride = sizeof(SIMDVERTEX);
+        const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / SIMD_WIDTH;
+
+        vertexPrimitiveStride = vertexStride * numSimdBatches;
+        vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH;
+
+        if (gsState.isSingleStream)
+        {
+            cutPrimitiveStride = (vertexCount + 7) / 8;
+            cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
+
+            streamCutPrimitiveStride = 0;
+            streamCutInstanceStride = 0;
+        }
+        else
+        {
+            cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4);
+            cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
+
+            streamCutPrimitiveStride = (vertexCount + 7) / 8;
+            streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH;
+        }
+    }
+
+    uint32_t vertexPrimitiveStride;
+    uint32_t vertexInstanceStride;
+
+    uint32_t cutPrimitiveStride;
+    uint32_t cutInstanceStride;
+
+    uint32_t streamCutPrimitiveStride;
+    uint32_t streamCutInstanceStride;
+};
+
  //////////////////////////////////////////////////////////////////////////
  /// @brief Implements GS stage.
  /// @param pDC - pointer to draw context.
@@ -686,11 +774,14 @@ static void GeometryShaderStage(
      void* pCutBuffer,
      void* pStreamCutBuffer,
      uint32_t* pSoPrimData,
+#if USE_SIMD16_FRONTEND
+    uint32_t numPrims_simd8,
+#endif
      simdscalari primID)
  {
-    RDTSC_START(FEGeometryShader);
+    SWR_CONTEXT *pContext = pDC->pContext;
  
-    SWR_CONTEXT* pContext = pDC->pContext;
+    AR_BEGIN(FEGeometryShader, pDC->drawId);
  
      const API_STATE& state = GetApiState(pDC);
      const SWR_GS_STATE* pState = &state.gsState;
@@ -716,7 +807,7 @@ static void GeometryShaderStage(
              tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
          }
      }
-    
+
      // assemble position
      pa.Assemble(VERTEX_POSITION_SLOT, attrib);
      for (uint32_t i = 0; i < numVertsPerPrim; ++i)
@@ -724,27 +815,19 @@ static void GeometryShaderStage(
          tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
      }
  
-    const uint32_t vertexStride = sizeof(simdvertex);
-    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
-    const uint32_t inputPrimStride = numSimdBatches * vertexStride;
-    const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
-    uint32_t cutPrimStride;
-    uint32_t cutInstanceStride;
-
-    if (pState->isSingleStream)
-    {
-        cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
-        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
-    }
-    else
-    {
-        cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
-        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
-    }
+#if USE_SIMD16_FRONTEND
+    const GsBufferInfo<simd16vertex, KNOB_SIMD16_WIDTH> bufferInfo(state.gsState);
+#else
+    const GsBufferInfo<simdvertex, KNOB_SIMD_WIDTH> bufferInfo(state.gsState);
+#endif
  
      // record valid prims from the frontend to avoid over binning the newly generated
      // prims from the GS
+#if USE_SIMD16_FRONTEND
+    uint32_t numInputPrims = numPrims_simd8;
+#else
      uint32_t numInputPrims = pa.NumPrims();
+#endif
  
      for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
      {
@@ -754,11 +837,25 @@ static void GeometryShaderStage(
          // execute the geometry shader
          state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
  
-        tlsGsContext.pStream += instanceStride;
-        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
+        tlsGsContext.pStream += bufferInfo.vertexInstanceStride;
+        tlsGsContext.pCutOrStreamIdBuffer += bufferInfo.cutInstanceStride;
      }
  
      // set up new binner and state for the GS output topology
+#if USE_SIMD16_FRONTEND
+    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
+    if (HasRastT::value)
+    {
+        switch (pState->outputTopology)
+        {
+        case TOP_TRIANGLE_STRIP:    pfnClipFunc = ClipTriangles_simd16; break;
+        case TOP_LINE_STRIP:        pfnClipFunc = ClipLines_simd16; break;
+        case TOP_POINT_LIST:        pfnClipFunc = ClipPoints_simd16; break;
+        default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
+        }
+    }
+
+#else
      PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
      if (HasRastT::value)
      {
@@ -767,10 +864,11 @@ static void GeometryShaderStage(
          case TOP_TRIANGLE_STRIP:    pfnClipFunc = ClipTriangles; break;
          case TOP_LINE_STRIP:        pfnClipFunc = ClipLines; break;
          case TOP_POINT_LIST:        pfnClipFunc = ClipPoints; break;
-        default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
+        default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
          }
      }
  
+#endif
      // foreach input prim:
      // - setup a new PA based on the emitted verts for that prim
      // - loop over the new verts, calling PA to assemble each prim
@@ -780,8 +878,9 @@ static void GeometryShaderStage(
      uint32_t totalPrimsGenerated = 0;
      for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
      {
-        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
-        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
+        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * bufferInfo.vertexPrimitiveStride;
+        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * bufferInfo.cutPrimitiveStride;
+
          for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
          {
              uint32_t numEmittedVerts = pVertexCount[inputPrim];
@@ -790,18 +889,10 @@ static void GeometryShaderStage(
                  continue;
              }
  
-            uint8_t* pBase = pInstanceBase + instance * instanceStride;
-            uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;
-            
-            DWORD numAttribs;
-            if (_BitScanReverse(&numAttribs, state.feAttribMask))
-            {
-                numAttribs++;
-            }
-            else
-            {
-                numAttribs = 0;
-            }
+            uint8_t* pBase = pInstanceBase + instance * bufferInfo.vertexInstanceStride;
+            uint8_t* pCutBase = pCutBufferBase + instance * bufferInfo.cutInstanceStride;
+
+            uint32_t numAttribs = state.feNumAttributes;
  
              for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
              {
@@ -831,39 +922,122 @@ static void GeometryShaderStage(
                      processCutVerts = false;
                  }
  
+#if USE_SIMD16_FRONTEND
+                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
+
+#else
                  PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
  
+#endif
                  while (gsPa.GetNextStreamOutput())
                  {
                      do
                      {
+#if USE_SIMD16_FRONTEND
+                        simd16vector attrib_simd16[3];
+
+                        bool assemble = gsPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib_simd16);
+
+#else
                          bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
  
+#endif
                          if (assemble)
                          {
                              totalPrimsGenerated += gsPa.NumPrims();
  
                              if (HasStreamOutT::value)
                              {
+#if USE_SIMD16_FRONTEND
+                                const uint32_t numPrims = gsPa.NumPrims();
+                                const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
+                                const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
+
+                                gsPa.useAlternateOffset = false;
+                                StreamOut(pDC, gsPa, workerId, pSoPrimData, numPrims_lo, stream);
+
+                                if (numPrims_hi)
+                                {
+                                    gsPa.useAlternateOffset = true;
+                                    StreamOut(pDC, gsPa, workerId, pSoPrimData, numPrims_hi, stream);
+                                }
+#else
                                  StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
+#endif
                              }
  
                              if (HasRastT::value && state.soState.streamToRasterizer == stream)
                              {
+#if USE_SIMD16_FRONTEND
+                                simd16scalari vPrimId;
+                                // pull primitiveID from the GS output if available
+                                if (state.gsState.emitsPrimitiveID)
+                                {
+                                    simd16vector primIdAttrib[3];
+                                    gsPa.Assemble_simd16(VERTEX_PRIMID_SLOT, primIdAttrib);
+                                    vPrimId = _simd16_castps_si(primIdAttrib[state.frontendState.topologyProvokingVertex].x);
+                                }
+                                else
+                                {
+                                    vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);
+                                }
+
+                                // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
+                                simd16scalari vViewPortIdx;
+                                if (state.gsState.emitsViewportArrayIndex)
+                                {
+                                    simd16vector vpiAttrib[3];
+                                    gsPa.Assemble_simd16(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);
+
+                                    // OOB indices => forced to zero.
+                                    simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+                                    simd16scalari vClearMask = _simd16_cmplt_epi32(_simd16_castps_si(vpiAttrib[0].x), vNumViewports);
+                                    vpiAttrib[0].x = _simd16_and_ps(_simd16_castsi_ps(vClearMask), vpiAttrib[0].x);
+
+                                    vViewPortIdx = _simd16_castps_si(vpiAttrib[0].x);
+                                }
+                                else
+                                {
+                                    vViewPortIdx = _simd16_set1_epi32(0);
+                                }
+
+                                gsPa.useAlternateOffset = false;
+                                pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
+#else
                                  simdscalari vPrimId;
                                  // pull primitiveID from the GS output if available
                                  if (state.gsState.emitsPrimitiveID)
                                  {
                                      simdvector primIdAttrib[3];
                                      gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
-                                    vPrimId = _simd_castps_si(primIdAttrib[0].x);
+                                    vPrimId = _simd_castps_si(primIdAttrib[state.frontendState.topologyProvokingVertex].x);
                                  }
                                  else
                                  {
                                      vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
                                  }
  
-                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId);
+                                // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
+                                simdscalari vViewPortIdx;
+                                if (state.gsState.emitsViewportArrayIndex)
+                                {
+                                    simdvector vpiAttrib[3];
+                                    gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);
+
+                                    // OOB indices => forced to zero.
+                                    simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+                                    simdscalari vClearMask = _simd_cmplt_epi32(_simd_castps_si(vpiAttrib[0].x), vNumViewports);
+                                    vpiAttrib[0].x = _simd_and_ps(_simd_castsi_ps(vClearMask), vpiAttrib[0].x);
+
+                                    vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
+                                }
+                                else
+                                {
+                                    vViewPortIdx = _simd_set1_epi32(0);
+                                }
+
+                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
+#endif
                              }
                          }
                      } while (gsPa.NextPrim());
@@ -873,10 +1047,10 @@ static void GeometryShaderStage(
      }
  
      // update GS pipeline stats
-    UPDATE_STAT(GsInvocations, numInputPrims * pState->instanceCount);
-    UPDATE_STAT(GsPrimitives, totalPrimsGenerated);
-
-    RDTSC_STOP(FEGeometryShader, 1, 0);
+    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
+    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
+    AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
+    AR_END(FEGeometryShader, 1);
  }
  
  //////////////////////////////////////////////////////////////////////////
@@ -885,24 +1059,23 @@ static void GeometryShaderStage(
  /// @param state - API state
  /// @param ppGsOut - pointer to GS output buffer allocation
  /// @param ppCutBuffer - pointer to GS output cut buffer allocation
+template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
  static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
      void **ppStreamCutBuffer)
  {
      auto pArena = pDC->pArena;
      SWR_ASSERT(pArena != nullptr);
      SWR_ASSERT(state.gsState.gsEnable);
+
      // allocate arena space to hold GS output verts
      // @todo pack attribs
      // @todo support multiple streams
-    const uint32_t vertexStride = sizeof(simdvertex);
-    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
-    uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
-    *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));
  
-    const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
-    const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
-    const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
-    const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
+    const GsBufferInfo<SIMDVERTEX, SIMD_WIDTH> bufferInfo(state.gsState);
+
+    const uint32_t vertexBufferSize = state.gsState.instanceCount * bufferInfo.vertexInstanceStride;
+
+    *ppGsOut = pArena->AllocAligned(vertexBufferSize, SIMD_WIDTH * sizeof(float));
  
      // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
      // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
@@ -910,15 +1083,19 @@ static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state,
      // allocate space for temporary per-stream cut buffer if multi-stream is enabled
      if (state.gsState.isSingleStream)
      {
-        *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
+        const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
+
+        *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
          *ppStreamCutBuffer = nullptr;
      }
      else
      {
-        *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
-        *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
-    }
+        const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
+        const uint32_t streamCutBufferSize = state.gsState.instanceCount * bufferInfo.streamCutInstanceStride;
  
+        *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
+        *ppStreamCutBuffer = pArena->AllocAligned(streamCutBufferSize, SIMD_WIDTH * sizeof(float));
+    }
  }
  
  //////////////////////////////////////////////////////////////////////////
@@ -969,11 +1146,14 @@ static void TessellationStages(
      void* pCutBuffer,
      void* pCutStreamBuffer,
      uint32_t* pSoPrimData,
+#if USE_SIMD16_FRONTEND
+    uint32_t numPrims_simd8,
+#endif
      simdscalari primID)
  {
+    SWR_CONTEXT *pContext = pDC->pContext;
      const API_STATE& state = GetApiState(pDC);
      const SWR_TS_STATE& tsState = state.tsState;
-    SWR_CONTEXT *pContext = pDC->pContext; // Needed for UPDATE_STATS macro
  
      SWR_ASSERT(gt_pTessellationThreadData);
  
@@ -995,6 +1175,20 @@ static void TessellationStages(
      }
      SWR_ASSERT(tsCtx);
  
+#if USE_SIMD16_FRONTEND
+    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
+    if (HasRastT::value)
+    {
+        switch (tsState.postDSTopology)
+        {
+        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles_simd16; break;
+        case TOP_LINE_LIST:     pfnClipFunc = ClipLines_simd16; break;
+        case TOP_POINT_LIST:    pfnClipFunc = ClipPoints_simd16; break;
+        default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
+        }
+    }
+
+#else
      PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
      if (HasRastT::value)
      {
@@ -1003,10 +1197,11 @@ static void TessellationStages(
          case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
          case TOP_LINE_LIST:     pfnClipFunc = ClipLines; break;
          case TOP_POINT_LIST:    pfnClipFunc = ClipPoints; break;
-        default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
+        default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
          }
      }
  
+#endif
      SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
      hsContext.pCPout = gt_pTessellationThreadData->patchData;
      hsContext.PrimitiveID = primID;
@@ -1031,15 +1226,19 @@ static void TessellationStages(
      memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
  #endif
  
+#if USE_SIMD16_FRONTEND
+    uint32_t numPrims = numPrims_simd8;
+#else
      uint32_t numPrims = pa.NumPrims();
+#endif
      hsContext.mask = GenerateMask(numPrims);
  
      // Run the HS
-    RDTSC_START(FEHullShader);
+    AR_BEGIN(FEHullShader, pDC->drawId);
      state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
-    RDTSC_STOP(FEHullShader, 0, 0);
+    AR_END(FEHullShader, 0);
  
-    UPDATE_STAT(HsInvocations, numPrims);
+    UPDATE_STAT_FE(HsInvocations, numPrims);
  
      const uint32_t* pPrimId = (const uint32_t*)&primID;
  
@@ -1047,9 +1246,10 @@ static void TessellationStages(
      {
          // Run Tessellator
          SWR_TS_TESSELLATED_DATA tsData = { 0 };
-        RDTSC_START(FETessellation);
+        AR_BEGIN(FETessellation, pDC->drawId);
          TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
-        RDTSC_STOP(FETessellation, 0, 0);
+        AR_EVENT(TessPrimCount(1));
+        AR_END(FETessellation, 0);
  
          if (tsData.NumPrimitives == 0)
          {
@@ -1060,12 +1260,20 @@ static void TessellationStages(
          // Allocate DS Output memory
          uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
          size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
+#if USE_SIMD16_FRONTEND
+        size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs;      // simd8 -> simd16, padding
+#else
          size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
+#endif
          if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
          {
              AlignedFree(gt_pTessellationThreadData->pDSOutput);
              gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
+#if USE_SIMD16_FRONTEND
+            gt_pTessellationThreadData->numDSOutputVectors = RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs; // simd8 -> simd16, padding
+#else
              gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
+#endif
          }
          SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
          SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);
@@ -1081,7 +1289,11 @@ static void TessellationStages(
          dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
          dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
          dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
+#if USE_SIMD16_FRONTEND
+        dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations);      // simd8 -> simd16
+#else
          dsContext.vectorStride = requiredDSVectorInvocations;
+#endif
  
          uint32_t dsInvocations = 0;
  
@@ -1089,18 +1301,27 @@ static void TessellationStages(
          {
              dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
  
-            RDTSC_START(FEDomainShader);
+            AR_BEGIN(FEDomainShader, pDC->drawId);
              state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
-            RDTSC_STOP(FEDomainShader, 0, 0);
+            AR_END(FEDomainShader, 0);
  
              dsInvocations += KNOB_SIMD_WIDTH;
          }
-        UPDATE_STAT(DsInvocations, tsData.NumDomainPoints);
+        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
+
+#if USE_SIMD16_FRONTEND
+        SWR_ASSERT(IsEven(dsContext.vectorStride));                             // simd8 -> simd16
  
+#endif
          PA_TESS tessPa(
              pDC,
+#if USE_SIMD16_FRONTEND
+            reinterpret_cast<const simd16scalar *>(dsContext.pOutputData),      // simd8 -> simd16
+            dsContext.vectorStride / 2,                                         // simd8 -> simd16
+#else
              dsContext.pOutputData,
              dsContext.vectorStride,
+#endif
              tsState.numDsOutputAttribs,
              tsData.ppIndices,
              tsData.NumPrimitives,
@@ -1108,33 +1329,76 @@ static void TessellationStages(
  
          while (tessPa.HasWork())
          {
+#if USE_SIMD16_FRONTEND
+            const uint32_t numPrims = tessPa.NumPrims();
+            const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
+            const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
+
+            const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
+            const simdscalari primID_lo = _simd16_extract_si(primID, 0);
+            const simdscalari primID_hi = _simd16_extract_si(primID, 1);
+
+#endif
              if (HasGeometryShaderT::value)
              {
+#if USE_SIMD16_FRONTEND
+                tessPa.useAlternateOffset = false;
+                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_lo, primID_lo);
+
+                if (numPrims_hi)
+                {
+                    tessPa.useAlternateOffset = true;
+                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_hi, primID_hi);
+                }
+#else
                  GeometryShaderStage<HasStreamOutT, HasRastT>(
                      pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
                      _simd_set1_epi32(dsContext.PrimitiveID));
+#endif
              }
              else
              {
                  if (HasStreamOutT::value)
                  {
+#if USE_SIMD16_FRONTEND
+                    tessPa.useAlternateOffset = false;
+                    StreamOut(pDC, tessPa, workerId, pSoPrimData, numPrims_lo, 0);
+
+                    if (numPrims_hi)
+                    {
+                        tessPa.useAlternateOffset = true;
+                        StreamOut(pDC, tessPa, workerId, pSoPrimData, numPrims_hi, 0);
+                    }
+#else
                      StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
+#endif
                  }
  
                  if (HasRastT::value)
                  {
-                    simdvector prim[3]; // Only deal with triangles, lines, or points
-                    RDTSC_START(FEPAAssemble);
-#if SWR_ENABLE_ASSERTS
-                    bool assemble =
+#if USE_SIMD16_FRONTEND
+                    simd16vector    prim_simd16[3]; // Only deal with triangles, lines, or points
+#else
+                    simdvector      prim[3];        // Only deal with triangles, lines, or points
  #endif
+                    AR_BEGIN(FEPAAssemble, pDC->drawId);
+                    bool assemble =
+#if USE_SIMD16_FRONTEND
+                        tessPa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16);
+#else
                          tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
-                    RDTSC_STOP(FEPAAssemble, 1, 0);
+#endif
+                    AR_END(FEPAAssemble, 1);
                      SWR_ASSERT(assemble);
  
                      SWR_ASSERT(pfnClipFunc);
+#if USE_SIMD16_FRONTEND
+                    tessPa.useAlternateOffset = false;
+                    pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID, _simd16_set1_epi32(0));
+#else
                      pfnClipFunc(pDC, tessPa, workerId, prim,
-                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID));
+                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
+#endif
                  }
              }
  
@@ -1143,9 +1407,21 @@ static void TessellationStages(
          } // while (tessPa.HasWork())
      } // for (uint32_t p = 0; p < numPrims; ++p)
  
+#if USE_SIMD16_FRONTEND
+    if (gt_pTessellationThreadData->pDSOutput != nullptr)
+    {
+        AlignedFree(gt_pTessellationThreadData->pDSOutput);
+        gt_pTessellationThreadData->pDSOutput = nullptr;
+    }
+    gt_pTessellationThreadData->numDSOutputVectors = 0;
+
+#endif
      TSDestroyCtx(tsCtx);
  }
  
+THREAD PA_STATE::SIMDVERTEX *pVertexStore = nullptr;
+THREAD uint32_t gVertexStoreSize = 0;
+
  //////////////////////////////////////////////////////////////////////////
  /// @brief FE handler for SwrDraw.
  /// @tparam IsIndexedT - Is indexed drawing enabled
@@ -1178,16 +1454,13 @@ void ProcessDraw(
      }
  #endif
  
-    RDTSC_START(FEProcessDraw);
+    AR_BEGIN(FEProcessDraw, pDC->drawId);
  
      DRAW_WORK&          work = *(DRAW_WORK*)pUserData;
      const API_STATE&    state = GetApiState(pDC);
-    __m256i             vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
-    SWR_VS_CONTEXT      vsContext;
-    simdvertex          vin;
  
-    int indexSize = 0;
-    uint32_t endVertex = work.numVerts; 
+    uint32_t indexSize = 0;
+    uint32_t endVertex = work.numVerts;
  
      const int32_t* pLastRequestedIndex = nullptr;
      if (IsIndexedT::value)
@@ -1209,7 +1482,7 @@ void ProcessDraw(
              pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
              break;
          default:
-            SWR_ASSERT(0);
+            SWR_INVALID("Invalid work.type: %d", work.type);
          }
      }
      else
@@ -1218,31 +1491,7 @@ void ProcessDraw(
          endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
      }
  
-    SWR_FETCH_CONTEXT fetchInfo = { 0 };
-    fetchInfo.pStreams = &state.vertexBuffers[0];
-    fetchInfo.StartInstance = work.startInstance;
-    fetchInfo.StartVertex = 0;
-
-    vsContext.pVin = &vin;
-
-    if (IsIndexedT::value)
-    {
-        fetchInfo.BaseVertex = work.baseVertex;
-
-        // if the entire index buffer isn't being consumed, set the last index
-        // so that fetches < a SIMD wide will be masked off
-        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
-        if (pLastRequestedIndex < fetchInfo.pLastIndex)
-        {
-            fetchInfo.pLastIndex = pLastRequestedIndex;
-        }
-    }
-    else
-    {
-        fetchInfo.StartVertex = work.startVertex;
-    }
-
-#ifdef KNOB_ENABLE_RDTSC
+#if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
      uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
  #endif
  
@@ -1251,7 +1500,11 @@ void ProcessDraw(
      void* pStreamCutBuffer = nullptr;
      if (HasGeometryShaderT::value)
      {
-        AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
+#if USE_SIMD16_FRONTEND
+        AllocateGsBuffers<simd16vertex, KNOB_SIMD16_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
+#else
+        AllocateGsBuffers<simdvertex, KNOB_SIMD_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
+#endif
      }
  
      if (HasTessellationT::value)
@@ -1274,92 +1527,177 @@ void ProcessDraw(
      if (HasStreamOutT::value)
      {
          pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
+    }
  
-        // update the
-        for (uint32_t i = 0; i < 4; ++i)
+    const uint32_t vertexCount = NumVertsPerPrim(state.topology, state.gsState.gsEnable);
+
+    SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);
+
+    // grow the vertex store for the PA as necessary
+    if (gVertexStoreSize < vertexCount)
+    {
+        if (pVertexStore != nullptr)
+        {
+            AlignedFree(pVertexStore);
+        }
+
+        while (gVertexStoreSize < vertexCount)
          {
-            SET_STAT(SoWriteOffset[i], state.soBuffer[i].streamOffset);
+#if USE_SIMD16_FRONTEND
+            gVertexStoreSize += 4;  // grow in chunks of 4 simd16vertex
+#else
+            gVertexStoreSize += 8;  // grow in chunks of 8 simdvertex
+#endif
          }
  
+        SWR_ASSERT(gVertexStoreSize <= MAX_NUM_VERTS_PER_PRIM);
+
+        pVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(gVertexStoreSize * sizeof(pVertexStore[0]), 64));
+
+        SWR_ASSERT(pVertexStore != nullptr);
      }
  
      // choose primitive assembler
-    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
+    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, pVertexStore, gVertexStoreSize);
      PA_STATE& pa = paFactory.GetPA();
  
-    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
+#if USE_SIMD16_FRONTEND
+    simdvertex          vin_lo;
+    simdvertex          vin_hi;
+    SWR_VS_CONTEXT      vsContext_lo;
+    SWR_VS_CONTEXT      vsContext_hi;
+
+    vsContext_lo.pVin = &vin_lo;
+    vsContext_hi.pVin = &vin_hi;
+    vsContext_lo.AlternateOffset = 0;
+    vsContext_hi.AlternateOffset = 1;
+
+    SWR_FETCH_CONTEXT   fetchInfo_lo = { 0 };
+
+    fetchInfo_lo.pStreams = &state.vertexBuffers[0];
+    fetchInfo_lo.StartInstance = work.startInstance;
+    fetchInfo_lo.StartVertex = 0;
+
+    if (IsIndexedT::value)
+    {
+        fetchInfo_lo.BaseVertex = work.baseVertex;
+
+        // if the entire index buffer isn't being consumed, set the last index
+        // so that fetches < a SIMD wide will be masked off
+        fetchInfo_lo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
+        if (pLastRequestedIndex < fetchInfo_lo.pLastIndex)
+        {
+            fetchInfo_lo.pLastIndex = pLastRequestedIndex;
+        }
+    }
+    else
+    {
+        fetchInfo_lo.StartVertex = work.startVertex;
+    }
+
+    SWR_FETCH_CONTEXT   fetchInfo_hi = fetchInfo_lo;
+
+    const simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+
      for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
      {
-        simdscalari vIndex;
          uint32_t  i = 0;
  
+        simd16scalari vIndex;
+
          if (IsIndexedT::value)
          {
-            fetchInfo.pIndices = work.pIB;
+            fetchInfo_lo.pIndices = work.pIB;
+            fetchInfo_hi.pIndices = (int32_t *)((uint8_t *)fetchInfo_lo.pIndices + KNOB_SIMD_WIDTH * indexSize);    // 1/2 of KNOB_SIMD16_WIDTH
          }
          else
          {
-            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
-            fetchInfo.pIndices = (const int32_t*)&vIndex;
+            vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);
+
+            fetchInfo_lo.pIndices = (const int32_t *)&vIndex;
+            fetchInfo_hi.pIndices = (const int32_t *)&vIndex + KNOB_SIMD_WIDTH; // 1/2 of KNOB_SIMD16_WIDTH
          }
  
-        fetchInfo.CurInstance = instanceNum;
-        vsContext.InstanceID = instanceNum;
+        fetchInfo_lo.CurInstance = instanceNum;
+        fetchInfo_hi.CurInstance = instanceNum;
+
+        vsContext_lo.InstanceID = instanceNum;
+        vsContext_hi.InstanceID = instanceNum;
  
          while (pa.HasWork())
          {
-            // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
+            // GetNextVsOutput currently has the side effect of updating some PA state machine state.
              // So we need to keep this outside of (i < endVertex) check.
-            simdmask* pvCutIndices = nullptr;
+
+            simdmask *pvCutIndices_lo = nullptr;
+            simdmask *pvCutIndices_hi = nullptr;
+
              if (IsIndexedT::value)
              {
-                pvCutIndices = &pa.GetNextVsIndices();
+                // simd16mask <=> simdmask[2]
+
+                pvCutIndices_lo = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[0];
+                pvCutIndices_hi = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[1];
              }
  
-            simdvertex& vout = pa.GetNextVsOutput();
-            vsContext.pVout = &vout;
+            simd16vertex &vout = pa.GetNextVsOutput();
+
+            vsContext_lo.pVout = reinterpret_cast<simdvertex *>(&vout);
+            vsContext_hi.pVout = reinterpret_cast<simdvertex *>(&vout);
  
              if (i < endVertex)
              {
-
                  // 1. Execute FS/VS for a single SIMD.
-                RDTSC_START(FEFetchShader);
-                state.pfnFetchFunc(fetchInfo, vin);
-                RDTSC_STOP(FEFetchShader, 0, 0);
+                AR_BEGIN(FEFetchShader, pDC->drawId);
+                state.pfnFetchFunc(fetchInfo_lo, vin_lo);
+
+                if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of KNOB_SIMD16_WIDTH
+                {
+                    state.pfnFetchFunc(fetchInfo_hi, vin_hi);
+                }
+                AR_END(FEFetchShader, 0);
  
                  // forward fetch generated vertex IDs to the vertex shader
-                vsContext.VertexID = fetchInfo.VertexID;
+                vsContext_lo.VertexID = fetchInfo_lo.VertexID;
+                vsContext_hi.VertexID = fetchInfo_hi.VertexID;
  
                  // Setup active mask for vertex shader.
-                vsContext.mask = GenerateMask(endVertex - i);
+                vsContext_lo.mask = GenerateMask(endVertex - i);
+                vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
  
                  // forward cut mask to the PA
                  if (IsIndexedT::value)
                  {
-                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
+                    *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
+                    *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
                  }
  
-                UPDATE_STAT(IaVertices, GetNumInvocations(i, endVertex));
+                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
  
  #if KNOB_ENABLE_TOSS_POINTS
                  if (!KNOB_TOSS_FETCH)
  #endif
                  {
-                    RDTSC_START(FEVertexShader);
-                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
-                    RDTSC_STOP(FEVertexShader, 0, 0);
+                    AR_BEGIN(FEVertexShader, pDC->drawId);
+                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);
  
-                    UPDATE_STAT(VsInvocations, GetNumInvocations(i, endVertex));
+                    if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of KNOB_SIMD16_WIDTH
+                    {
+                        state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi);
+                    }
+                    AR_END(FEVertexShader, 0);
+
+                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                  }
              }
  
              // 2. Assemble primitives given the last two SIMD.
              do
              {
-                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
-                // PaAssemble returns false if there is not enough verts to assemble.
+                simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];
+
                  RDTSC_START(FEPAAssemble);
-                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
+                bool assemble = pa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16);
                  RDTSC_STOP(FEPAAssemble, 1, 0);
  
  #if KNOB_ENABLE_TOSS_POINTS
@@ -1372,31 +1710,64 @@ void ProcessDraw(
                      {
                          if (assemble)
                          {
-                            UPDATE_STAT(IaPrimitives, pa.NumPrims());
+                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
+
+                            const uint32_t numPrims = pa.NumPrims();
+                            const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
+                            const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
+
+                            const simd16scalari primID = pa.GetPrimID(work.startPrimID);
+                            const simdscalari primID_lo = _simd16_extract_si(primID, 0);
+                            const simdscalari primID_hi = _simd16_extract_si(primID, 1);
  
                              if (HasTessellationT::value)
                              {
-                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
-                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
+                                pa.useAlternateOffset = false;
+                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);
+
+                                if (numPrims_hi)
+                                {
+                                    pa.useAlternateOffset = true;
+                                    TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
+                                }
                              }
                              else if (HasGeometryShaderT::value)
                              {
-                                GeometryShaderStage<HasStreamOutT, HasRastT>(
-                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
+                                pa.useAlternateOffset = false;
+                                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);
+
+                                if (numPrims_hi)
+                                {
+                                    pa.useAlternateOffset = true;
+                                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
+                                }
                              }
                              else
                              {
                                  // If streamout is enabled then stream vertices out to memory.
                                  if (HasStreamOutT::value)
                                  {
+#if 1
+                                    pa.useAlternateOffset = false;
+                                    StreamOut(pDC, pa, workerId, pSoPrimData, numPrims_lo, 0);
+
+                                    if (numPrims_hi)
+                                    {
+                                        pa.useAlternateOffset = true;
+                                        StreamOut(pDC, pa, workerId, pSoPrimData, numPrims_hi, 0);
+                                    }
+#else
+                                    pa.useAlternateOffset = false;
                                      StreamOut(pDC, pa, workerId, pSoPrimData, 0);
+#endif
                                  }
  
                                  if (HasRastT::value)
                                  {
-                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);
-                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
-                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID));
+                                    SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
+
+                                    pa.useAlternateOffset = false;
+                                    pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID, _simd16_setzero_si());
                                  }
                              }
                          }
@@ -1404,978 +1775,210 @@ void ProcessDraw(
                  }
              } while (pa.NextPrim());
  
-            i += KNOB_SIMD_WIDTH;
              if (IsIndexedT::value)
              {
-                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
+                fetchInfo_lo.pIndices = (int32_t *)((uint8_t*)fetchInfo_lo.pIndices + KNOB_SIMD16_WIDTH * indexSize);
+                fetchInfo_hi.pIndices = (int32_t *)((uint8_t*)fetchInfo_hi.pIndices + KNOB_SIMD16_WIDTH * indexSize);
              }
              else
              {
-                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
+                vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
              }
-        }
-        pa.Reset();
-    }
-
-    RDTSC_STOP(FEProcessDraw, numPrims * work.numInstances, pDC->drawId);
-}
  
-struct FEDrawChooser
-{
-    typedef PFN_FE_WORK_FUNC FuncType;
-
-    template <typename... ArgsB>
-    static FuncType GetFunc()
-    {
-        return ProcessDraw<ArgsB...>;
-    }
-};
-
-
-// Selector for correct templated Draw front-end function
-PFN_FE_WORK_FUNC GetProcessDrawFunc(
-    bool IsIndexed,
-    bool IsCutIndexEnabled,
-    bool HasTessellation,
-    bool HasGeometryShader,
-    bool HasStreamOut,
-    bool HasRasterization)
-{
-    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
-}
-
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Processes attributes for the backend based on linkage mask and
-///        linkage map.  Essentially just doing an SOA->AOS conversion and pack.
-/// @param pDC - Draw context
-/// @param pa - Primitive Assembly state
-/// @param linkageMask - Specifies which VS outputs are routed to PS.
-/// @param pLinkageMap - maps VS attribute slot to PS slot
-/// @param triIndex - Triangle to process attributes for
-/// @param pBuffer - Output result
-template<uint32_t NumVerts>
-INLINE void ProcessAttributes(
-    DRAW_CONTEXT *pDC,
-    PA_STATE&pa,
-    uint32_t linkageMask,
-    const uint8_t* pLinkageMap,
-    uint32_t triIndex,
-    float *pBuffer)
-{
-    DWORD slot = 0;
-    uint32_t mapIdx = 0;
-    LONG constantInterpMask = pDC->pState->state.backendState.constantInterpolationMask;
-    const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
-
-    while (_BitScanForward(&slot, linkageMask))
-    {
-        linkageMask &= ~(1 << slot); // done with this bit.
-
-        // compute absolute slot in vertex attrib array
-        uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + pLinkageMap[mapIdx];
-
-        __m128 attrib[3];    // triangle attribs (always 4 wide)
-        pa.AssembleSingle(inputSlot, triIndex, attrib);
-
-        if (_bittest(&constantInterpMask, mapIdx))
-        {
-            for (uint32_t i = 0; i < NumVerts; ++i)
-            {
-                _mm_store_ps(pBuffer, attrib[provokingVertex]);
-                pBuffer += 4;
-            }
-        }
-        else
-        {
-            for (uint32_t i = 0; i < NumVerts; ++i)
-            {
-                _mm_store_ps(pBuffer, attrib[i]);
-                pBuffer += 4;
-            }
-        }
-
-        // pad out the attrib buffer to 3 verts to ensure the triangle
-        // interpolation code in the pixel shader works correctly for the
-        // 3 topologies - point, line, tri.  This effectively zeros out the
-        // effect of the missing vertices in the triangle interpolation.
-        for (uint32_t i = NumVerts; i < 3; ++i)
-        {
-            _mm_store_ps(pBuffer, attrib[NumVerts - 1]);
-            pBuffer += 4;
-        }
-
-        mapIdx++;
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Processes enabled user clip distances. Loads the active clip
-///        distances from the PA, sets up barycentric equations, and
-///        stores the results to the output buffer
-/// @param pa - Primitive Assembly state
-/// @param primIndex - primitive index to process
-/// @param clipDistMask - mask of enabled clip distances
-/// @param pUserClipBuffer - buffer to store results
-template<uint32_t NumVerts>
-void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
-{
-    DWORD clipDist;
-    while (_BitScanForward(&clipDist, clipDistMask))
-    {
-        clipDistMask &= ~(1 << clipDist);
-        uint32_t clipSlot = clipDist >> 2;
-        uint32_t clipComp = clipDist & 0x3;
-        uint32_t clipAttribSlot = clipSlot == 0 ?
-            VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
-
-        __m128 primClipDist[3];
-        pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
-
-        float vertClipDist[NumVerts];
-        for (uint32_t e = 0; e < NumVerts; ++e)
-        {
-            OSALIGNSIMD(float) aVertClipDist[4];
-            _mm_store_ps(aVertClipDist, primClipDist[e]);
-            vertClipDist[e] = aVertClipDist[clipComp];
-        };
-
-        // setup plane equations for barycentric interpolation in the backend
-        float baryCoeff[NumVerts];
-        for (uint32_t e = 0; e < NumVerts - 1; ++e)
-        {
-            baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1];
+            i += KNOB_SIMD16_WIDTH;
          }
-        baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1];
  
-        for (uint32_t e = 0; e < NumVerts; ++e)
-        {
-            *(pUserClipBuffer++) = baryCoeff[e];
-        }
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
-///        culling, viewport transform, etc.
-/// @param pDC - pointer to draw context.
-/// @param pa - The primitive assembly object.
-/// @param workerId - thread's worker id. Even thread has a unique id.
-/// @param tri - Contains triangle position data for SIMDs worth of triangles.
-/// @param primID - Primitive ID for each triangle.
-void BinTriangles(
-    DRAW_CONTEXT *pDC,
-    PA_STATE& pa,
-    uint32_t workerId,
-    simdvector tri[3],
-    uint32_t triMask,
-    simdscalari primID)
-{
-    RDTSC_START(FEBinTriangles);
-
-    const API_STATE& state = GetApiState(pDC);
-    const SWR_RASTSTATE& rastState = state.rastState;
-    const SWR_FRONTEND_STATE& feState = state.frontendState;
-    const SWR_GS_STATE& gsState = state.gsState;
-    MacroTileMgr *pTileMgr = pDC->pTileMgr;
-
-    // Simple wireframe mode for debugging purposes only
-
-    simdscalar vRecipW0 = _simd_set1_ps(1.0f);
-    simdscalar vRecipW1 = _simd_set1_ps(1.0f);
-    simdscalar vRecipW2 = _simd_set1_ps(1.0f);
-
-    if (!feState.vpTransformDisable)
-    {
-        // perspective divide
-        vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
-        vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
-        vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);
-
-        tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
-        tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
-        tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);
-
-        tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
-        tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
-        tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);
-
-        tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
-        tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
-        tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
-
-        // viewport transform to screen coords
-        viewportTransform<3>(tri, state.vpMatrix[0]);
-    }
-
-    // adjust for pixel center location
-    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
-    tri[0].x = _simd_add_ps(tri[0].x, offset);
-    tri[0].y = _simd_add_ps(tri[0].y, offset);
-
-    tri[1].x = _simd_add_ps(tri[1].x, offset);
-    tri[1].y = _simd_add_ps(tri[1].y, offset);
-
-    tri[2].x = _simd_add_ps(tri[2].x, offset);
-    tri[2].y = _simd_add_ps(tri[2].y, offset);
-
-    // convert to fixed point
-    simdscalari vXi[3], vYi[3];
-    vXi[0] = fpToFixedPointVertical(tri[0].x);
-    vYi[0] = fpToFixedPointVertical(tri[0].y);
-    vXi[1] = fpToFixedPointVertical(tri[1].x);
-    vYi[1] = fpToFixedPointVertical(tri[1].y);
-    vXi[2] = fpToFixedPointVertical(tri[2].x);
-    vYi[2] = fpToFixedPointVertical(tri[2].y);
-
-    // triangle setup
-    simdscalari vAi[3], vBi[3];
-    triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
-        
-    // determinant
-    simdscalari vDet[2];
-    calcDeterminantIntVertical(vAi, vBi, vDet);
-
-    // cull zero area
-    int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
-    int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));
-
-    int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
-
-    uint32_t origTriMask = triMask;
-    triMask &= ~cullZeroAreaMask;
-
-    // determine front winding tris
-    // CW  +det
-    // CCW -det
-    maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
-    maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
-    int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) );
-
-    uint32_t frontWindingTris;
-    if (rastState.frontWinding == SWR_FRONTWINDING_CW)
-    {
-        frontWindingTris = cwTriMask;
-    }
-    else
-    {
-        frontWindingTris = ~cwTriMask;
-    }
-
-    // cull
-    uint32_t cullTris;
-    switch ((SWR_CULLMODE)rastState.cullMode)
-    {
-    case SWR_CULLMODE_BOTH:  cullTris = 0xffffffff; break;
-    case SWR_CULLMODE_NONE:  cullTris = 0x0; break;
-    case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
-    case SWR_CULLMODE_BACK:  cullTris = ~frontWindingTris; break;
-    default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
-    }
-
-    triMask &= ~cullTris;
-
-    if (origTriMask ^ triMask)
-    {
-        RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
+        pa.Reset();
      }
  
-    // compute per tri backface
-    uint32_t frontFaceMask = frontWindingTris;
-
-    uint32_t *pPrimID = (uint32_t *)&primID;
-    DWORD triIndex = 0;
+#else
+    simdvertex          vin;
+    SWR_VS_CONTEXT      vsContext;
  
-    if (!triMask)
-    {
-        goto endBinTriangles;
-    }
+    vsContext.pVin = &vin;
  
-    // Calc bounding box of triangles
-    simdBBox bbox;
-    calcBoundingBoxIntVertical(vXi, vYi, bbox);
+    SWR_FETCH_CONTEXT   fetchInfo = { 0 };
  
-    // determine if triangle falls between pixel centers and discard
-    // only discard for non-MSAA case
-    // (left + 127) & ~255
-    // (right + 128) & ~255
+    fetchInfo.pStreams = &state.vertexBuffers[0];
+    fetchInfo.StartInstance = work.startInstance;
+    fetchInfo.StartVertex = 0;
  
-    if(rastState.sampleCount == SWR_MULTISAMPLE_1X)
+    if (IsIndexedT::value)
      {
-        origTriMask = triMask;
-
-        int cullCenterMask;
-        {
-            simdscalari left = _simd_add_epi32(bbox.left, _simd_set1_epi32(127));
-            left = _simd_and_si(left, _simd_set1_epi32(~255));
-            simdscalari right = _simd_add_epi32(bbox.right, _simd_set1_epi32(128));
-            right = _simd_and_si(right, _simd_set1_epi32(~255));
-
-            simdscalari vMaskH = _simd_cmpeq_epi32(left, right);
-
-            simdscalari top = _simd_add_epi32(bbox.top, _simd_set1_epi32(127));
-            top = _simd_and_si(top, _simd_set1_epi32(~255));
-            simdscalari bottom = _simd_add_epi32(bbox.bottom, _simd_set1_epi32(128));
-            bottom = _simd_and_si(bottom, _simd_set1_epi32(~255));
-
-            simdscalari vMaskV = _simd_cmpeq_epi32(top, bottom);
-            vMaskV = _simd_or_si(vMaskH, vMaskV);
-            cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
-        }
-
-        triMask &= ~cullCenterMask;
+        fetchInfo.BaseVertex = work.baseVertex;
  
-        if(origTriMask ^ triMask)
+        // if the entire index buffer isn't being consumed, set the last index
+        // so that fetches < a SIMD wide will be masked off
+        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
+        if (pLastRequestedIndex < fetchInfo.pLastIndex)
          {
-            RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
+            fetchInfo.pLastIndex = pLastRequestedIndex;
          }
      }
-
-    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
-    bbox.left   = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
-    bbox.top    = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
-    bbox.right  = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
-    bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
-
-    // Cull tris completely outside scissor
-    {
-        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
-        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
-        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
-        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
-        triMask = triMask & ~maskOutsideScissor;
-    }
-
-    if (!triMask)
-    {
-        goto endBinTriangles;
-    }
-
-    // Convert triangle bbox to macrotile units.
-    bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
-    bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
-    bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
-    bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
-
-    OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
-    _simd_store_si((simdscalari*)aMTLeft, bbox.left);
-    _simd_store_si((simdscalari*)aMTRight, bbox.right);
-    _simd_store_si((simdscalari*)aMTTop, bbox.top);
-    _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
-
-    // transpose verts needed for backend
-    /// @todo modify BE to take non-transformed verts
-    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
-    vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
-    vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
-    vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
-    vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);
-
-    // store render target array index
-    OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
-    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
-    {
-        simdvector vRtai[3];
-        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
-        simdscalari vRtaii;
-        vRtaii = _simd_castps_si(vRtai[0].x);
-        _simd_store_si((simdscalari*)aRTAI, vRtaii);
-    }
      else
      {
-        _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
+        fetchInfo.StartVertex = work.startVertex;
      }
  
+    const simdscalari   vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  
-    // scan remaining valid triangles and bin each separately
-    while (_BitScanForward(&triIndex, triMask))
+    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
+    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
      {
-        uint32_t linkageCount = state.linkageCount;
-        uint32_t linkageMask  = state.linkageMask;
-        uint32_t numScalarAttribs = linkageCount * 4;
-        
-        BE_WORK work;
-        work.type = DRAW;
-
-        TRIANGLE_WORK_DESC &desc = work.desc.tri;
-
-        desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
-        desc.triFlags.primID = pPrimID[triIndex];
-        desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
+        simdscalari vIndex;
+        uint32_t  i = 0;
  
-        if(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN)
+        if (IsIndexedT::value)
          {
-            work.pfnWork = gRasterizerTable[rastState.scissorEnable][rastState.sampleCount];
+            fetchInfo.pIndices = work.pIB;
          }
          else
          {
-            // for center sample pattern, all samples are at pixel center; calculate coverage
-            // once at center and broadcast the results in the backend
-            work.pfnWork = gRasterizerTable[rastState.scissorEnable][SWR_MULTISAMPLE_1X];
+            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
+            fetchInfo.pIndices = (const int32_t*)&vIndex;
          }
  
-        auto pArena = pDC->pArena;
-        SWR_ASSERT(pArena != nullptr);
-
-        // store active attribs
-        float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
-        desc.pAttribs = pAttribs;
-        desc.numAttribs = linkageCount;
-        ProcessAttributes<3>(pDC, pa, linkageMask, state.linkageMap, triIndex, desc.pAttribs);
-
-        // store triangle vertex data
-        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
-
-        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
-        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
-        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
-        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
-
-        // store user clip distances
-        if (rastState.clipDistanceMask)
-        {
-            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
-            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
-            ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
-        }
+        fetchInfo.CurInstance = instanceNum;
+        vsContext.InstanceID = instanceNum;
  
-        for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
+        while (pa.HasWork())
          {
-            for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
+            // GetNextVsOutput currently has the side effect of updating some PA state machine state.
+            // So we need to keep this outside of (i < endVertex) check.
+            simdmask* pvCutIndices = nullptr;
+            if (IsIndexedT::value)
              {
-#if KNOB_ENABLE_TOSS_POINTS
-                if (!KNOB_TOSS_SETUP_TRIS)
-#endif
-                {
-                    pTileMgr->enqueue(x, y, &work);
-                }
+                pvCutIndices = &pa.GetNextVsIndices();
              }
-        }
-        triMask &= ~(1 << triIndex);
-    }
-
-endBinTriangles:
-    RDTSC_STOP(FEBinTriangles, 1, 0);
-}
-
-
  
-//////////////////////////////////////////////////////////////////////////
-/// @brief Bin SIMD points to the backend.  Only supports point size of 1
-/// @param pDC - pointer to draw context.
-/// @param pa - The primitive assembly object.
-/// @param workerId - thread's worker id. Even thread has a unique id.
-/// @param tri - Contains point position data for SIMDs worth of points.
-/// @param primID - Primitive ID for each point.
-void BinPoints(
-    DRAW_CONTEXT *pDC,
-    PA_STATE& pa,
-    uint32_t workerId,
-    simdvector prim[3],
-    uint32_t primMask,
-    simdscalari primID)
-{
-    RDTSC_START(FEBinPoints);
-
-    simdvector& primVerts = prim[0];
-
-    const API_STATE& state = GetApiState(pDC);
-    const SWR_FRONTEND_STATE& feState = state.frontendState;
-    const SWR_GS_STATE& gsState = state.gsState;
-    const SWR_RASTSTATE& rastState = state.rastState;
-
-    if (!feState.vpTransformDisable)
-    {
-        // perspective divide
-        simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
-        primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
-        primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
-        primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);
-
-        // viewport transform to screen coords
-        viewportTransform<1>(&primVerts, state.vpMatrix[0]);
-    }
-
-    // adjust for pixel center location
-    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
-    primVerts.x = _simd_add_ps(primVerts.x, offset);
-    primVerts.y = _simd_add_ps(primVerts.y, offset);
-
-    // convert to fixed point
-    simdscalari vXi, vYi;
-    vXi = fpToFixedPointVertical(primVerts.x);
-    vYi = fpToFixedPointVertical(primVerts.y);
-
-    if (CanUseSimplePoints(pDC))
-    {
-        // adjust for top-left rule
-        vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
-        vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));
-
-        // cull points off the top-left edge of the viewport
-        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
-        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));
-
-        // compute macro tile coordinates 
-        simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
-        simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
-
-        OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
-        _simd_store_si((simdscalari*)aMacroX, macroX);
-        _simd_store_si((simdscalari*)aMacroY, macroY);
-
-        // compute raster tile coordinates
-        simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
-        simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
-
-        // compute raster tile relative x,y for coverage mask
-        simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
-        simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);
-
-        simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
-        simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);
-
-        OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
-        OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
-        _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
-        _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);
-
-        OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
-        OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
-        _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
-        _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);
-
-        OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
-        _simd_store_ps((float*)aZ, primVerts.z);
-
-        // store render target array index
-        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
-        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
-        {
-            simdvector vRtai;
-            pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
-            simdscalari vRtaii = _simd_castps_si(vRtai.x);
-            _simd_store_si((simdscalari*)aRTAI, vRtaii);
-        }
-        else
-        {
-            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
-        }
-
-        uint32_t *pPrimID = (uint32_t *)&primID;
-        DWORD primIndex = 0;
-        // scan remaining valid triangles and bin each separately
-        while (_BitScanForward(&primIndex, primMask))
-        {
-            uint32_t linkageCount = state.linkageCount;
-            uint32_t linkageMask = state.linkageMask;
-
-            uint32_t numScalarAttribs = linkageCount * 4;
-
-            BE_WORK work;
-            work.type = DRAW;
-
-            TRIANGLE_WORK_DESC &desc = work.desc.tri;
-
-            // points are always front facing
-            desc.triFlags.frontFacing = 1;
-            desc.triFlags.primID = pPrimID[primIndex];
-            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
-
-            work.pfnWork = RasterizeSimplePoint;
+            simdvertex& vout = pa.GetNextVsOutput();
+            vsContext.pVout = &vout;
  
-            auto pArena = pDC->pArena;
-            SWR_ASSERT(pArena != nullptr);
+            if (i < endVertex)
+            {
  
-            // store attributes
-            float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
-            desc.pAttribs = pAttribs;
-            desc.numAttribs = linkageCount;
+                // 1. Execute FS/VS for a single SIMD.
+                AR_BEGIN(FEFetchShader, pDC->drawId);
+                state.pfnFetchFunc(fetchInfo, vin);
+                AR_END(FEFetchShader, 0);
  
-            ProcessAttributes<1>(pDC, pa, linkageMask, state.linkageMap, primIndex, pAttribs);
+                // forward fetch generated vertex IDs to the vertex shader
+                vsContext.VertexID = fetchInfo.VertexID;
  
-            // store raster tile aligned x, y, perspective correct z
-            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
-            desc.pTriBuffer = pTriBuffer;
-            *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
-            *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
-            *pTriBuffer = aZ[primIndex];
+                // Setup active mask for vertex shader.
+                vsContext.mask = GenerateMask(endVertex - i);
  
-            uint32_t tX = aTileRelativeX[primIndex];
-            uint32_t tY = aTileRelativeY[primIndex];
+                // forward cut mask to the PA
+                if (IsIndexedT::value)
+                {
+                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
+                }
  
-            // pack the relative x,y into the coverageMask, the rasterizer will
-            // generate the true coverage mask from it
-            work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
+                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
  
-            // bin it
-            MacroTileMgr *pTileMgr = pDC->pTileMgr;
  #if KNOB_ENABLE_TOSS_POINTS
-            if (!KNOB_TOSS_SETUP_TRIS)
+                if (!KNOB_TOSS_FETCH)
  #endif
-            {
-                pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
-            }
-            primMask &= ~(1 << primIndex);
-        }
-    }
-    else
-    {
-        // non simple points need to be potentially binned to multiple macro tiles
-        simdscalar vPointSize;
-        if (rastState.pointParam)
-        {
-            simdvector size[3];
-            pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
-            vPointSize = size[0].x;
-        }
-        else
-        {
-            vPointSize = _simd_set1_ps(rastState.pointSize);
-        }
-
-        // bloat point to bbox
-        simdBBox bbox;
-        bbox.left = bbox.right = vXi;
-        bbox.top = bbox.bottom = vYi;
-
-        simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
-        simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
-        bbox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
-        bbox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
-        bbox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
-        bbox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);
-
-        // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
-        bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
-        bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
-        bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
-        bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
-
-        // Cull bloated points completely outside scissor
-        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
-        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
-        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
-        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
-        primMask = primMask & ~maskOutsideScissor;
-
-        // Convert bbox to macrotile units.
-        bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
-        bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
-        bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
-        bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
-
-        OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
-        _simd_store_si((simdscalari*)aMTLeft, bbox.left);
-        _simd_store_si((simdscalari*)aMTRight, bbox.right);
-        _simd_store_si((simdscalari*)aMTTop, bbox.top);
-        _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
-
-        // store render target array index
-        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
-        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
-        {
-            simdvector vRtai[2];
-            pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
-            simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
-            _simd_store_si((simdscalari*)aRTAI, vRtaii);
-        }
-        else
-        {
-            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
-        }
-
-        OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
-        _simd_store_ps((float*)aPointSize, vPointSize);
-
-        uint32_t *pPrimID = (uint32_t *)&primID;
-
-        OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
-        OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
-        OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];
-
-        _simd_store_ps((float*)aPrimVertsX, primVerts.x);
-        _simd_store_ps((float*)aPrimVertsY, primVerts.y);
-        _simd_store_ps((float*)aPrimVertsZ, primVerts.z);
-
-        // scan remaining valid prims and bin each separately
-        DWORD primIndex;
-        while (_BitScanForward(&primIndex, primMask))
-        {
-            uint32_t linkageCount = state.linkageCount;
-            uint32_t linkageMask = state.linkageMask;
-            uint32_t numScalarAttribs = linkageCount * 4;
-
-            BE_WORK work;
-            work.type = DRAW;
-
-            TRIANGLE_WORK_DESC &desc = work.desc.tri;
-
-            desc.triFlags.frontFacing = 1;
-            desc.triFlags.primID = pPrimID[primIndex];
-            desc.triFlags.pointSize = aPointSize[primIndex];
-            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
-
-            work.pfnWork = RasterizeTriPoint;
-
-            auto pArena = pDC->pArena;
-            SWR_ASSERT(pArena != nullptr);
-
-            // store active attribs
-            desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
-            desc.numAttribs = linkageCount;
-            ProcessAttributes<1>(pDC, pa, linkageMask, state.linkageMap, primIndex, desc.pAttribs);
-
-            // store point vertex data
-            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
-            desc.pTriBuffer = pTriBuffer;
-            *pTriBuffer++ = aPrimVertsX[primIndex];
-            *pTriBuffer++ = aPrimVertsY[primIndex];
-            *pTriBuffer = aPrimVertsZ[primIndex];
+                {
+                    AR_BEGIN(FEVertexShader, pDC->drawId);
+                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
+                    AR_END(FEVertexShader, 0);
  
-            // store user clip distances
-            if (rastState.clipDistanceMask)
-            {
-                uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
-                desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
-                ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
+                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
+                }
              }
  
-            MacroTileMgr *pTileMgr = pDC->pTileMgr;
-            for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
+            // 2. Assemble primitives given the last two SIMD.
+            do
              {
-                for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
+                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
+                // PaAssemble returns false if there is not enough verts to assemble.
+                AR_BEGIN(FEPAAssemble, pDC->drawId);
+                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
+                AR_END(FEPAAssemble, 1);
+
+#if KNOB_ENABLE_TOSS_POINTS
+                if (!KNOB_TOSS_FETCH)
+#endif
                  {
  #if KNOB_ENABLE_TOSS_POINTS
-                    if (!KNOB_TOSS_SETUP_TRIS)
+                    if (!KNOB_TOSS_VS)
  #endif
                      {
-                        pTileMgr->enqueue(x, y, &work);
+                        if (assemble)
+                        {
+                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
+
+                            if (HasTessellationT::value)
+                            {
+                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
+                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
+                            }
+                            else if (HasGeometryShaderT::value)
+                            {
+                                GeometryShaderStage<HasStreamOutT, HasRastT>(
+                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
+                            }
+                            else
+                            {
+                                // If streamout is enabled then stream vertices out to memory.
+                                if (HasStreamOutT::value)
+                                {
+                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
+                                }
+
+                                if (HasRastT::value)
+                                {
+                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);
+
+                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
+                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
+                                }
+                            }
+                        }
                      }
                  }
+            } while (pa.NextPrim());
+
+            if (IsIndexedT::value)
+            {
+                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
+            }
+            else
+            {
+                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
              }
  
-            primMask &= ~(1 << primIndex);
+            i += KNOB_SIMD_WIDTH;
          }
+        pa.Reset();
      }
  
+#endif
  
-
-    
-    RDTSC_STOP(FEBinPoints, 1, 0);
+    AR_END(FEProcessDraw, numPrims * work.numInstances);
  }
  
-//////////////////////////////////////////////////////////////////////////
-/// @brief Bin SIMD lines to the backend.
-/// @param pDC - pointer to draw context.
-/// @param pa - The primitive assembly object.
-/// @param workerId - thread's worker id. Even thread has a unique id.
-/// @param tri - Contains line position data for SIMDs worth of points.
-/// @param primID - Primitive ID for each line.
-void BinLines(
-    DRAW_CONTEXT *pDC,
-    PA_STATE& pa,
-    uint32_t workerId,
-    simdvector prim[],
-    uint32_t primMask,
-    simdscalari primID)
+struct FEDrawChooser
  {
-    RDTSC_START(FEBinLines);
-
-    const API_STATE& state = GetApiState(pDC);
-    const SWR_RASTSTATE& rastState = state.rastState;
-    const SWR_FRONTEND_STATE& feState = state.frontendState;
-    const SWR_GS_STATE& gsState = state.gsState;
-
-    simdscalar vRecipW0 = _simd_set1_ps(1.0f);
-    simdscalar vRecipW1 = _simd_set1_ps(1.0f);
-
-    if (!feState.vpTransformDisable)
-    {
-        // perspective divide
-        vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
-        vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);
-
-        prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0);
-        prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1);
-
-        prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0);
-        prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1);
-
-        prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0);
-        prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1);
-
-        // viewport transform to screen coords
-        viewportTransform<2>(prim, state.vpMatrix[0]);
-    }
-
-    // adjust for pixel center location
-    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
-    prim[0].x = _simd_add_ps(prim[0].x, offset);
-    prim[0].y = _simd_add_ps(prim[0].y, offset);
-
-    prim[1].x = _simd_add_ps(prim[1].x, offset);
-    prim[1].y = _simd_add_ps(prim[1].y, offset);
-
-    // convert to fixed point
-    simdscalari vXi[2], vYi[2];
-    vXi[0] = fpToFixedPointVertical(prim[0].x);
-    vYi[0] = fpToFixedPointVertical(prim[0].y);
-    vXi[1] = fpToFixedPointVertical(prim[1].x);
-    vYi[1] = fpToFixedPointVertical(prim[1].y);
-
-    // compute x-major vs y-major mask
-    simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
-    simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
-    simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
-    uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);
-
-    // cull zero-length lines
-    simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
-    vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));
-
-    primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
-
-    uint32_t *pPrimID = (uint32_t *)&primID;
-
-    simdscalar vUnused = _simd_setzero_ps();
-
-    // Calc bounding box of lines
-    simdBBox bbox;
-    bbox.left = _simd_min_epi32(vXi[0], vXi[1]);
-    bbox.right = _simd_max_epi32(vXi[0], vXi[1]);
-    bbox.top = _simd_min_epi32(vYi[0], vYi[1]);
-    bbox.bottom = _simd_max_epi32(vYi[0], vYi[1]);
-
-    // bloat bbox by line width along minor axis
-    simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
-    simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
-    simdBBox bloatBox;
-    bloatBox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
-    bloatBox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
-    bloatBox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
-    bloatBox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);
-
-    bbox.left = _simd_blendv_epi32(bbox.left, bloatBox.left, vYmajorMask);
-    bbox.right = _simd_blendv_epi32(bbox.right, bloatBox.right, vYmajorMask);
-    bbox.top = _simd_blendv_epi32(bloatBox.top, bbox.top, vYmajorMask);
-    bbox.bottom = _simd_blendv_epi32(bloatBox.bottom, bbox.bottom, vYmajorMask);
-
-    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
-    bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
-    bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
-    bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
-    bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
-
-    // Cull prims completely outside scissor
-    {
-        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
-        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
-        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
-        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
-        primMask = primMask & ~maskOutsideScissor;
-    }
-
-    if (!primMask)
-    {
-        goto endBinLines;
-    }
-
-    // Convert triangle bbox to macrotile units.
-    bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
-    bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
-    bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
-    bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
-
-    OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
-    _simd_store_si((simdscalari*)aMTLeft, bbox.left);
-    _simd_store_si((simdscalari*)aMTRight, bbox.right);
-    _simd_store_si((simdscalari*)aMTTop, bbox.top);
-    _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
-
-    // transpose verts needed for backend
-    /// @todo modify BE to take non-transformed verts
-    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
-    vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
-    vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
-    vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
-    vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);
-
-    // store render target array index
-    OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
-    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
-    {
-        simdvector vRtai[2];
-        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
-        simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
-        _simd_store_si((simdscalari*)aRTAI, vRtaii);
-    }
-    else
-    {
-        _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
-    }
+    typedef PFN_FE_WORK_FUNC FuncType;
  
-    // scan remaining valid prims and bin each separately
-    DWORD primIndex;
-    while (_BitScanForward(&primIndex, primMask))
+    template <typename... ArgsB>
+    static FuncType GetFunc()
      {
-        uint32_t linkageCount = state.linkageCount;
-        uint32_t linkageMask = state.linkageMask;
-        uint32_t numScalarAttribs = linkageCount * 4;
-
-        BE_WORK work;
-        work.type = DRAW;
-
-        TRIANGLE_WORK_DESC &desc = work.desc.tri;
-
-        desc.triFlags.frontFacing = 1;
-        desc.triFlags.primID = pPrimID[primIndex];
-        desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
-        desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
-
-        work.pfnWork = RasterizeLine;
-
-        auto pArena = pDC->pArena;
-        SWR_ASSERT(pArena != nullptr);
-
-        // store active attribs
-        desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
-        desc.numAttribs = linkageCount;
-        ProcessAttributes<2>(pDC, pa, linkageMask, state.linkageMap, primIndex, desc.pAttribs);
-
-        // store line vertex data
-        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
-        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
-        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
-        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
-        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
-
-        // store user clip distances
-        if (rastState.clipDistanceMask)
-        {
-            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
-            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
-            ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
-        }
-
-        MacroTileMgr *pTileMgr = pDC->pTileMgr;
-        for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
-        {
-            for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
-            {
-#if KNOB_ENABLE_TOSS_POINTS
-                if (!KNOB_TOSS_SETUP_TRIS)
-#endif
-                {
-                    pTileMgr->enqueue(x, y, &work);
-                }
-            }
-        }
-
-        primMask &= ~(1 << primIndex);
+        return ProcessDraw<ArgsB...>;
      }
+};
  
-endBinLines:
  
-    RDTSC_STOP(FEBinLines, 1, 0);
+// Selector for correct templated Draw front-end function
+PFN_FE_WORK_FUNC GetProcessDrawFunc(
+    bool IsIndexed,
+    bool IsCutIndexEnabled,
+    bool HasTessellation,
+    bool HasGeometryShader,
+    bool HasStreamOut,
+    bool HasRasterization)
+{
+    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
  }