gallium/swr: Fix compilation warnings

[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.cpp
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp

index 9e2f35725c535947ab268c5eab50ab6a65b51188..13aa89ed5ddc385485f149e8e7fa2df29898fab6 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -1,31 +1,31 @@
  /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file frontend.cpp
-*
-* @brief Implementation for Frontend which handles vertex processing,
-*        primitive assembly, clipping, binning, etc.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file frontend.cpp
+ *
+ * @brief Implementation for Frontend which handles vertex processing,
+ *        primitive assembly, clipping, binning, etc.
+ *
+ ******************************************************************************/
  
  #include "api.h"
  #include "frontend.h"
@@ -39,14 +39,7 @@
  #include "tilemgr.h"
  #include "tessellator.h"
  #include <limits>
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Helper macro to generate a bitmask
-static INLINE uint32_t GenMask(uint32_t numBits)
-{
-    SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
-    return ((1U << numBits) - 1);
-}
+#include <iostream>
  
  //////////////////////////////////////////////////////////////////////////
  /// @brief FE handler for SwrSync.
@@ -55,17 +48,13 @@ static INLINE uint32_t GenMask(uint32_t numBits)
  /// @param workerId - thread's worker id. Even thread has a unique id.
  /// @param pUserData - Pointer to user data passed back to sync callback.
  /// @todo This should go away when we switch this to use compute threading.
-void ProcessSync(
-    SWR_CONTEXT *pContext,
-    DRAW_CONTEXT *pDC,
-    uint32_t workerId,
-    void *pUserData)
+void ProcessSync(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
  {
      BE_WORK work;
-    work.type = SYNC;
+    work.type    = SYNC;
      work.pfnWork = ProcessSyncBE;
  
-    MacroTileMgr *pTileMgr = pDC->pTileMgr;
+    MacroTileMgr* pTileMgr = pDC->pTileMgr;
      pTileMgr->enqueue(0, 0, &work);
  }
  
@@ -75,17 +64,13 @@ void ProcessSync(
  /// @param pDC - pointer to draw context.
  /// @param workerId - thread's worker id. Even thread has a unique id.
  /// @param pUserData - Pointer to user data passed back to sync callback.
-void ProcessShutdown(
-    SWR_CONTEXT *pContext,
-    DRAW_CONTEXT *pDC,
-    uint32_t workerId,
-    void *pUserData)
+void ProcessShutdown(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
  {
      BE_WORK work;
-    work.type = SHUTDOWN;
+    work.type    = SHUTDOWN;
      work.pfnWork = ProcessShutdownBE;
  
-    MacroTileMgr *pTileMgr = pDC->pTileMgr;
+    MacroTileMgr* pTileMgr = pDC->pTileMgr;
      // Enqueue at least 1 work item for each worker thread
      // account for number of numa nodes
      uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;
@@ -106,14 +91,10 @@ void ProcessShutdown(
  /// @param workerId - thread's worker id. Even thread has a unique id.
  /// @param pUserData - Pointer to user data passed back to clear callback.
  /// @todo This should go away when we switch this to use compute threading.
-void ProcessClear(
-    SWR_CONTEXT *pContext,
-    DRAW_CONTEXT *pDC,
-    uint32_t workerId,
-    void *pUserData)
+void ProcessClear(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
  {
-    CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
-    MacroTileMgr *pTileMgr = pDC->pTileMgr;
+    CLEAR_DESC*   pDesc    = (CLEAR_DESC*)pUserData;
+    MacroTileMgr* pTileMgr = pDC->pTileMgr;
  
      // queue a clear to each macro tile
      // compute macro tile bounds for the specified rect
@@ -123,8 +104,8 @@ void ProcessClear(
      uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
  
      BE_WORK work;
-    work.type = CLEAR;
-    work.pfnWork = ProcessClearBE;
+    work.type       = CLEAR;
+    work.pfnWork    = ProcessClearBE;
      work.desc.clear = *pDesc;
  
      for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
@@ -143,15 +124,11 @@ void ProcessClear(
  /// @param workerId - thread's worker id. Even thread has a unique id.
  /// @param pUserData - Pointer to user data passed back to callback.
  /// @todo This should go away when we switch this to use compute threading.
-void ProcessStoreTiles(
-    SWR_CONTEXT *pContext,
-    DRAW_CONTEXT *pDC,
-    uint32_t workerId,
-    void *pUserData)
+void ProcessStoreTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
  {
-    AR_BEGIN(FEProcessStoreTiles, pDC->drawId);
-    MacroTileMgr *pTileMgr = pDC->pTileMgr;
-    STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
+    RDTSC_BEGIN(pContext->pBucketMgr, FEProcessStoreTiles, pDC->drawId);
+    MacroTileMgr*     pTileMgr = pDC->pTileMgr;
+    STORE_TILES_DESC* pDesc    = (STORE_TILES_DESC*)pUserData;
  
      // queue a store to each macro tile
      // compute macro tile bounds for the specified rect
@@ -162,8 +139,8 @@ void ProcessStoreTiles(
  
      // store tiles
      BE_WORK work;
-    work.type = STORETILES;
-    work.pfnWork = ProcessStoreTilesBE;
+    work.type            = STORETILES;
+    work.pfnWork         = ProcessStoreTilesBE;
      work.desc.storeTiles = *pDesc;
  
      for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
@@ -174,7 +151,7 @@ void ProcessStoreTiles(
          }
      }
  
-    AR_END(FEProcessStoreTiles, 0);
+    RDTSC_END(pContext->pBucketMgr, FEProcessStoreTiles, 0);
  }
  
  //////////////////////////////////////////////////////////////////////////
@@ -184,15 +161,14 @@ void ProcessStoreTiles(
  /// @param workerId - thread's worker id. Even thread has a unique id.
  /// @param pUserData - Pointer to user data passed back to callback.
  /// @todo This should go away when we switch this to use compute threading.
-void ProcessDiscardInvalidateTiles(
-    SWR_CONTEXT *pContext,
-    DRAW_CONTEXT *pDC,
-    uint32_t workerId,
-    void *pUserData)
+void ProcessDiscardInvalidateTiles(SWR_CONTEXT*  pContext,
+                                   DRAW_CONTEXT* pDC,
+                                   uint32_t      workerId,
+                                   void*         pUserData)
  {
-    AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
-    DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
-    MacroTileMgr *pTileMgr = pDC->pTileMgr;
+    RDTSC_BEGIN(pContext->pBucketMgr, FEProcessInvalidateTiles, pDC->drawId);
+    DISCARD_INVALIDATE_TILES_DESC* pDesc    = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
+    MacroTileMgr*                  pTileMgr = pDC->pTileMgr;
  
      // compute macro tile bounds for the specified rect
      uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
@@ -217,8 +193,8 @@ void ProcessDiscardInvalidateTiles(
  
      // load tiles
      BE_WORK work;
-    work.type = DISCARDINVALIDATETILES;
-    work.pfnWork = ProcessDiscardInvalidateTilesBE;
+    work.type                        = DISCARDINVALIDATETILES;
+    work.pfnWork                     = ProcessDiscardInvalidateTilesBE;
      work.desc.discardInvalidateTiles = *pDesc;
  
      for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
@@ -229,7 +205,7 @@ void ProcessDiscardInvalidateTiles(
          }
      }
  
-    AR_END(FEProcessInvalidateTiles, 0);
+    RDTSC_END(pContext->pBucketMgr, FEProcessInvalidateTiles, 0);
  }
  
  //////////////////////////////////////////////////////////////////////////
@@ -237,27 +213,40 @@ void ProcessDiscardInvalidateTiles(
  /// @param mode - primitive topology for draw operation.
  /// @param numPrims - number of vertices or indices for draw.
  /// @todo Frontend needs to be refactored. This will go in appropriate place then.
-uint32_t GetNumPrims(
-    PRIMITIVE_TOPOLOGY mode,
-    uint32_t numPrims)
+uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
  {
      switch (mode)
      {
-    case TOP_POINT_LIST: return numPrims;
-    case TOP_TRIANGLE_LIST: return numPrims / 3;
-    case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
-    case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
-    case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
-    case TOP_QUAD_LIST: return numPrims / 4;
-    case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
-    case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
-    case TOP_LINE_LIST: return numPrims / 2;
-    case TOP_LINE_LOOP: return numPrims;
-    case TOP_RECT_LIST: return numPrims / 3;
-    case TOP_LINE_LIST_ADJ: return numPrims / 4;
-    case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
-    case TOP_TRI_LIST_ADJ: return numPrims / 6;
-    case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
+    case TOP_POINT_LIST:
+        return numPrims;
+    case TOP_TRIANGLE_LIST:
+        return numPrims / 3;
+    case TOP_TRIANGLE_STRIP:
+        return numPrims < 3 ? 0 : numPrims - 2;
+    case TOP_TRIANGLE_FAN:
+        return numPrims < 3 ? 0 : numPrims - 2;
+    case TOP_TRIANGLE_DISC:
+        return numPrims < 2 ? 0 : numPrims - 1;
+    case TOP_QUAD_LIST:
+        return numPrims / 4;
+    case TOP_QUAD_STRIP:
+        return numPrims < 4 ? 0 : (numPrims - 2) / 2;
+    case TOP_LINE_STRIP:
+        return numPrims < 2 ? 0 : numPrims - 1;
+    case TOP_LINE_LIST:
+        return numPrims / 2;
+    case TOP_LINE_LOOP:
+        return numPrims;
+    case TOP_RECT_LIST:
+        return numPrims / 3;
+    case TOP_LINE_LIST_ADJ:
+        return numPrims / 4;
+    case TOP_LISTSTRIP_ADJ:
+        return numPrims < 3 ? 0 : numPrims - 3;
+    case TOP_TRI_LIST_ADJ:
+        return numPrims / 6;
+    case TOP_TRI_STRIP_ADJ:
+        return numPrims < 4 ? 0 : (numPrims / 2) - 2;
  
      case TOP_PATCHLIST_1:
      case TOP_PATCHLIST_2:
@@ -313,27 +302,40 @@ uint32_t GetNumPrims(
  /// @brief Computes the number of verts given the number of primitives.
  /// @param mode - primitive topology for draw operation.
  /// @param numPrims - number of primitives for draw.
-uint32_t GetNumVerts(
-    PRIMITIVE_TOPOLOGY mode,
-    uint32_t numPrims)
+uint32_t GetNumVerts(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
  {
      switch (mode)
      {
-    case TOP_POINT_LIST: return numPrims;
-    case TOP_TRIANGLE_LIST: return numPrims * 3;
-    case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
-    case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
-    case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
-    case TOP_QUAD_LIST: return numPrims * 4;
-    case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
-    case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
-    case TOP_LINE_LIST: return numPrims * 2;
-    case TOP_LINE_LOOP: return numPrims;
-    case TOP_RECT_LIST: return numPrims * 3;
-    case TOP_LINE_LIST_ADJ: return numPrims * 4;
-    case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
-    case TOP_TRI_LIST_ADJ: return numPrims * 6;
-    case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
+    case TOP_POINT_LIST:
+        return numPrims;
+    case TOP_TRIANGLE_LIST:
+        return numPrims * 3;
+    case TOP_TRIANGLE_STRIP:
+        return numPrims ? numPrims + 2 : 0;
+    case TOP_TRIANGLE_FAN:
+        return numPrims ? numPrims + 2 : 0;
+    case TOP_TRIANGLE_DISC:
+        return numPrims ? numPrims + 1 : 0;
+    case TOP_QUAD_LIST:
+        return numPrims * 4;
+    case TOP_QUAD_STRIP:
+        return numPrims ? numPrims * 2 + 2 : 0;
+    case TOP_LINE_STRIP:
+        return numPrims ? numPrims + 1 : 0;
+    case TOP_LINE_LIST:
+        return numPrims * 2;
+    case TOP_LINE_LOOP:
+        return numPrims;
+    case TOP_RECT_LIST:
+        return numPrims * 3;
+    case TOP_LINE_LIST_ADJ:
+        return numPrims * 4;
+    case TOP_LISTSTRIP_ADJ:
+        return numPrims ? numPrims + 3 : 0;
+    case TOP_TRI_LIST_ADJ:
+        return numPrims * 6;
+    case TOP_TRI_STRIP_ADJ:
+        return numPrims ? (numPrims + 2) * 2 : 0;
  
      case TOP_PATCHLIST_1:
      case TOP_PATCHLIST_2:
@@ -389,7 +391,7 @@ uint32_t GetNumVerts(
  /// @brief Return number of verts per primitive.
  /// @param topology - topology
  /// @param includeAdjVerts - include adjacent verts in primitive vertices
-INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
+uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
  {
      uint32_t numVerts = 0;
      switch (topology)
@@ -464,10 +466,15 @@ INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVert
          switch (topology)
          {
          case TOP_LISTSTRIP_ADJ:
-        case TOP_LINE_LIST_ADJ: numVerts = 4; break;
+        case TOP_LINE_LIST_ADJ:
+            numVerts = 4;
+            break;
          case TOP_TRI_STRIP_ADJ:
-        case TOP_TRI_LIST_ADJ: numVerts = 6; break;
-        default: break;
+        case TOP_TRI_LIST_ADJ:
+            numVerts = 6;
+            break;
+        default:
+            break;
          }
      }
  
@@ -479,9 +486,18 @@ INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVert
  /// @param numWorkItems - Number of items being worked on by a SIMD.
  static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
  {
-    uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
+    uint32_t numActive =
+        (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
      uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
-    return _simd_castps_si(vMask(mask));
+    return _simd_castps_si(_simd_vmask_ps(mask));
+}
+
+static INLINE simd16scalari GenerateMask16(uint32_t numItemsRemaining)
+{
+    uint32_t numActive =
+        (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining;
+    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
+    return _simd16_castps_si(_simd16_vmask_ps(mask));
  }
  
  //////////////////////////////////////////////////////////////////////////
@@ -491,25 +507,22 @@ static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
  /// @param workerId - thread's worker id. Even thread has a unique id.
  /// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
  static void StreamOut(
-    DRAW_CONTEXT* pDC,
-    PA_STATE& pa,
-    uint32_t workerId,
-    uint32_t* pPrimData,
-    uint32_t streamIndex)
+    DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t workerId, uint32_t* pPrimData, uint32_t streamIndex)
  {
-    SWR_CONTEXT *pContext = pDC->pContext;
+    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEStreamout, pDC->drawId);
  
-    AR_BEGIN(FEStreamout, pDC->drawId);
+    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
  
-    const API_STATE& state = GetApiState(pDC);
-    const SWR_STREAMOUT_STATE &soState = state.soState;
+    const API_STATE&           state   = GetApiState(pDC);
+    const SWR_STREAMOUT_STATE& soState = state.soState;
  
      uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
  
-    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
+    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each
+    // vertex.
      uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);
  
-    SWR_STREAMOUT_CONTEXT soContext = { 0 };
+    SWR_STREAMOUT_CONTEXT soContext = {0};
  
      // Setup buffer state pointers.
      for (uint32_t i = 0; i < 4; ++i)
@@ -521,14 +534,14 @@ static void StreamOut(
  
      for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
      {
-        DWORD slot = 0;
-        uint32_t soMask = soState.streamMasks[streamIndex];
+        DWORD    slot   = 0;
+        uint64_t soMask = soState.streamMasks[streamIndex];
  
          // Write all entries into primitive data buffer for SOS.
-        while (_BitScanForward(&slot, soMask))
+        while (_BitScanForward64(&slot, soMask))
          {
-            __m128 attrib[MAX_NUM_VERTS_PER_PRIM];    // prim attribs (always 4 wide)
-            uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
+            simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
+            uint32_t    paSlot = slot + soState.vertexAttribOffset[streamIndex];
              pa.AssembleSingle(paSlot, primIndex, attrib);
  
              // Attribute offset is relative offset from start of vertex.
@@ -540,20 +553,22 @@ static void StreamOut(
              // Store each vertex's attrib at appropriate locations in pPrimData buffer.
              for (uint32_t v = 0; v < soVertsPerPrim; ++v)
              {
-                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
+                uint32_t* pPrimDataAttrib =
+                    pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
  
                  _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
              }
  
-            soMask &= ~(1 << slot);
+            soMask &= ~(uint64_t(1) << slot);
          }
  
-        // Update pPrimData pointer 
+        // Update pPrimData pointer
          soContext.pPrimData = pPrimData;
  
          // Call SOS
-        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
-        state.pfnSoFunc[streamIndex](soContext);
+        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr,
+                   "Trying to execute uninitialized streamout jit function.");
+        state.pfnSoFunc[streamIndex](GetPrivateState(pDC), pWorkerData, soContext);
      }
  
      // Update SO write offset. The driver provides memory for the update.
@@ -561,7 +576,10 @@ static void StreamOut(
      {
          if (state.soBuffer[i].pWriteOffset)
          {
-            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
+            bool  nullTileAccessed = false;
+            void* pWriteOffset     = pDC->pContext->pfnTranslateGfxptrForWrite(
+                GetPrivateState(pDC), soContext.pBuffer[i]->pWriteOffset, &nullTileAccessed, pWorkerData);
+            *((uint32_t*)pWriteOffset) = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
          }
  
          if (state.soBuffer[i].soWriteEnable)
@@ -571,10 +589,12 @@ static void StreamOut(
          }
      }
  
+    pDC->dynState.soPrims += soContext.numPrimsWritten;
+
      UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
      UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
  
-    AR_END(FEStreamout, 1);
+    RDTSC_END(pDC->pContext->pBucketMgr, FEStreamout, 1);
  }
  
  #if USE_SIMD16_FRONTEND
@@ -614,7 +634,10 @@ INLINE static T RoundDownEven(T value)
  ///
  /// note: the stride between vertexes is determinded by SWR_VTX_NUM_SLOTS
  ///
-void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount)
+void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex*     vertex_simd16,
+                                           const simdvertex* vertex,
+                                           uint32_t          vertexCount,
+                                           uint32_t          attribCount)
  {
      SWR_ASSERT(vertex);
      SWR_ASSERT(vertex_simd16);
@@ -628,11 +651,13 @@ void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const si
          {
              for (uint32_t k = 0; k < 4; k += 1)
              {
-                temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);
+                temp.attrib[j][k] =
+                    _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);
  
                  if ((i + 1) < vertexCount)
                  {
-                    temp.attrib[j][k] = _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
+                    temp.attrib[j][k] =
+                        _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
                  }
              }
          }
@@ -652,9 +677,7 @@ void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const si
  ///        then return the remaining amount of work.
  /// @param curIndex - The start index for the SIMD.
  /// @param maxIndex - The last index for all work items.
-static INLINE uint32_t GetNumInvocations(
-    uint32_t curIndex,
-    uint32_t maxIndex)
+static INLINE uint32_t GetNumInvocations(uint32_t curIndex, uint32_t maxIndex)
  {
      uint32_t remainder = (maxIndex - curIndex);
  #if USE_SIMD16_FRONTEND
@@ -674,17 +697,19 @@ static INLINE uint32_t GetNumInvocations(
  /// @param pStreamIdBase - pointer to the stream ID buffer
  /// @param numEmittedVerts - Number of total verts emitted by the GS
  /// @param pCutBuffer - output buffer to write cuts to
-void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
+void ProcessStreamIdBuffer(uint32_t stream,
+                           uint8_t* pStreamIdBase,
+                           uint32_t numEmittedVerts,
+                           uint8_t* pCutBuffer)
  {
      SWR_ASSERT(stream < MAX_SO_STREAMS);
  
-    uint32_t numInputBytes = (numEmittedVerts * 2  + 7) / 8;
-    uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
+    uint32_t numOutputBytes = AlignUp(numEmittedVerts, 8) / 8;
  
      for (uint32_t b = 0; b < numOutputBytes; ++b)
      {
-        uint8_t curInputByte = pStreamIdBase[2*b];
-        uint8_t outByte = 0;
+        uint8_t curInputByte = pStreamIdBase[2 * b];
+        uint8_t outByte      = 0;
          for (uint32_t i = 0; i < 4; ++i)
          {
              if ((curInputByte & 0x3) != stream)
@@ -708,47 +733,83 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t num
      }
  }
  
-THREAD SWR_GS_CONTEXT tlsGsContext;
+// Buffers that are allocated if GS is enabled
+struct GsBuffers
+{
+    uint8_t* pGsIn;
+    uint8_t* pGsOut[KNOB_SIMD_WIDTH];
+    uint8_t* pGsTransposed;
+    void*    pStreamCutBuffer;
+};
  
-template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
-struct GsBufferInfo
+//////////////////////////////////////////////////////////////////////////
+/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
+/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive
+/// assembler
+/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
+/// @param numVerts - Number of vertices outputted by the GS
+/// @param numAttribs - Number of attributes per vertex
+template <typename SIMD_T, uint32_t SimdWidth>
+void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
  {
-    GsBufferInfo(const SWR_GS_STATE &gsState)
+    uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
+    uint32_t dstVertexStride = numAttribs * sizeof(Float<SIMD_T>) * 4;
+
+    OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];
+
+    for (uint32_t i = 0; i < SimdWidth; ++i)
      {
-        const uint32_t vertexCount = gsState.maxNumVerts;
-        const uint32_t vertexStride = sizeof(SIMDVERTEX);
-        const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / SIMD_WIDTH;
+        gatherOffsets[i] = srcVertexStride * i;
+    }
+    auto vGatherOffsets = SIMD_T::load_si((Integer<SIMD_T>*)&gatherOffsets[0]);
  
-        vertexPrimitiveStride = vertexStride * numSimdBatches;
-        vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH;
+    uint32_t numSimd        = AlignUp(numVerts, SimdWidth) / SimdWidth;
+    uint32_t remainingVerts = numVerts;
  
-        if (gsState.isSingleStream)
-        {
-            cutPrimitiveStride = (vertexCount + 7) / 8;
-            cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
+    for (uint32_t s = 0; s < numSimd; ++s)
+    {
+        uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
+        uint8_t* pDstBase = pDst + s * dstVertexStride;
  
-            streamCutPrimitiveStride = 0;
-            streamCutInstanceStride = 0;
-        }
-        else
-        {
-            cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4);
-            cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
+        // Compute mask to prevent src overflow
+        uint32_t mask = std::min(remainingVerts, SimdWidth);
+        mask          = GenMask(mask);
+        auto vMask    = SIMD_T::vmask_ps(mask);
+        auto viMask   = SIMD_T::castps_si(vMask);
  
-            streamCutPrimitiveStride = (vertexCount + 7) / 8;
-            streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH;
+        for (uint32_t a = 0; a < numAttribs; ++a)
+        {
+            auto attribGatherX = SIMD_T::mask_i32gather_ps(
+                SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
+            auto attribGatherY = SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
+                                                           (const float*)(pSrcBase + sizeof(float)),
+                                                           vGatherOffsets,
+                                                           vMask);
+            auto attribGatherZ =
+                SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
+                                          (const float*)(pSrcBase + sizeof(float) * 2),
+                                          vGatherOffsets,
+                                          vMask);
+            auto attribGatherW =
+                SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
+                                          (const float*)(pSrcBase + sizeof(float) * 3),
+                                          vGatherOffsets,
+                                          vMask);
+
+            SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
+            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>)), viMask, attribGatherY);
+            SIMD_T::maskstore_ps(
+                (float*)(pDstBase + sizeof(Float<SIMD_T>) * 2), viMask, attribGatherZ);
+            SIMD_T::maskstore_ps(
+                (float*)(pDstBase + sizeof(Float<SIMD_T>) * 3), viMask, attribGatherW);
+
+            pSrcBase += sizeof(float) * 4;
+            pDstBase += sizeof(Float<SIMD_T>) * 4;
          }
+        remainingVerts -= SimdWidth;
      }
+}
  
-    uint32_t vertexPrimitiveStride;
-    uint32_t vertexInstanceStride;
-
-    uint32_t cutPrimitiveStride;
-    uint32_t cutInstanceStride;
-
-    uint32_t streamCutPrimitiveStride;
-    uint32_t streamCutInstanceStride;
-};
  
  //////////////////////////////////////////////////////////////////////////
  /// @brief Implements GS stage.
@@ -756,64 +817,50 @@ struct GsBufferInfo
  /// @param workerId - thread's worker id. Even thread has a unique id.
  /// @param pa - The primitive assembly object.
  /// @param pGsOut - output stream for GS
-template <
-    typename HasStreamOutT,
-    typename HasRastT>
-static void GeometryShaderStage(
-    DRAW_CONTEXT *pDC,
-    uint32_t workerId,
-    PA_STATE& pa,
-    void* pGsOut,
-    void* pCutBuffer,
-    void* pStreamCutBuffer,
-    uint32_t* pSoPrimData,
+template <typename HasStreamOutT, typename HasRastT>
+static void GeometryShaderStage(DRAW_CONTEXT* pDC,
+                                uint32_t      workerId,
+                                PA_STATE&     pa,
+                                GsBuffers*    pGsBuffers,
+                                uint32_t*     pSoPrimData,
  #if USE_SIMD16_FRONTEND
-    uint32_t numPrims_simd8,
+                                uint32_t numPrims_simd8,
  #endif
-    simdscalari primID)
+                                simdscalari const& primID)
  {
-    SWR_CONTEXT *pContext = pDC->pContext;
+    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEGeometryShader, pDC->drawId);
  
-    AR_BEGIN(FEGeometryShader, pDC->drawId);
+    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
  
-    const API_STATE& state = GetApiState(pDC);
+    const API_STATE&    state  = GetApiState(pDC);
      const SWR_GS_STATE* pState = &state.gsState;
+    SWR_GS_CONTEXT      gsContext;
  
-    SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
-    SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");
+    static uint8_t sNullBuffer[128] = {0};
  
-    tlsGsContext.pStream = (uint8_t*)pGsOut;
-    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
-    tlsGsContext.PrimitiveID = primID;
+    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
+    {
+        gsContext.pStreams[i] = pGsBuffers->pGsOut[i];
+    }
+    gsContext.pVerts      = (simdvector*)pGsBuffers->pGsIn;
+    gsContext.PrimitiveID = primID;
  
-    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
+    uint32_t   numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
      simdvector attrib[MAX_NUM_VERTS_PER_PRIM];
  
      // assemble all attributes for the input primitive
+    gsContext.inputVertStride = pState->inputVertStride;
      for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
      {
-        uint32_t attribSlot = pState->vertexAttribOffset + slot;
-        pa.Assemble(attribSlot, attrib);
+        uint32_t attribOffset = slot + pState->vertexAttribOffset;
+        pa.Assemble(attribOffset, attrib);
  
          for (uint32_t i = 0; i < numVertsPerPrim; ++i)
          {
-            tlsGsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = attrib[i];
+            gsContext.pVerts[attribOffset + pState->inputVertStride * i] = attrib[i];
          }
      }
  
-    // assemble position
-    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
-    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
-    {
-        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
-    }
-
-#if USE_SIMD16_FRONTEND
-    const GsBufferInfo<simd16vertex, KNOB_SIMD16_WIDTH> bufferInfo(state.gsState);
-#else
-    const GsBufferInfo<simdvertex, KNOB_SIMD_WIDTH> bufferInfo(state.gsState);
-#endif
-
      // record valid prims from the frontend to avoid over binning the newly generated
      // prims from the GS
  #if USE_SIMD16_FRONTEND
@@ -824,14 +871,17 @@ static void GeometryShaderStage(
  
      for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
      {
-        tlsGsContext.InstanceID = instance;
-        tlsGsContext.mask = GenerateMask(numInputPrims);
+        gsContext.InstanceID = instance;
+        gsContext.mask       = GenerateMask(numInputPrims);
  
          // execute the geometry shader
-        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
+        state.pfnGsFunc(GetPrivateState(pDC), pWorkerData, &gsContext);
+        AR_EVENT(GSStats((HANDLE)&gsContext.stats));
  
-        tlsGsContext.pStream += bufferInfo.vertexInstanceStride;
-        tlsGsContext.pCutOrStreamIdBuffer += bufferInfo.cutInstanceStride;
+        for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
+        {
+            gsContext.pStreams[i] += pState->allocationSize;
+        }
      }
  
      // set up new binner and state for the GS output topology
@@ -841,23 +891,43 @@ static void GeometryShaderStage(
      {
          switch (pState->outputTopology)
          {
-        case TOP_TRIANGLE_STRIP:    pfnClipFunc = ClipTriangles_simd16; break;
-        case TOP_LINE_STRIP:        pfnClipFunc = ClipLines_simd16; break;
-        case TOP_POINT_LIST:        pfnClipFunc = ClipPoints_simd16; break;
-        default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
+        case TOP_RECT_LIST:
+            pfnClipFunc = ClipRectangles_simd16;
+            break;
+        case TOP_TRIANGLE_STRIP:
+            pfnClipFunc = ClipTriangles_simd16;
+            break;
+        case TOP_LINE_STRIP:
+            pfnClipFunc = ClipLines_simd16;
+            break;
+        case TOP_POINT_LIST:
+            pfnClipFunc = ClipPoints_simd16;
+            break;
+        default:
+            SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
          }
      }
  
  #else
-    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
+    PFN_PROCESS_PRIMS pfnClipFunc   = nullptr;
      if (HasRastT::value)
      {
          switch (pState->outputTopology)
          {
-        case TOP_TRIANGLE_STRIP:    pfnClipFunc = ClipTriangles; break;
-        case TOP_LINE_STRIP:        pfnClipFunc = ClipLines; break;
-        case TOP_POINT_LIST:        pfnClipFunc = ClipPoints; break;
-        default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
+        case TOP_RECT_LIST:
+            pfnClipFunc = ClipRectangles;
+            break;
+        case TOP_TRIANGLE_STRIP:
+            pfnClipFunc = ClipTriangles;
+            break;
+        case TOP_LINE_STRIP:
+            pfnClipFunc = ClipLines;
+            break;
+        case TOP_POINT_LIST:
+            pfnClipFunc = ClipPoints;
+            break;
+        default:
+            SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
          }
      }
  
@@ -865,41 +935,65 @@ static void GeometryShaderStage(
      // foreach input prim:
      // - setup a new PA based on the emitted verts for that prim
      // - loop over the new verts, calling PA to assemble each prim
-    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
      uint32_t* pPrimitiveId = (uint32_t*)&primID;
  
      uint32_t totalPrimsGenerated = 0;
      for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
      {
-        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * bufferInfo.vertexPrimitiveStride;
-        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * bufferInfo.cutPrimitiveStride;
+        uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim];
+
+        // Vertex count is either emitted by shader or static
+        uint32_t vertexCount = 0;
+        if (pState->staticVertexCount)
+        {
+            vertexCount = pState->staticVertexCount;
+        }
+        else
+        {
+            // If emitted in shader, it should be the stored in the first dword of the output buffer
+            vertexCount = *(uint32_t*)pInstanceBase;
+        }
  
          for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
          {
-            uint32_t numEmittedVerts = pVertexCount[inputPrim];
+            uint32_t numEmittedVerts = vertexCount;
              if (numEmittedVerts == 0)
              {
                  continue;
              }
  
-            uint8_t* pBase = pInstanceBase + instance * bufferInfo.vertexInstanceStride;
-            uint8_t* pCutBase = pCutBufferBase + instance * bufferInfo.cutInstanceStride;
+            uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;
+            uint8_t* pCutBase =
+                pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;
+            uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;
+
+#if USE_SIMD16_FRONTEND
+            TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
+                                                          pVertexBaseAOS,
+                                                          vertexCount,
+                                                          pState->outputVertexSize);
+#else
+            TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
+                                                        pVertexBaseAOS,
+                                                        vertexCount,
+                                                        pState->outputVertexSize);
+#endif
  
              uint32_t numAttribs = state.feNumAttributes;
  
              for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
              {
-                bool processCutVerts = false;
-
-                uint8_t* pCutBuffer = pCutBase;
+                bool     processCutVerts = false;
+                uint8_t* pCutBuffer      = pCutBase;
  
                  // assign default stream ID, only relevant when GS is outputting a single stream
                  uint32_t streamID = 0;
                  if (pState->isSingleStream)
                  {
                      processCutVerts = true;
-                    streamID = pState->singleStreamID;
-                    if (streamID != stream) continue;
+                    streamID        = pState->singleStreamID;
+                    if (streamID != stream)
+                        continue;
                  }
                  else
                  {
@@ -910,16 +1004,35 @@ static void GeometryShaderStage(
                      }
  
                      // multi-stream output, need to translate StreamID buffer to a cut buffer
-                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
-                    pCutBuffer = (uint8_t*)pStreamCutBuffer;
+                    ProcessStreamIdBuffer(
+                        stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);
+                    pCutBuffer      = (uint8_t*)pGsBuffers->pStreamCutBuffer;
                      processCutVerts = false;
                  }
  
  #if USE_SIMD16_FRONTEND
-                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, SWR_VTX_NUM_SLOTS, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
+                PA_STATE_CUT gsPa(pDC,
+                                  (uint8_t*)pGsBuffers->pGsTransposed,
+                                  numEmittedVerts,
+                                  pState->outputVertexSize,
+                                  reinterpret_cast<simd16mask*>(pCutBuffer),
+                                  numEmittedVerts,
+                                  numAttribs,
+                                  pState->outputTopology,
+                                  processCutVerts,
+                                  pa.numVertsPerPrim);
  
  #else
-                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, SWR_VTX_NUM_SLOTS, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
+                PA_STATE_CUT gsPa(pDC,
+                                  (uint8_t*)pGsBuffers->pGsTransposed,
+                                  numEmittedVerts,
+                                  pState->outputVertexSize,
+                                  pCutBuffer,
+                                  numEmittedVerts,
+                                  numAttribs,
+                                  pState->outputTopology,
+                                  processCutVerts,
+                                  pa.numVertsPerPrim);
  
  #endif
                  while (gsPa.GetNextStreamOutput())
@@ -929,7 +1042,7 @@ static void GeometryShaderStage(
  #if USE_SIMD16_FRONTEND
                          simd16vector attrib_simd16[3];
  
-                        bool assemble = gsPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib_simd16);
+                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16);
  
  #else
                          bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
@@ -941,7 +1054,9 @@ static void GeometryShaderStage(
  
                              if (HasStreamOutT::value)
                              {
+#if ENABLE_AVX512_SIMD16
                                  gsPa.useAlternateOffset = false;
+#endif
                                  StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                              }
  
@@ -950,11 +1065,92 @@ static void GeometryShaderStage(
  #if USE_SIMD16_FRONTEND
                                  simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);
  
-                                gsPa.useAlternateOffset = false;
-                                pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId);
+                                // Gather data from the SVG if provided.
+                                simd16scalari vViewportIdx = SIMD16::setzero_si();
+                                simd16scalari vRtIdx       = SIMD16::setzero_si();
+                                SIMD16::Vec4  svgAttrib[4];
+
+                                if (state.backendState.readViewportArrayIndex ||
+                                    state.backendState.readRenderTargetArrayIndex)
+                                {
+                                    gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
+                                }
+
+                                if (state.backendState.readViewportArrayIndex)
+                                {
+                                    vViewportIdx =
+                                        SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
+                                    gsPa.viewportArrayActive = true;
+                                }
+                                if (state.backendState.readRenderTargetArrayIndex)
+                                {
+                                    vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
+                                    gsPa.rtArrayActive = true;
+                                }
+
+                                {
+                                    // OOB VPAI indices => forced to zero.
+                                    vViewportIdx =
+                                        SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
+                                    simd16scalari vNumViewports =
+                                        SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+                                    simd16scalari vClearMask =
+                                        SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
+                                    vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);
+
+                                    gsPa.useAlternateOffset = false;
+                                    pfnClipFunc(pDC,
+                                                gsPa,
+                                                workerId,
+                                                attrib_simd16,
+                                                GenMask(gsPa.NumPrims()),
+                                                vPrimId,
+                                                vViewportIdx,
+                                                vRtIdx);
+                                }
  #else
                                  simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
-                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId);
+
+                                // Gather data from the SVG if provided.
+                                simdscalari vViewportIdx = SIMD::setzero_si();
+                                simdscalari vRtIdx       = SIMD::setzero_si();
+                                SIMD::Vec4  svgAttrib[4];
+
+                                if (state.backendState.readViewportArrayIndex ||
+                                    state.backendState.readRenderTargetArrayIndex)
+                                {
+                                    gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
+                                }
+
+                                if (state.backendState.readViewportArrayIndex)
+                                {
+                                    vViewportIdx =
+                                        SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
+
+                                    // OOB VPAI indices => forced to zero.
+                                    vViewportIdx =
+                                        SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
+                                    simdscalari vNumViewports =
+                                        SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+                                    simdscalari vClearMask =
+                                        SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
+                                    vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
+                                    gsPa.viewportArrayActive = true;
+                                }
+                                if (state.backendState.readRenderTargetArrayIndex)
+                                {
+                                    vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
+                                    gsPa.rtArrayActive = true;
+                                }
+
+                                pfnClipFunc(pDC,
+                                            gsPa,
+                                            workerId,
+                                            attrib,
+                                            GenMask(gsPa.NumPrims()),
+                                            vPrimId,
+                                            vViewportIdx,
+                                            vRtIdx);
  #endif
                              }
                          }
@@ -967,8 +1163,8 @@ static void GeometryShaderStage(
      // update GS pipeline stats
      UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
      UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
-    AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
-    AR_END(FEGeometryShader, 1);
+    AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim * numInputPrims));
+    RDTSC_END(pDC->pContext->pBucketMgr, FEGeometryShader, 1);
  }
  
  //////////////////////////////////////////////////////////////////////////
@@ -977,42 +1173,45 @@ static void GeometryShaderStage(
  /// @param state - API state
  /// @param ppGsOut - pointer to GS output buffer allocation
  /// @param ppCutBuffer - pointer to GS output cut buffer allocation
-template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
-static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
-    void **ppStreamCutBuffer)
+template <typename SIMD_T, uint32_t SIMD_WIDTH>
+static INLINE void AllocateGsBuffers(DRAW_CONTEXT*    pDC,
+                                     const API_STATE& state,
+                                     uint32_t         vertsPerPrim,
+                                     GsBuffers*       pGsBuffers)
  {
      auto pArena = pDC->pArena;
      SWR_ASSERT(pArena != nullptr);
      SWR_ASSERT(state.gsState.gsEnable);
  
-    // allocate arena space to hold GS output verts
-    // @todo pack attribs
-    // @todo support multiple streams
+    const SWR_GS_STATE& gsState = state.gsState;
  
-    const GsBufferInfo<SIMDVERTEX, SIMD_WIDTH> bufferInfo(state.gsState);
+    // Allocate storage for vertex inputs
+    uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim;
+    pGsBuffers->pGsIn           = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);
  
-    const uint32_t vertexBufferSize = state.gsState.instanceCount * bufferInfo.vertexInstanceStride;
+    // Allocate arena space to hold GS output verts
+    const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize;
  
-    *ppGsOut = pArena->AllocAligned(vertexBufferSize, SIMD_WIDTH * sizeof(float));
+    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
+    {
+        pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32);
+    }
  
-    // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
-    // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
+    // Allocate storage for transposed GS output
+    uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;
+    uint32_t transposedBufferSize =
+        numSimdBatches * gsState.outputVertexSize * sizeof(Vec4<SIMD_T>);
+    pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);
  
-    // allocate space for temporary per-stream cut buffer if multi-stream is enabled
+    // Allocate storage to hold temporary stream->cut buffer, if necessary
      if (state.gsState.isSingleStream)
      {
-        const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
-
-        *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
-        *ppStreamCutBuffer = nullptr;
+        pGsBuffers->pStreamCutBuffer = nullptr;
      }
      else
      {
-        const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
-        const uint32_t streamCutBufferSize = state.gsState.instanceCount * bufferInfo.streamCutInstanceStride;
-
-        *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
-        *ppStreamCutBuffer = pArena->AllocAligned(streamCutBufferSize, SIMD_WIDTH * sizeof(float));
+        pGsBuffers->pStreamCutBuffer =
+            (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);
      }
  }
  
@@ -1022,12 +1221,14 @@ static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state,
  struct TessellationThreadLocalData
  {
      SWR_HS_CONTEXT hsContext;
-    ScalarPatch patchData[KNOB_SIMD_WIDTH];
-    void* pTxCtx;
-    size_t tsCtxSize;
+    void*          pTxCtx;
+    size_t         tsCtxSize;
+
+    uint8_t*    pHSOutput;
+    size_t      hsOutputAllocSize;
  
      simdscalar* pDSOutput;
-    size_t numDSOutputVectors;
+    size_t      dsOutputAllocSize;
  };
  
  THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
@@ -1040,9 +1241,9 @@ static void AllocateTessellationData(SWR_CONTEXT* pContext)
      /// @TODO - Don't use thread local storage.  Use Worker local storage instead.
      if (gt_pTessellationThreadData == nullptr)
      {
-        gt_pTessellationThreadData = (TessellationThreadLocalData*)
-            AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
-        memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
+        gt_pTessellationThreadData =
+            (TessellationThreadLocalData*)AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
+        memset((void*)gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
      }
  }
  
@@ -1052,44 +1253,37 @@ static void AllocateTessellationData(SWR_CONTEXT* pContext)
  /// @param workerId - thread's worker id. Even thread has a unique id.
  /// @param pa - The primitive assembly object.
  /// @param pGsOut - output stream for GS
-template <
-    typename HasGeometryShaderT,
-    typename HasStreamOutT,
-    typename HasRastT>
-static void TessellationStages(
-    DRAW_CONTEXT *pDC,
-    uint32_t workerId,
-    PA_STATE& pa,
-    void* pGsOut,
-    void* pCutBuffer,
-    void* pCutStreamBuffer,
-    uint32_t* pSoPrimData,
+template <typename HasGeometryShaderT, typename HasStreamOutT, typename HasRastT>
+static void TessellationStages(DRAW_CONTEXT* pDC,
+                               uint32_t      workerId,
+                               PA_STATE&     pa,
+                               GsBuffers*    pGsBuffers,
+                               uint32_t*     pSoPrimData,
  #if USE_SIMD16_FRONTEND
-    uint32_t numPrims_simd8,
+                               uint32_t numPrims_simd8,
  #endif
-    simdscalari primID)
+                               simdscalari const& primID)
  {
-    SWR_CONTEXT *pContext = pDC->pContext;
-    const API_STATE& state = GetApiState(pDC);
+    const API_STATE&    state   = GetApiState(pDC);
      const SWR_TS_STATE& tsState = state.tsState;
+    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
  
      SWR_ASSERT(gt_pTessellationThreadData);
  
-    HANDLE tsCtx = TSInitCtx(
-        tsState.domain,
-        tsState.partitioning,
-        tsState.tsOutputTopology,
-        gt_pTessellationThreadData->pTxCtx,
-        gt_pTessellationThreadData->tsCtxSize);
+    HANDLE tsCtx = TSInitCtx(tsState.domain,
+                             tsState.partitioning,
+                             tsState.tsOutputTopology,
+                             gt_pTessellationThreadData->pTxCtx,
+                             gt_pTessellationThreadData->tsCtxSize);
      if (tsCtx == nullptr)
      {
-        gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
-        tsCtx = TSInitCtx(
-            tsState.domain,
-            tsState.partitioning,
-            tsState.tsOutputTopology,
-            gt_pTessellationThreadData->pTxCtx,
-            gt_pTessellationThreadData->tsCtxSize);
+        gt_pTessellationThreadData->pTxCtx =
+            AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
+        tsCtx = TSInitCtx(tsState.domain,
+                          tsState.partitioning,
+                          tsState.tsOutputTopology,
+                          gt_pTessellationThreadData->pTxCtx,
+                          gt_pTessellationThreadData->tsCtxSize);
      }
      SWR_ASSERT(tsCtx);
  
@@ -1099,10 +1293,17 @@ static void TessellationStages(
      {
          switch (tsState.postDSTopology)
          {
-        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles_simd16; break;
-        case TOP_LINE_LIST:     pfnClipFunc = ClipLines_simd16; break;
-        case TOP_POINT_LIST:    pfnClipFunc = ClipPoints_simd16; break;
-        default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
+        case TOP_TRIANGLE_LIST:
+            pfnClipFunc = ClipTriangles_simd16;
+            break;
+        case TOP_LINE_LIST:
+            pfnClipFunc = ClipLines_simd16;
+            break;
+        case TOP_POINT_LIST:
+            pfnClipFunc = ClipPoints_simd16;
+            break;
+        default:
+            SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
          }
      }
  
@@ -1112,37 +1313,64 @@ static void TessellationStages(
      {
          switch (tsState.postDSTopology)
          {
-        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
-        case TOP_LINE_LIST:     pfnClipFunc = ClipLines; break;
-        case TOP_POINT_LIST:    pfnClipFunc = ClipPoints; break;
-        default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
+        case TOP_TRIANGLE_LIST:
+            pfnClipFunc = ClipTriangles;
+            break;
+        case TOP_LINE_LIST:
+            pfnClipFunc = ClipLines;
+            break;
+        case TOP_POINT_LIST:
+            pfnClipFunc = ClipPoints;
+            break;
+        default:
+            SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
          }
      }
  
  #endif
-    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
-    hsContext.pCPout = gt_pTessellationThreadData->patchData;
-    hsContext.PrimitiveID = primID;
+    SWR_HS_CONTEXT& hsContext       = gt_pTessellationThreadData->hsContext;
+    hsContext.PrimitiveID           = primID;
+    hsContext.outputSize = tsState.hsAllocationSize;
  
      uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
      // Max storage for one attribute for an entire simdprimitive
      simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];
  
+    // Assemble position separately
+    // TESS_TODO: this could be avoided - fix it
+    pa.Assemble(VERTEX_POSITION_SLOT, simdattrib);
+    for (uint32_t i = 0; i < numVertsPerPrim; ++i) {
+        hsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = simdattrib[i];
+    }
+
      // assemble all attributes for the input primitives
      for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
      {
-        uint32_t attribSlot = tsState.vertexAttribOffset + slot;
+        uint32_t attribSlot = tsState.srcVertexAttribOffset + slot;
          pa.Assemble(attribSlot, simdattrib);
  
          for (uint32_t i = 0; i < numVertsPerPrim; ++i)
          {
-            hsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = simdattrib[i];
+            hsContext.vert[i].attrib[tsState.vertexAttribOffset + slot] = simdattrib[i];
          }
      }
  
+    // Allocate HS output storage
+    uint32_t requiredAllocSize = KNOB_SIMD_WIDTH * tsState.hsAllocationSize;
+
+    if (requiredAllocSize > gt_pTessellationThreadData->hsOutputAllocSize)
+    {
+        AlignedFree(gt_pTessellationThreadData->pHSOutput);
+        gt_pTessellationThreadData->pHSOutput = (uint8_t*)AlignedMalloc(requiredAllocSize, 64);
+        gt_pTessellationThreadData->hsOutputAllocSize = requiredAllocSize;
+    }
+
+    hsContext.pCPout = (ScalarPatch*)gt_pTessellationThreadData->pHSOutput;
+
  #if defined(_DEBUG)
-    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
+    //memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
  #endif
+    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
  
  #if USE_SIMD16_FRONTEND
      uint32_t numPrims = numPrims_simd8;
@@ -1152,22 +1380,28 @@ static void TessellationStages(
      hsContext.mask = GenerateMask(numPrims);
  
      // Run the HS
-    AR_BEGIN(FEHullShader, pDC->drawId);
-    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
-    AR_END(FEHullShader, 0);
+    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEHullShader, pDC->drawId);
+    state.pfnHsFunc(GetPrivateState(pDC), pWorkerData, &hsContext);
+    RDTSC_END(pDC->pContext->pBucketMgr, FEHullShader, 0);
  
      UPDATE_STAT_FE(HsInvocations, numPrims);
+    AR_EVENT(HSStats((HANDLE)&hsContext.stats));
  
      const uint32_t* pPrimId = (const uint32_t*)&primID;
  
      for (uint32_t p = 0; p < numPrims; ++p)
      {
-        // Run Tessellator
-        SWR_TS_TESSELLATED_DATA tsData = { 0 };
-        AR_BEGIN(FETessellation, pDC->drawId);
-        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
+        ScalarPatch* pCPout = (ScalarPatch*)(gt_pTessellationThreadData->pHSOutput + tsState.hsAllocationSize * p);
+
+        SWR_TESSELLATION_FACTORS tessFactors;
+        tessFactors                    = hsContext.pCPout[p].tessFactors;
+
+          // Run Tessellator
+        SWR_TS_TESSELLATED_DATA tsData = {0};
+        RDTSC_BEGIN(pDC->pContext->pBucketMgr, FETessellation, pDC->drawId);
+        TSTessellate(tsCtx, tessFactors, tsData);
          AR_EVENT(TessPrimCount(1));
-        AR_END(FETessellation, 0);
+        RDTSC_END(pDC->pContext->pBucketMgr, FETessellation, 0);
  
          if (tsData.NumPrimitives == 0)
          {
@@ -1176,25 +1410,24 @@ static void TessellationStages(
          SWR_ASSERT(tsData.NumDomainPoints);
  
          // Allocate DS Output memory
-        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
-        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
+        uint32_t requiredDSVectorInvocations =
+            AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
  #if USE_SIMD16_FRONTEND
-        size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs;      // simd8 -> simd16, padding
+        size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) *
+                                   tsState.dsAllocationSize; // simd8 -> simd16, padding
  #else
-        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
+        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize;
+        size_t requiredAllocSize       = sizeof(simdvector) * requiredDSOutputVectors;
  #endif
-        if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
+        if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize)
          {
              AlignedFree(gt_pTessellationThreadData->pDSOutput);
-            gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
-#if USE_SIMD16_FRONTEND
-            gt_pTessellationThreadData->numDSOutputVectors = RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs; // simd8 -> simd16, padding
-#else
-            gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
-#endif
+            gt_pTessellationThreadData->pDSOutput =
+                (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
+            gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize;
          }
          SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
-        SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);
+        SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= requiredAllocSize);
  
  #if defined(_DEBUG)
          memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
@@ -1202,76 +1435,88 @@ static void TessellationStages(
  
          // Run Domain Shader
          SWR_DS_CONTEXT dsContext;
-        dsContext.PrimitiveID = pPrimId[p];
-        dsContext.pCpIn = &hsContext.pCPout[p];
-        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
-        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
-        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
+        dsContext.PrimitiveID           = pPrimId[p];
+        dsContext.pCpIn                 = pCPout;
+        dsContext.pDomainU              = (simdscalar*)tsData.pDomainPointsU;
+        dsContext.pDomainV              = (simdscalar*)tsData.pDomainPointsV;
+        dsContext.pOutputData           = gt_pTessellationThreadData->pDSOutput;
+        dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset;
  #if USE_SIMD16_FRONTEND
-        dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations);      // simd8 -> simd16
+        dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16
  #else
-        dsContext.vectorStride = requiredDSVectorInvocations;
+        dsContext.vectorStride         = requiredDSVectorInvocations;
  #endif
  
          uint32_t dsInvocations = 0;
  
-        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
+        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations;
+             ++dsContext.vectorOffset)
          {
              dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
  
-            AR_BEGIN(FEDomainShader, pDC->drawId);
-            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
-            AR_END(FEDomainShader, 0);
+            RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEDomainShader, pDC->drawId);
+            state.pfnDsFunc(GetPrivateState(pDC), pWorkerData, &dsContext);
+            RDTSC_END(pDC->pContext->pBucketMgr, FEDomainShader, 0);
+
+            AR_EVENT(DSStats((HANDLE)&dsContext.stats));
  
              dsInvocations += KNOB_SIMD_WIDTH;
          }
          UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
  
  #if USE_SIMD16_FRONTEND
-        SWR_ASSERT(IsEven(dsContext.vectorStride));                             // simd8 -> simd16
+        SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16
  
  #endif
          PA_TESS tessPa(
              pDC,
  #if USE_SIMD16_FRONTEND
-            reinterpret_cast<const simd16scalar *>(dsContext.pOutputData),      // simd8 -> simd16
-            dsContext.vectorStride / 2,                                         // simd8 -> simd16
+            reinterpret_cast<const simd16scalar*>(dsContext.pOutputData), // simd8 -> simd16
+            dsContext.vectorStride / 2,                                   // simd8 -> simd16
  #else
              dsContext.pOutputData,
              dsContext.vectorStride,
  #endif
              SWR_VTX_NUM_SLOTS,
-            tsState.numDsOutputAttribs,
+            tsState.numDsOutputAttribs + tsState.dsOutVtxAttribOffset,
              tsData.ppIndices,
              tsData.NumPrimitives,
-            tsState.postDSTopology);
+            tsState.postDSTopology,
+            NumVertsPerPrim(tsState.postDSTopology, false));
  
          while (tessPa.HasWork())
          {
  #if USE_SIMD16_FRONTEND
-            const uint32_t numPrims = tessPa.NumPrims();
+            const uint32_t numPrims    = tessPa.NumPrims();
              const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
-            const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
+            const uint32_t numPrims_hi =
+                std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
  
-            const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
-            const simdscalari primID_lo = _simd16_extract_si(primID, 0);
-            const simdscalari primID_hi = _simd16_extract_si(primID, 1);
+            const simd16scalari primID    = _simd16_set1_epi32(dsContext.PrimitiveID);
+            const simdscalari   primID_lo = _simd16_extract_si(primID, 0);
+            const simdscalari   primID_hi = _simd16_extract_si(primID, 1);
  
  #endif
              if (HasGeometryShaderT::value)
              {
  #if USE_SIMD16_FRONTEND
                  tessPa.useAlternateOffset = false;
-                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_lo, primID_lo);
+                GeometryShaderStage<HasStreamOutT, HasRastT>(
+                    pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);
  
                  if (numPrims_hi)
                  {
                      tessPa.useAlternateOffset = true;
-                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_hi, primID_hi);
+                    GeometryShaderStage<HasStreamOutT, HasRastT>(
+                        pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);
                  }
  #else
                  GeometryShaderStage<HasStreamOutT, HasRastT>(
-                    pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
+                    pDC,
+                    workerId,
+                    tessPa,
+                    pGsBuffers,
+                    pSoPrimData,
                      _simd_set1_epi32(dsContext.PrimitiveID));
  #endif
              }
@@ -1279,34 +1524,108 @@ static void TessellationStages(
              {
                  if (HasStreamOutT::value)
                  {
+#if ENABLE_AVX512_SIMD16
                      tessPa.useAlternateOffset = false;
+#endif
                      StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                  }
  
                  if (HasRastT::value)
                  {
  #if USE_SIMD16_FRONTEND
-                    simd16vector    prim_simd16[3]; // Only deal with triangles, lines, or points
+                    simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points
  #else
-                    simdvector      prim[3];        // Only deal with triangles, lines, or points
+                    simdvector prim[3]; // Only deal with triangles, lines, or points
  #endif
-                    AR_BEGIN(FEPAAssemble, pDC->drawId);
+                    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEPAAssemble, pDC->drawId);
                      bool assemble =
  #if USE_SIMD16_FRONTEND
-                        tessPa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16);
+                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
  #else
                          tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
  #endif
-                    AR_END(FEPAAssemble, 1);
+                    RDTSC_END(pDC->pContext->pBucketMgr, FEPAAssemble, 1);
                      SWR_ASSERT(assemble);
  
                      SWR_ASSERT(pfnClipFunc);
  #if USE_SIMD16_FRONTEND
-                    tessPa.useAlternateOffset = false;
-                    pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID);
+                    // Gather data from the SVG if provided.
+                    simd16scalari vViewportIdx = SIMD16::setzero_si();
+                    simd16scalari vRtIdx       = SIMD16::setzero_si();
+                    SIMD16::Vec4 svgAttrib[4] = {SIMD16::setzero_ps()};
+
+                    if (state.backendState.readViewportArrayIndex ||
+                        state.backendState.readRenderTargetArrayIndex)
+                    {
+                        tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
+                    }
+
+                    if (state.backendState.readViewportArrayIndex)
+                    {
+                        vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
+                        tessPa.viewportArrayActive = true;
+                    }
+                    if (state.backendState.readRenderTargetArrayIndex)
+                    {
+                        vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
+                        tessPa.rtArrayActive = true;
+                    }
+
+
+                    {
+                        // OOB VPAI indices => forced to zero.
+                        vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
+                        simd16scalari vNumViewports =
+                            SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+                        simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
+                        vViewportIdx             = SIMD16::and_si(vClearMask, vViewportIdx);
+
+                        tessPa.useAlternateOffset = false;
+                        pfnClipFunc(pDC,
+                                    tessPa,
+                                    workerId,
+                                    prim_simd16,
+                                    GenMask(numPrims),
+                                    primID,
+                                    vViewportIdx,
+                                    vRtIdx);
+                    }
  #else
-                    pfnClipFunc(pDC, tessPa, workerId, prim,
-                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID));
+                    // Gather data from the SGV if provided.
+                    simdscalari vViewportIdx = SIMD::setzero_si();
+                    simdscalari vRtIdx       = SIMD::setzero_si();
+                    SIMD::Vec4  svgAttrib[4];
+
+                    if (state.backendState.readViewportArrayIndex ||
+                        state.backendState.readRenderTargetArrayIndex)
+                    {
+                        tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
+                    }
+
+                    if (state.backendState.readViewportArrayIndex)
+                    {
+                        vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
+
+                        // OOB VPAI indices => forced to zero.
+                        vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
+                        simdscalari vNumViewports  = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+                        simdscalari vClearMask     = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
+                        vViewportIdx               = SIMD::and_si(vClearMask, vViewportIdx);
+                        tessPa.viewportArrayActive = true;
+                    }
+                    if (state.backendState.readRenderTargetArrayIndex)
+                    {
+                        vRtIdx               = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
+                        tessPa.rtArrayActive = true;
+                    }
+                    pfnClipFunc(pDC,
+                                tessPa,
+                                workerId,
+                                prim,
+                                GenMask(tessPa.NumPrims()),
+                                _simd_set1_epi32(dsContext.PrimitiveID),
+                                vViewportIdx,
+                                vRtIdx);
  #endif
                  }
              }
@@ -1314,7 +1633,7 @@ static void TessellationStages(
              tessPa.NextPrim();
  
          } // while (tessPa.HasWork())
-    } // for (uint32_t p = 0; p < numPrims; ++p)
+    }     // for (uint32_t p = 0; p < numPrims; ++p)
  
  #if USE_SIMD16_FRONTEND
      if (gt_pTessellationThreadData->pDSOutput != nullptr)
@@ -1322,14 +1641,14 @@ static void TessellationStages(
          AlignedFree(gt_pTessellationThreadData->pDSOutput);
          gt_pTessellationThreadData->pDSOutput = nullptr;
      }
-    gt_pTessellationThreadData->numDSOutputVectors = 0;
+    gt_pTessellationThreadData->dsOutputAllocSize = 0;
  
  #endif
      TSDestroyCtx(tsCtx);
  }
  
-THREAD PA_STATE::SIMDVERTEX *pVertexStore = nullptr;
-THREAD uint32_t gVertexStoreSize = 0;
+THREAD PA_STATE::SIMDVERTEX* gpVertexStore = nullptr;
+THREAD uint32_t gVertexStoreSize           = 0;
  
  //////////////////////////////////////////////////////////////////////////
  /// @brief FE handler for SwrDraw.
@@ -1342,20 +1661,14 @@ THREAD uint32_t gVertexStoreSize = 0;
  /// @param pDC - pointer to draw context.
  /// @param workerId - thread's worker id.
  /// @param pUserData - Pointer to DRAW_WORK
-template <
-    typename IsIndexedT,
-    typename IsCutIndexEnabledT,
-    typename HasTessellationT,
-    typename HasGeometryShaderT,
-    typename HasStreamOutT,
-    typename HasRastT>
-void ProcessDraw(
-    SWR_CONTEXT *pContext,
-    DRAW_CONTEXT *pDC,
-    uint32_t workerId,
-    void *pUserData)
+template <typename IsIndexedT,
+          typename IsCutIndexEnabledT,
+          typename HasTessellationT,
+          typename HasGeometryShaderT,
+          typename HasStreamOutT,
+          typename HasRastT>
+void ProcessDraw(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
  {
-
  #if KNOB_ENABLE_TOSS_POINTS
      if (KNOB_TOSS_QUEUE_FE)
      {
@@ -1363,36 +1676,34 @@ void ProcessDraw(
      }
  #endif
  
-    AR_BEGIN(FEProcessDraw, pDC->drawId);
+    RDTSC_BEGIN(pContext->pBucketMgr, FEProcessDraw, pDC->drawId);
  
-    DRAW_WORK&          work = *(DRAW_WORK*)pUserData;
-    const API_STATE&    state = GetApiState(pDC);
+    void* pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
+
+    DRAW_WORK&       work  = *(DRAW_WORK*)pUserData;
+    const API_STATE& state = GetApiState(pDC);
  
      uint32_t indexSize = 0;
      uint32_t endVertex = work.numVerts;
  
-    const int32_t* pLastRequestedIndex = nullptr;
+    gfxptr_t xpLastRequestedIndex = 0;
      if (IsIndexedT::value)
      {
          switch (work.type)
          {
          case R32_UINT:
              indexSize = sizeof(uint32_t);
-            pLastRequestedIndex = &(work.pIB[endVertex]);
              break;
          case R16_UINT:
              indexSize = sizeof(uint16_t);
-            // nasty address offset to last index
-            pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
              break;
          case R8_UINT:
              indexSize = sizeof(uint8_t);
-            // nasty address offset to last index
-            pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
              break;
          default:
              SWR_INVALID("Invalid work.type: %d", work.type);
          }
+        xpLastRequestedIndex = work.xpIB + endVertex * indexSize;
      }
      else
      {
@@ -1404,15 +1715,15 @@ void ProcessDraw(
      uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
  #endif
  
-    void* pGsOut = nullptr;
-    void* pCutBuffer = nullptr;
-    void* pStreamCutBuffer = nullptr;
+    GsBuffers gsBuffers;
      if (HasGeometryShaderT::value)
      {
  #if USE_SIMD16_FRONTEND
-        AllocateGsBuffers<simd16vertex, KNOB_SIMD16_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
+        AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(
+            pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
  #else
-        AllocateGsBuffers<simdvertex, KNOB_SIMD_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
+        AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(
+            pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
  #endif
      }
  
@@ -1442,50 +1753,69 @@ void ProcessDraw(
  #if USE_SIMD16_FRONTEND
      uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector);
  #else
-    uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector);
+    uint32_t          simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector);
  #endif
  
      SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);
  
      // Compute storage requirements for vertex store
      // TODO: allocation needs to be rethought for better cut support
-    uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine
+    uint32_t numVerts        = vertexCount + 2; // Need extra space for PA state machine
      uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes;
  
      // grow the vertex store for the PA as necessary
      if (gVertexStoreSize < vertexStoreSize)
      {
-        if (pVertexStore != nullptr)
+        if (gpVertexStore != nullptr)
          {
-            AlignedFree(pVertexStore);
+            AlignedFree(gpVertexStore);
+            gpVertexStore = nullptr;
          }
  
-        pVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(vertexStoreSize, 64));
+        SWR_ASSERT(gpVertexStore == nullptr);
+
+        gpVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX*>(AlignedMalloc(vertexStoreSize, 64));
          gVertexStoreSize = vertexStoreSize;
  
-        SWR_ASSERT(pVertexStore != nullptr);
+        SWR_ASSERT(gpVertexStore != nullptr);
      }
  
      // choose primitive assembler
-    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, pVertexStore, numVerts, state.frontendState.vsVertexSize);
-    PA_STATE& pa = paFactory.GetPA();
+
+    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC,
+                                                         state.topology,
+                                                         work.numVerts,
+                                                         gpVertexStore,
+                                                         numVerts,
+                                                         state.frontendState.vsVertexSize,
+                                                         GetNumVerts(state.topology, 1));
+    PA_STATE&                                  pa = paFactory.GetPA();
  
  #if USE_SIMD16_FRONTEND
-    simdvertex          vin_lo;
-    simdvertex          vin_hi;
-    SWR_VS_CONTEXT      vsContext_lo;
-    SWR_VS_CONTEXT      vsContext_hi;
+#if USE_SIMD16_SHADERS
+    simd16vertex vin;
+#else
+    simdvertex vin_lo;
+    simdvertex vin_hi;
+#endif
+    SWR_VS_CONTEXT vsContext_lo;
+    SWR_VS_CONTEXT vsContext_hi;
  
+#if USE_SIMD16_SHADERS
+    vsContext_lo.pVin = reinterpret_cast<simdvertex*>(&vin);
+    vsContext_hi.pVin = reinterpret_cast<simdvertex*>(&vin);
+#else
      vsContext_lo.pVin = &vin_lo;
      vsContext_hi.pVin = &vin_hi;
+#endif
      vsContext_lo.AlternateOffset = 0;
      vsContext_hi.AlternateOffset = 1;
  
-    SWR_FETCH_CONTEXT   fetchInfo_lo = { 0 };
+    SWR_FETCH_CONTEXT fetchInfo_lo = {0};
  
-    fetchInfo_lo.pStreams = &state.vertexBuffers[0];
+    fetchInfo_lo.pStreams      = &state.vertexBuffers[0];
      fetchInfo_lo.StartInstance = work.startInstance;
-    fetchInfo_lo.StartVertex = 0;
+    fetchInfo_lo.StartVertex   = 0;
  
      if (IsIndexedT::value)
      {
@@ -1493,10 +1823,10 @@ void ProcessDraw(
  
          // if the entire index buffer isn't being consumed, set the last index
          // so that fetches < a SIMD wide will be masked off
-        fetchInfo_lo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
-        if (pLastRequestedIndex < fetchInfo_lo.pLastIndex)
+        fetchInfo_lo.xpLastIndex = state.indexBuffer.xpIndices + state.indexBuffer.size;
+        if (xpLastRequestedIndex < fetchInfo_lo.xpLastIndex)
          {
-            fetchInfo_lo.pLastIndex = pLastRequestedIndex;
+            fetchInfo_lo.xpLastIndex = xpLastRequestedIndex;
          }
      }
      else
@@ -1504,27 +1834,31 @@ void ProcessDraw(
          fetchInfo_lo.StartVertex = work.startVertex;
      }
  
-    SWR_FETCH_CONTEXT   fetchInfo_hi = fetchInfo_lo;
+    SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;
  
-    const simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    const simd16scalari vScale =
+        _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
  
      for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
      {
-        uint32_t  i = 0;
+        uint32_t i = 0;
  
          simd16scalari vIndex;
  
          if (IsIndexedT::value)
          {
-            fetchInfo_lo.pIndices = work.pIB;
-            fetchInfo_hi.pIndices = (int32_t *)((uint8_t *)fetchInfo_lo.pIndices + KNOB_SIMD_WIDTH * indexSize);    // 1/2 of KNOB_SIMD16_WIDTH
+            fetchInfo_lo.xpIndices = work.xpIB;
+            fetchInfo_hi.xpIndices =
+                fetchInfo_lo.xpIndices + KNOB_SIMD_WIDTH * indexSize; // 1/2 of KNOB_SIMD16_WIDTH
          }
          else
          {
              vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);
  
-            fetchInfo_lo.pIndices = (const int32_t *)&vIndex;
-            fetchInfo_hi.pIndices = (const int32_t *)&vIndex + KNOB_SIMD_WIDTH; // 1/2 of KNOB_SIMD16_WIDTH
+            fetchInfo_lo.xpIndices = pDC->pContext->pfnMakeGfxPtr(GetPrivateState(pDC), &vIndex);
+            fetchInfo_hi.xpIndices = pDC->pContext->pfnMakeGfxPtr(
+                GetPrivateState(pDC),
+                &vIndex + KNOB_SIMD_WIDTH * sizeof(int32_t)); // 1/2 of KNOB_SIMD16_WIDTH
          }
  
          fetchInfo_lo.CurInstance = instanceNum;
@@ -1535,50 +1869,92 @@ void ProcessDraw(
  
          while (pa.HasWork())
          {
-            // GetNextVsOutput currently has the side effect of updating some PA state machine state.
-            // So we need to keep this outside of (i < endVertex) check.
+            // GetNextVsOutput currently has the side effect of updating some PA state machine
+            // state. So we need to keep this outside of (i < endVertex) check.
  
-            simdmask *pvCutIndices_lo = nullptr;
-            simdmask *pvCutIndices_hi = nullptr;
+            simdmask* pvCutIndices_lo = nullptr;
+            simdmask* pvCutIndices_hi = nullptr;
  
              if (IsIndexedT::value)
              {
                  // simd16mask <=> simdmask[2]
  
-                pvCutIndices_lo = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[0];
-                pvCutIndices_hi = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[1];
+                pvCutIndices_lo = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[0];
+                pvCutIndices_hi = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[1];
              }
  
-            simd16vertex &vout = pa.GetNextVsOutput();
+            simd16vertex& vout = pa.GetNextVsOutput();
  
-            vsContext_lo.pVout = reinterpret_cast<simdvertex *>(&vout);
-            vsContext_hi.pVout = reinterpret_cast<simdvertex *>(&vout);
+            vsContext_lo.pVout = reinterpret_cast<simdvertex*>(&vout);
+            vsContext_hi.pVout = reinterpret_cast<simdvertex*>(&vout);
  
              if (i < endVertex)
              {
+                if (!IsIndexedT::value)
+                {
+                    fetchInfo_lo.xpLastIndex = fetchInfo_lo.xpIndices;
+                    uint32_t offset;
+                    offset = std::min(endVertex - i, (uint32_t)KNOB_SIMD16_WIDTH);
+                    offset *= 4; // convert from index to address
+#if USE_SIMD16_SHADERS
+                    fetchInfo_lo.xpLastIndex += offset;
+#else
+                    fetchInfo_lo.xpLastIndex += std::min(offset, (uint32_t)KNOB_SIMD_WIDTH);
+                    uint32_t offset2 =
+                        std::min(offset, (uint32_t)KNOB_SIMD16_WIDTH) - KNOB_SIMD_WIDTH;
+                    assert(offset >= 0);
+                    fetchInfo_hi.xpLastIndex = fetchInfo_hi.xpIndices;
+                    fetchInfo_hi.xpLastIndex += offset2;
+#endif
+                }
                  // 1. Execute FS/VS for a single SIMD.
-                AR_BEGIN(FEFetchShader, pDC->drawId);
-                state.pfnFetchFunc(fetchInfo_lo, vin_lo);
+                RDTSC_BEGIN(pContext->pBucketMgr, FEFetchShader, pDC->drawId);
+#if USE_SIMD16_SHADERS
+                state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin);
+#else
+                state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin_lo);
  
-                if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of KNOB_SIMD16_WIDTH
+                if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
                  {
-                    state.pfnFetchFunc(fetchInfo_hi, vin_hi);
+                    state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_hi, vin_hi);
                  }
-                AR_END(FEFetchShader, 0);
+#endif
+                RDTSC_END(pContext->pBucketMgr, FEFetchShader, 0);
  
                  // forward fetch generated vertex IDs to the vertex shader
+#if USE_SIMD16_SHADERS
+#if USE_SIMD16_VS
+                vsContext_lo.VertexID16 =
+                    _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0);
+                vsContext_lo.VertexID16 =
+                    _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1);
+#else
+                vsContext_lo.VertexID = fetchInfo_lo.VertexID;
+                vsContext_hi.VertexID = fetchInfo_lo.VertexID2;
+#endif
+#else
                  vsContext_lo.VertexID = fetchInfo_lo.VertexID;
                  vsContext_hi.VertexID = fetchInfo_hi.VertexID;
+#endif
  
                  // Setup active mask for vertex shader.
-                vsContext_lo.mask = GenerateMask(endVertex - i);
-                vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
+#if USE_SIMD16_VS
+                vsContext_lo.mask16 = GenerateMask16(endVertex - i);
+#else
+                vsContext_lo.mask     = GenerateMask(endVertex - i);
+                vsContext_hi.mask     = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
+#endif
  
                  // forward cut mask to the PA
                  if (IsIndexedT::value)
                  {
+#if USE_SIMD16_SHADERS
+                    *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
+                    *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask2));
+#else
                      *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
                      *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
+#endif
                  }
  
                  UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
@@ -1587,14 +1963,21 @@ void ProcessDraw(
                  if (!KNOB_TOSS_FETCH)
  #endif
                  {
-                    AR_BEGIN(FEVertexShader, pDC->drawId);
-                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);
+                    RDTSC_BEGIN(pContext->pBucketMgr, FEVertexShader, pDC->drawId);
+#if USE_SIMD16_VS
+                    state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
+                    AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));
+#else
+                    state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
+                    AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));
  
-                    if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of KNOB_SIMD16_WIDTH
+                    if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
                      {
-                        state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi);
+                        state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_hi);
+                        AR_EVENT(VSStats((HANDLE)&vsContext_hi.stats));
                      }
-                    AR_END(FEVertexShader, 0);
+#endif
+                    RDTSC_END(pContext->pBucketMgr, FEVertexShader, 0);
  
                      UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                  }
@@ -1605,9 +1988,9 @@ void ProcessDraw(
              {
                  simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];
  
-                RDTSC_START(FEPAAssemble);
-                bool assemble = pa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16);
-                RDTSC_STOP(FEPAAssemble, 1, 0);
+                RDTSC_START(pContext->pBucketMgr, FEPAAssemble);
+                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
+                RDTSC_STOP(pContext->pBucketMgr, FEPAAssemble, 1, 0);
  
  #if KNOB_ENABLE_TOSS_POINTS
                  if (!KNOB_TOSS_FETCH)
@@ -1622,33 +2005,61 @@ void ProcessDraw(
                              UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
  
                              const uint32_t numPrims = pa.NumPrims();
-                            const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
-                            const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
+                            const uint32_t numPrims_lo =
+                                std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
+                            const uint32_t numPrims_hi =
+                                std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
  
-                            const simd16scalari primID = pa.GetPrimID(work.startPrimID);
-                            const simdscalari primID_lo = _simd16_extract_si(primID, 0);
-                            const simdscalari primID_hi = _simd16_extract_si(primID, 1);
+                            const simd16scalari primID    = pa.GetPrimID(work.startPrimID);
+                            const simdscalari   primID_lo = _simd16_extract_si(primID, 0);
+                            const simdscalari   primID_hi = _simd16_extract_si(primID, 1);
  
                              if (HasTessellationT::value)
                              {
                                  pa.useAlternateOffset = false;
-                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);
+                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
+                                    pDC,
+                                    workerId,
+                                    pa,
+                                    &gsBuffers,
+                                    pSoPrimData,
+                                    numPrims_lo,
+                                    primID_lo);
  
                                  if (numPrims_hi)
                                  {
                                      pa.useAlternateOffset = true;
-                                    TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
+                                    TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
+                                        pDC,
+                                        workerId,
+                                        pa,
+                                        &gsBuffers,
+                                        pSoPrimData,
+                                        numPrims_hi,
+                                        primID_hi);
                                  }
                              }
                              else if (HasGeometryShaderT::value)
                              {
                                  pa.useAlternateOffset = false;
-                                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);
+                                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
+                                                                             workerId,
+                                                                             pa,
+                                                                             &gsBuffers,
+                                                                             pSoPrimData,
+                                                                             numPrims_lo,
+                                                                             primID_lo);
  
                                  if (numPrims_hi)
                                  {
                                      pa.useAlternateOffset = true;
-                                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
+                                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
+                                                                                 workerId,
+                                                                                 pa,
+                                                                                 &gsBuffers,
+                                                                                 pSoPrimData,
+                                                                                 numPrims_hi,
+                                                                                 primID_hi);
                                  }
                              }
                              else
@@ -1663,9 +2074,48 @@ void ProcessDraw(
                                  if (HasRastT::value)
                                  {
                                      SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
-
-                                    pa.useAlternateOffset = false;
-                                    pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID);
+                                    // Gather data from the SVG if provided.
+                                    simd16scalari vpai = SIMD16::setzero_si();
+                                    simd16scalari rtai = SIMD16::setzero_si();
+                                    SIMD16::Vec4  svgAttrib[4];
+
+                                    if (state.backendState.readViewportArrayIndex ||
+                                        state.backendState.readRenderTargetArrayIndex)
+                                    {
+                                        pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
+                                    }
+
+                                    if (state.backendState.readViewportArrayIndex)
+                                    {
+                                        vpai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
+                                        pa.viewportArrayActive = true;
+                                    }
+                                    if (state.backendState.readRenderTargetArrayIndex)
+                                    {
+                                        rtai =
+                                            SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
+                                        pa.rtArrayActive = true;
+                                    }
+
+                                    {
+                                        // OOB VPAI indices => forced to zero.
+                                        vpai = SIMD16::max_epi32(vpai, SIMD16::setzero_si());
+                                        simd16scalari vNumViewports =
+                                            SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+                                        simd16scalari vClearMask =
+                                            SIMD16::cmplt_epi32(vpai, vNumViewports);
+                                        vpai = SIMD16::and_si(vClearMask, vpai);
+
+                                        pa.useAlternateOffset = false;
+                                        pDC->pState->pfnProcessPrims_simd16(pDC,
+                                                                            pa,
+                                                                            workerId,
+                                                                            prim_simd16,
+                                                                            GenMask(numPrims),
+                                                                            primID,
+                                                                            vpai,
+                                                                            rtai);
+                                    }
                                  }
                              }
                          }
@@ -1675,8 +2125,8 @@ void ProcessDraw(
  
              if (IsIndexedT::value)
              {
-                fetchInfo_lo.pIndices = (int32_t *)((uint8_t*)fetchInfo_lo.pIndices + KNOB_SIMD16_WIDTH * indexSize);
-                fetchInfo_hi.pIndices = (int32_t *)((uint8_t*)fetchInfo_hi.pIndices + KNOB_SIMD16_WIDTH * indexSize);
+                fetchInfo_lo.xpIndices = fetchInfo_lo.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
+                fetchInfo_hi.xpIndices = fetchInfo_hi.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
              }
              else
              {
@@ -1690,12 +2140,12 @@ void ProcessDraw(
      }
  
  #else
-    SWR_VS_CONTEXT      vsContext;
-    SWR_FETCH_CONTEXT   fetchInfo = { 0 };
+    SWR_VS_CONTEXT    vsContext;
+    SWR_FETCH_CONTEXT fetchInfo = {0};
  
-    fetchInfo.pStreams = &state.vertexBuffers[0];
+    fetchInfo.pStreams      = &state.vertexBuffers[0];
      fetchInfo.StartInstance = work.startInstance;
-    fetchInfo.StartVertex = 0;
+    fetchInfo.StartVertex   = 0;
  
      if (IsIndexedT::value)
      {
@@ -1703,10 +2153,11 @@ void ProcessDraw(
  
          // if the entire index buffer isn't being consumed, set the last index
          // so that fetches < a SIMD wide will be masked off
-        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
-        if (pLastRequestedIndex < fetchInfo.pLastIndex)
+        fetchInfo.pLastIndex =
+            (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
+        if (xpLastRequestedIndex < fetchInfo.pLastIndex)
          {
-            fetchInfo.pLastIndex = pLastRequestedIndex;
+            fetchInfo.pLastIndex = xpLastRequestedIndex;
          }
      }
      else
@@ -1714,13 +2165,13 @@ void ProcessDraw(
          fetchInfo.StartVertex = work.startVertex;
      }
  
-    const simdscalari   vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+    const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  
      /// @todo: temporarily move instance loop in the FE to ensure SO ordering
      for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
      {
          simdscalari vIndex;
-        uint32_t  i = 0;
+        uint32_t    i = 0;
  
          if (IsIndexedT::value)
          {
@@ -1728,17 +2179,17 @@ void ProcessDraw(
          }
          else
          {
-            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
+            vIndex             = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
              fetchInfo.pIndices = (const int32_t*)&vIndex;
          }
  
          fetchInfo.CurInstance = instanceNum;
-        vsContext.InstanceID = instanceNum;
+        vsContext.InstanceID  = instanceNum;
  
          while (pa.HasWork())
          {
-            // GetNextVsOutput currently has the side effect of updating some PA state machine state.
-            // So we need to keep this outside of (i < endVertex) check.
+            // GetNextVsOutput currently has the side effect of updating some PA state machine
+            // state. So we need to keep this outside of (i < endVertex) check.
              simdmask* pvCutIndices = nullptr;
              if (IsIndexedT::value)
              {
@@ -1746,16 +2197,15 @@ void ProcessDraw(
              }
  
              simdvertex& vout = pa.GetNextVsOutput();
-            vsContext.pVin = &vout;
-            vsContext.pVout = &vout;
+            vsContext.pVin   = &vout;
+            vsContext.pVout  = &vout;
  
              if (i < endVertex)
              {
-
                  // 1. Execute FS/VS for a single SIMD.
-                AR_BEGIN(FEFetchShader, pDC->drawId);
-                state.pfnFetchFunc(fetchInfo, vout);
-                AR_END(FEFetchShader, 0);
+                RDTSC_BEGIN(pContext->pBucketMgr, FEFetchShader, pDC->drawId);
+                state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo, vout);
+                RDTSC_END(pContext->pBucketMgr, FEFetchShader, 0);
  
                  // forward fetch generated vertex IDs to the vertex shader
                  vsContext.VertexID = fetchInfo.VertexID;
@@ -1775,11 +2225,12 @@ void ProcessDraw(
                  if (!KNOB_TOSS_FETCH)
  #endif
                  {
-                    AR_BEGIN(FEVertexShader, pDC->drawId);
-                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
-                    AR_END(FEVertexShader, 0);
+                    RDTSC_BEGIN(pContext->pBucketMgr, FEVertexShader, pDC->drawId);
+                    state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext);
+                    RDTSC_END(pContext->pBucketMgr, FEVertexShader, 0);
  
                      UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
+                    AR_EVENT(VSStats((HANDLE)&vsContext.stats));
                  }
              }
  
@@ -1788,9 +2239,9 @@ void ProcessDraw(
              {
                  simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                  // PaAssemble returns false if there is not enough verts to assemble.
-                AR_BEGIN(FEPAAssemble, pDC->drawId);
+                RDTSC_BEGIN(pContext->pBucketMgr, FEPAAssemble, pDC->drawId);
                  bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
-                AR_END(FEPAAssemble, 1);
+                RDTSC_END(pContext->pBucketMgr, FEPAAssemble, 1);
  
  #if KNOB_ENABLE_TOSS_POINTS
                  if (!KNOB_TOSS_FETCH)
@@ -1807,12 +2258,22 @@ void ProcessDraw(
                              if (HasTessellationT::value)
                              {
                                  TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
-                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
+                                    pDC,
+                                    workerId,
+                                    pa,
+                                    &gsBuffers,
+                                    pSoPrimData,
+                                    pa.GetPrimID(work.startPrimID));
                              }
                              else if (HasGeometryShaderT::value)
                              {
                                  GeometryShaderStage<HasStreamOutT, HasRastT>(
-                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
+                                    pDC,
+                                    workerId,
+                                    pa,
+                                    &gsBuffers,
+                                    pSoPrimData,
+                                    pa.GetPrimID(work.startPrimID));
                              }
                              else
                              {
@@ -1826,8 +2287,47 @@ void ProcessDraw(
                                  {
                                      SWR_ASSERT(pDC->pState->pfnProcessPrims);
  
-                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
-                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID));
+                                    // Gather data from the SVG if provided.
+                                    simdscalari vViewportIdx = SIMD::setzero_si();
+                                    simdscalari vRtIdx       = SIMD::setzero_si();
+                                    SIMD::Vec4  svgAttrib[4];
+
+                                    if (state.backendState.readViewportArrayIndex ||
+                                        state.backendState.readRenderTargetArrayIndex)
+                                    {
+                                        pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
+                                    }
+
+                                    if (state.backendState.readViewportArrayIndex)
+                                    {
+                                        vViewportIdx =
+                                            SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
+
+                                        // OOB VPAI indices => forced to zero.
+                                        vViewportIdx =
+                                            SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
+                                        simdscalari vNumViewports =
+                                            SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+                                        simdscalari vClearMask =
+                                            SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
+                                        vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
+                                        pa.viewportArrayActive = true;
+                                    }
+                                    if (state.backendState.readRenderTargetArrayIndex)
+                                    {
+                                        vRtIdx =
+                                            SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
+                                        pa.rtArrayActive = true;
+                                    }
+
+                                    pDC->pState->pfnProcessPrims(pDC,
+                                                                 pa,
+                                                                 workerId,
+                                                                 prim,
+                                                                 GenMask(pa.NumPrims()),
+                                                                 pa.GetPrimID(work.startPrimID),
+                                                                 vViewportIdx,
+                                                                 vRtIdx);
                                  }
                              }
                          }
@@ -1837,7 +2337,8 @@ void ProcessDraw(
  
              if (IsIndexedT::value)
              {
-                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
+                fetchInfo.pIndices =
+                    (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
              }
              else
              {
@@ -1851,7 +2352,7 @@ void ProcessDraw(
  
  #endif
  
-    AR_END(FEProcessDraw, numPrims * work.numInstances);
+    RDTSC_END(pContext->pBucketMgr, FEProcessDraw, numPrims * work.numInstances);
  }
  
  struct FEDrawChooser
@@ -1865,15 +2366,18 @@ struct FEDrawChooser
      }
  };
  
-
  // Selector for correct templated Draw front-end function
-PFN_FE_WORK_FUNC GetProcessDrawFunc(
-    bool IsIndexed,
-    bool IsCutIndexEnabled,
-    bool HasTessellation,
-    bool HasGeometryShader,
-    bool HasStreamOut,
-    bool HasRasterization)
+PFN_FE_WORK_FUNC GetProcessDrawFunc(bool IsIndexed,
+                                    bool IsCutIndexEnabled,
+                                    bool HasTessellation,
+                                    bool HasGeometryShader,
+                                    bool HasStreamOut,
+                                    bool HasRasterization)
  {
-    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
+    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed,
+                                                       IsCutIndexEnabled,
+                                                       HasTessellation,
+                                                       HasGeometryShader,
+                                                       HasStreamOut,
+                                                       HasRasterization);
  }