/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file pa.h
-*
-* @brief Definitions for primitive assembly.
-* N primitives are assembled at a time, where N is the SIMD width.
-* A state machine, that is specific for a given topology, drives the
-* assembly of vertices into triangles.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file pa.h
+ *
+ * @brief Definitions for primitive assembly.
+ * N primitives are assembled at a time, where N is the SIMD width.
+ * A state machine that is specific to a given topology drives the
+ * assembly of vertices into triangles.
+ *
+ ******************************************************************************/
#pragma once
#include "frontend.h"
struct PA_STATE
{
- DRAW_CONTEXT *pDC; // draw context
- uint8_t* pStreamBase; // vertex stream
- uint32_t streamSizeInVerts; // total size of the input stream in verts
+#if USE_SIMD16_FRONTEND
+ enum
+ {
+ SIMD_WIDTH = KNOB_SIMD16_WIDTH,
+ SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2,
+ SIMD_WIDTH_LOG2 = 4
+ };
+
+ typedef simd16mask SIMDMASK;
+
+ typedef simd16scalar SIMDSCALAR;
+ typedef simd16vector SIMDVECTOR;
+ typedef simd16vertex SIMDVERTEX;
+
+ typedef simd16scalari SIMDSCALARI;
+
+#else
+ enum
+ {
+ SIMD_WIDTH = KNOB_SIMD_WIDTH,
+ SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2,
+ SIMD_WIDTH_LOG2 = 3
+ };
+
+ typedef simdmask SIMDMASK;
+
+ typedef simdscalar SIMDSCALAR;
+ typedef simdvector SIMDVECTOR;
+ typedef simdvertex SIMDVERTEX;
+
+ typedef simdscalari SIMDSCALARI;
- // The topology the binner will use. In some cases the FE changes the topology from the api state.
- PRIMITIVE_TOPOLOGY binTopology;
+#endif
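+
+    // With USE_SIMD16_FRONTEND defined, the PA state and interface below use the
+    // 16-wide simd16 types; otherwise the SIMD* aliases collapse to the native
+    // KNOB_SIMD_WIDTH-wide simd types, so the same code serves both configurations.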
+ DRAW_CONTEXT* pDC{nullptr}; // draw context
+ uint8_t* pStreamBase{nullptr}; // vertex stream
+ uint32_t streamSizeInVerts{0}; // total size of the input stream in verts
+ uint32_t vertexStride{0}; // stride of a vertex in simdvector units
+
+    // The topology the binner will use. In some cases the FE changes the topology from the API
+    // state.
+ PRIMITIVE_TOPOLOGY binTopology{TOP_UNKNOWN};
+
+#if ENABLE_AVX512_SIMD16
+ bool useAlternateOffset{false};
+#endif
+
+ bool viewportArrayActive{false};
+ bool rtArrayActive{false};
+ uint32_t numVertsPerPrim{0};
PA_STATE() {}
- PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) :
- pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts) {}
+ PA_STATE(DRAW_CONTEXT* in_pDC,
+ uint8_t* in_pStreamBase,
+ uint32_t in_streamSizeInVerts,
+ uint32_t in_vertexStride,
+ uint32_t in_numVertsPerPrim) :
+ pDC(in_pDC),
+ pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts),
+ vertexStride(in_vertexStride), numVertsPerPrim(in_numVertsPerPrim)
+ {
+ }
- virtual bool HasWork() = 0;
+ virtual bool HasWork() = 0;
virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
+#if ENABLE_AVX512_SIMD16
+ virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0;
+#endif
virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
- virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
- virtual bool NextPrim() = 0;
- virtual simdvertex& GetNextVsOutput() = 0;
- virtual bool GetNextStreamOutput() = 0;
- virtual simdmask& GetNextVsIndices() = 0;
- virtual uint32_t NumPrims() = 0;
- virtual void Reset() = 0;
- virtual simdscalari GetPrimID(uint32_t startID) = 0;
+#if ENABLE_AVX512_SIMD16
+ virtual bool Assemble(uint32_t slot, simd16vector verts[]) = 0;
+#endif
+ virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;
+ virtual bool NextPrim() = 0;
+ virtual SIMDVERTEX& GetNextVsOutput() = 0;
+ virtual bool GetNextStreamOutput() = 0;
+ virtual SIMDMASK& GetNextVsIndices() = 0;
+ virtual uint32_t NumPrims() = 0;
+ virtual void Reset() = 0;
+ virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0;
};
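+
+// Illustrative usage sketch (not part of the interface): once a derived PA has been
+// fed vertices through GetNextVsOutput(), a frontend loop drives it roughly as
+// below. Slot 0 and the local names are assumptions for the example only.
+//
+//     while (pa.HasWork())
+//     {
+//         simdvector prim[MAX_NUM_VERTS_PER_PRIM];
+//         if (pa.Assemble(0, prim))
+//         {
+//             // bin pa.NumPrims() prims; their ids come from pa.GetPrimID(startId)
+//         }
+//         pa.NextPrim();
+//     }
+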
// The Optimized PA is a state machine that assembles triangles from vertex shader simd
// 1. We call this the current and previous simd vertex.
// 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
// order to assemble the second triangle, for a triangle list, we'll need the
-// last vertex from the previous simd and the first 2 vertices from the current simd.
+// last vertex from the previous simd and the first 2 vertices from the current
+// simd.
// 3. At times the PA can assemble multiple triangles from the 2 simd vertices.
//
// This optimized PA is not cut aware, so it should only be used for non-indexed draws or draws
// without cuts
struct PA_STATE_OPT : public PA_STATE
{
- simdvertex leadingVertex; // For tri-fan
- uint32_t numPrims; // Total number of primitives for draw.
- uint32_t numPrimsComplete; // Total number of complete primitives.
-
- uint32_t numSimdPrims; // Number of prims in current simd.
+ uint32_t numPrims{0}; // Total number of primitives for draw.
+ uint32_t numPrimsComplete{0}; // Total number of complete primitives.
- uint32_t cur; // index to current VS output.
- uint32_t prev; // index to prev VS output. Not really needed in the state.
- uint32_t first; // index to first VS output. Used for trifan.
+ uint32_t numSimdPrims{0}; // Number of prims in current simd.
- uint32_t counter; // state counter
- bool reset; // reset state
+ uint32_t cur{0}; // index to current VS output.
+ uint32_t prev{0}; // index to prev VS output. Not really needed in the state.
+ const uint32_t first{0}; // index to first VS output. Used for tri fan and line loop.
- uint32_t primIDIncr; // how much to increment for each vector (typically vector / {1, 2})
- simdscalari primID;
+ uint32_t counter{0}; // state counter
+ bool reset{false}; // reset state
- typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]);
- typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+ uint32_t primIDIncr{0}; // how much to increment for each vector (typically vector / {1, 2})
+ SIMDSCALARI primID;
- PFN_PA_FUNC pfnPaFunc; // PA state machine function for assembling 4 triangles.
- PFN_PA_SINGLE_FUNC pfnPaSingleFunc; // PA state machine function for assembling single triangle.
- PFN_PA_FUNC pfnPaFuncReset; // initial state to set on reset
+ typedef bool (*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
+#if ENABLE_AVX512_SIMD16
+ typedef bool (*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
+#endif
+ typedef void (*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa,
+ uint32_t slot,
+ uint32_t primIndex,
+ simd4scalar verts[]);
+
+ PFN_PA_FUNC pfnPaFunc{nullptr}; // PA state machine function for assembling 4 triangles.
+#if ENABLE_AVX512_SIMD16
+ PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{nullptr};
+#endif
+    PFN_PA_SINGLE_FUNC pfnPaSingleFunc{nullptr}; // PA state machine function for assembling a single triangle.
+ PFN_PA_FUNC pfnPaFuncReset{nullptr}; // initial state to set on reset
+#if ENABLE_AVX512_SIMD16
+ PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{nullptr};
+#endif
// state used to advance the PA when Next is called
- PFN_PA_FUNC pfnPaNextFunc;
- uint32_t nextNumSimdPrims;
- uint32_t nextNumPrimsIncrement;
- bool nextReset;
- bool isStreaming;
-
- simdmask tmpIndices; // temporary index store for unused virtual function
-
+ PFN_PA_FUNC pfnPaNextFunc{nullptr};
+#if ENABLE_AVX512_SIMD16
+ PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{nullptr};
+#endif
+ uint32_t nextNumSimdPrims{0};
+ uint32_t nextNumPrimsIncrement{0};
+ bool nextReset{false};
+ bool isStreaming{false};
+
+ SIMDMASK junkIndices{0}; // temporary index store for unused virtual function
+
PA_STATE_OPT() {}
- PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
- bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
+ PA_STATE_OPT(DRAW_CONTEXT* pDC,
+ uint32_t numPrims,
+ uint8_t* pStream,
+ uint32_t streamSizeInVerts,
+ uint32_t vertexStride,
+ bool in_isStreaming,
+ uint32_t numVertsPerPrim,
+ PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
+
+    bool HasWork() { return this->numPrimsComplete < this->numPrims; }
- bool HasWork()
+ simdvector& GetSimdVector(uint32_t index, uint32_t slot)
{
- return (this->numPrimsComplete < this->numPrims) ? true : false;
+ SWR_ASSERT(slot < vertexStride);
+ uint32_t offset = index * vertexStride + slot;
+ simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset];
+ return vertexSlot;
}
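+
+    // The stream is an array of simd vertex batches, vertexStride simdvectors per
+    // batch (one per attribute slot), hence the index * vertexStride + slot
+    // addressing above.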
- simdvector& GetSimdVector(uint32_t index, uint32_t slot)
+#if ENABLE_AVX512_SIMD16
+ simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
{
- simdvertex* pVertex = (simdvertex*)pStreamBase;
- return pVertex[index].attrib[slot];
+ SWR_ASSERT(slot < vertexStride);
+ uint32_t offset = index * vertexStride + slot;
+ simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset];
+ return vertexSlot;
}
+#endif
// Assembles 4 triangles. Each simdvector is a single vertex from 4
// triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
- bool Assemble(uint32_t slot, simdvector verts[])
+ bool Assemble(uint32_t slot, simdvector verts[]) { return this->pfnPaFunc(*this, slot, verts); }
+
+#if ENABLE_AVX512_SIMD16
+ bool Assemble(uint32_t slot, simd16vector verts[])
{
- return this->pfnPaFunc(*this, slot, verts);
+ return this->pfnPaFunc_simd16(*this, slot, verts);
}
+#endif
// Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
- void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
+ void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
}
bool NextPrim()
{
this->pfnPaFunc = this->pfnPaNextFunc;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16;
+#endif
this->numSimdPrims = this->nextNumSimdPrims;
this->numPrimsComplete += this->nextNumPrimsIncrement;
this->reset = this->nextReset;
else
{
this->counter = (this->reset) ? 0 : (this->counter + 1);
- this->reset = false;
+ this->reset = false;
}
- this->pfnPaFunc = this->pfnPaNextFunc;
-
if (!HasWork())
{
- morePrims = false; // no more to do
+ morePrims = false; // no more to do
}
return morePrims;
}
- simdvertex& GetNextVsOutput()
+ SIMDVERTEX& GetNextVsOutput()
{
+ const uint32_t numSimdVerts = streamSizeInVerts / SIMD_WIDTH;
+
// increment cur and prev indices
- const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD_WIDTH;
- this->prev = this->cur; // prev is undefined for first state.
- this->cur = this->counter % numSimdVerts;
+ if (counter < numSimdVerts)
+ {
+ // prev undefined for first state
+ prev = cur;
+ cur = counter;
+ }
+ else
+ {
+ // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in
+ // the buffer
+ uint32_t temp = prev;
+
+ prev = cur;
+ cur = temp;
+ }
+
+ SWR_ASSERT(cur < numSimdVerts);
+ SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[cur * vertexStride];
- simdvertex* pVertex = (simdvertex*)pStreamBase;
- return pVertex[this->cur];
+ return *(SIMDVERTEX*)pVertex;
}
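+
+    // Example with numSimdVerts == 4: counters 0..3 fill batches 0..3 in order;
+    // after that, prev and cur ping-pong between batches 2 and 3, so batch 0 (and
+    // the tri fan / line loop leading vertex it holds) is never overwritten.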
-
- simdmask& GetNextVsIndices()
+
+ SIMDMASK& GetNextVsIndices()
{
        // unused in optimized PA, pass junk buffer back
- return tmpIndices;
+ return junkIndices;
}
bool GetNextStreamOutput()
{
this->prev = this->cur;
- this->cur = this->counter;
+ this->cur = this->counter;
return HasWork();
}
uint32_t NumPrims()
{
- return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
- (KNOB_SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : KNOB_SIMD_WIDTH;
+ return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims)
+ ? (SIMD_WIDTH -
+ (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims))
+ : SIMD_WIDTH;
+ }
+
+ void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
+ PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
+ uint32_t numSimdPrims = 0,
+ uint32_t numPrimsIncrement = 0,
+ bool reset = false)
+ {
+ this->pfnPaNextFunc = pfnPaNextFunc;
+ this->nextNumSimdPrims = numSimdPrims;
+ this->nextNumPrimsIncrement = numPrimsIncrement;
+ this->nextReset = reset;
+
+ this->pfnPaSingleFunc = pfnPaNextSingleFunc;
}
- void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
- PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
- uint32_t numSimdPrims = 0,
- uint32_t numPrimsIncrement = 0,
- bool reset = false)
+#if ENABLE_AVX512_SIMD16
+ void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
+ PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
+ PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
+ uint32_t numSimdPrims = 0,
+ uint32_t numPrimsIncrement = 0,
+ bool reset = false)
{
- this->pfnPaNextFunc = pfnPaNextFunc;
- this->nextNumSimdPrims = numSimdPrims;
+ this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16;
+ this->pfnPaNextFunc = pfnPaNextFunc;
+ this->nextNumSimdPrims = numSimdPrims;
this->nextNumPrimsIncrement = numPrimsIncrement;
- this->nextReset = reset;
+ this->nextReset = reset;
this->pfnPaSingleFunc = pfnPaNextSingleFunc;
}
+#endif
void Reset()
{
+#if ENABLE_AVX512_SIMD16
+ useAlternateOffset = false;
+
+#endif
this->pfnPaFunc = this->pfnPaFuncReset;
+#if ENABLE_AVX512_SIMD16
+ this->pfnPaFunc_simd16 = this->pfnPaFuncReset_simd16;
+#endif
this->numPrimsComplete = 0;
- this->numSimdPrims = 0;
- this->cur = 0;
- this->prev = 0;
- this->first = 0;
- this->counter = 0;
- this->reset = false;
+ this->numSimdPrims = 0;
+ this->cur = 0;
+ this->prev = 0;
+ this->counter = 0;
+ this->reset = false;
}
- simdscalari GetPrimID(uint32_t startID)
+ SIMDSCALARI GetPrimID(uint32_t startID)
{
- return _simd_add_epi32(this->primID,
- _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / KNOB_SIMD_WIDTH)));
+#if USE_SIMD16_FRONTEND
+ return _simd16_add_epi32(
+ this->primID,
+ _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
+#else
+ return _simd_add_epi32(
+ this->primID,
+ _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
+#endif
}
};
// helper C wrappers to avoid having to rewrite all the PA topology state functions
-INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
- PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
- uint32_t numSimdPrims = 0,
- uint32_t numPrimsIncrement = 0,
- bool reset = false)
-{
- return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
-}
-INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
-{
- return pa.GetSimdVector(index, slot);
-}
-
-INLINE __m128 swizzleLane0(const simdvector &a)
-{
- simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
- simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
- return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
-}
-
-INLINE __m128 swizzleLane1(const simdvector &a)
-{
- simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
- simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
- return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
-}
-
-INLINE __m128 swizzleLane2(const simdvector &a)
-{
- simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
- simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
- return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
-}
-
-INLINE __m128 swizzleLane3(const simdvector &a)
-{
- simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
- simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
- return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
-}
-
-INLINE __m128 swizzleLane4(const simdvector &a)
+INLINE void SetNextPaState(PA_STATE_OPT& pa,
+ PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
+ PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
+ uint32_t numSimdPrims = 0,
+ uint32_t numPrimsIncrement = 0,
+ bool reset = false)
{
- simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
- simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
- return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
-
+ return pa.SetNextState(
+ pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
}
-INLINE __m128 swizzleLane5(const simdvector &a)
+#if ENABLE_AVX512_SIMD16
+INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa,
+ PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
+ PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
+ PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
+ uint32_t numSimdPrims = 0,
+ uint32_t numPrimsIncrement = 0,
+ bool reset = false)
{
- simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
- simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
- return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
+ return pa.SetNextState_simd16(pfnPaNextFunc_simd16,
+ pfnPaNextFunc,
+ pfnPaNextSingleFunc,
+ numSimdPrims,
+ numPrimsIncrement,
+ reset);
}
-INLINE __m128 swizzleLane6(const simdvector &a)
-{
- simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
- simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
- return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
-}
-
-INLINE __m128 swizzleLane7(const simdvector &a)
+#endif
+INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
{
- simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
- simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
- return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
+ return pa.GetSimdVector(index, slot);
}
-INLINE __m128 swizzleLaneN(const simdvector &a, int lane)
+#if ENABLE_AVX512_SIMD16
+INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot)
{
- switch (lane) {
- case 0:
- return swizzleLane0(a);
- case 1:
- return swizzleLane1(a);
- case 2:
- return swizzleLane2(a);
- case 3:
- return swizzleLane3(a);
- case 4:
- return swizzleLane4(a);
- case 5:
- return swizzleLane5(a);
- case 6:
- return swizzleLane6(a);
- case 7:
- return swizzleLane7(a);
- default:
- return _mm_setzero_ps();
- }
+ return pa.GetSimdVector_simd16(index, slot);
}
+#endif
// Cut-aware primitive assembler.
struct PA_STATE_CUT : public PA_STATE
{
- simdmask* pCutIndices; // cut indices buffer, 1 bit per vertex
- uint32_t numVerts; // number of vertices available in buffer store
- uint32_t numAttribs; // number of attributes
- int32_t numRemainingVerts; // number of verts remaining to be assembled
- uint32_t numVertsToAssemble; // total number of verts to assemble for the draw
- OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH]; // current index buffer for gather
- simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd
- uint32_t numPrimsAssembled; // number of primitives that are fully assembled
- uint32_t headVertex; // current unused vertex slot in vertex buffer store
- uint32_t tailVertex; // beginning vertex currently assembling
- uint32_t curVertex; // current unprocessed vertex
- uint32_t startPrimId; // starting prim id
- simdscalari vPrimId; // vector of prim ID
- bool needOffsets; // need to compute gather offsets for current SIMD
- uint32_t vertsPerPrim;
- simdvertex tmpVertex; // temporary simdvertex for unimplemented API
- bool processCutVerts; // vertex indices with cuts should be processed as normal, otherwise they
- // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
- // while the GS sends valid verts for every index
+ SIMDMASK* pCutIndices{nullptr}; // cut indices buffer, 1 bit per vertex
+ uint32_t numVerts{0}; // number of vertices available in buffer store
+ uint32_t numAttribs{0}; // number of attributes
+ int32_t numRemainingVerts{0}; // number of verts remaining to be assembled
+ uint32_t numVertsToAssemble{0}; // total number of verts to assemble for the draw
+#if ENABLE_AVX512_SIMD16
+ OSALIGNSIMD16(uint32_t)
+ indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
+#else
+ OSALIGNSIMD(uint32_t)
+ indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
+#endif
+ SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd
+ uint32_t numPrimsAssembled{0}; // number of primitives that are fully assembled
+ uint32_t headVertex{0}; // current unused vertex slot in vertex buffer store
+ uint32_t tailVertex{0}; // beginning vertex currently assembling
+ uint32_t curVertex{0}; // current unprocessed vertex
+ uint32_t startPrimId{0}; // starting prim id
+ SIMDSCALARI vPrimId; // vector of prim ID
+ bool needOffsets{false}; // need to compute gather offsets for current SIMD
+ uint32_t vertsPerPrim{0};
+    bool processCutVerts{false}; // vertex indices with cuts should be processed as normal,
+                                 // otherwise they are ignored. The fetch shader sends invalid
+                                 // verts on cuts that should be ignored, while the GS sends
+                                 // valid verts for every index.
+
+ simdvector junkVector; // junk simdvector for unimplemented API
+#if ENABLE_AVX512_SIMD16
+ simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
+#endif
+
// Topology state tracking
uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
- uint32_t curIndex;
- bool reverseWinding; // indicates reverse winding for strips
- int32_t adjExtraVert; // extra vert uses for tristrip w/ adj
+ uint32_t curIndex{0};
+ bool reverseWinding{false}; // indicates reverse winding for strips
+    int32_t adjExtraVert{0}; // extra vert used for tristrip w/ adj
- typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
- PFN_PA_FUNC pfnPa; // per-topology function that processes a single vert
+ typedef void (PA_STATE_CUT::*PFN_PA_FUNC)(uint32_t vert, bool finish);
+ PFN_PA_FUNC pfnPa{nullptr}; // per-topology function that processes a single vert
PA_STATE_CUT() {}
- PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts,
- uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
- : PA_STATE(pDC, in_pStream, in_streamSizeInVerts)
+ PA_STATE_CUT(DRAW_CONTEXT* pDC,
+ uint8_t* in_pStream,
+ uint32_t in_streamSizeInVerts,
+ uint32_t in_vertexStride,
+ SIMDMASK* in_pIndices,
+ uint32_t in_numVerts,
+ uint32_t in_numAttribs,
+ PRIMITIVE_TOPOLOGY topo,
+ bool in_processCutVerts,
+ uint32_t in_numVertsPerPrim) :
+ PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride, in_numVertsPerPrim)
{
- numVerts = in_streamSizeInVerts;
- numAttribs = in_numAttribs;
- binTopology = topo;
- needOffsets = false;
+ numVerts = in_streamSizeInVerts;
+ numAttribs = in_numAttribs;
+ binTopology = topo;
+ needOffsets = false;
processCutVerts = in_processCutVerts;
numVertsToAssemble = numRemainingVerts = in_numVerts;
- numPrimsAssembled = 0;
+ numPrimsAssembled = 0;
headVertex = tailVertex = curVertex = 0;
- curIndex = 0;
+ curIndex = 0;
pCutIndices = in_pIndices;
memset(indices, 0, sizeof(indices));
- vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+#if USE_SIMD16_FRONTEND
+ vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+#else
+ vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+#endif
reverseWinding = false;
- adjExtraVert = -1;
+ adjExtraVert = -1;
bool gsEnabled = pDC->pState->state.gsState.gsEnable;
- vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);
+ vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);
switch (topo)
{
- case TOP_TRIANGLE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
- case TOP_TRI_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
- case TOP_TRIANGLE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
- case TOP_TRI_STRIP_ADJ: if (gsEnabled)
- {
- pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ;
- }
- else
- {
- pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ;
- }
- break;
-
- case TOP_POINT_LIST: pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
- case TOP_LINE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
- case TOP_LINE_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
- case TOP_LINE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
- case TOP_LISTSTRIP_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
- default: assert(0 && "Unimplemented topology");
+ case TOP_TRIANGLE_LIST:
+ pfnPa = &PA_STATE_CUT::ProcessVertTriList;
+ break;
+ case TOP_TRI_LIST_ADJ:
+ pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj
+ : &PA_STATE_CUT::ProcessVertTriListAdjNoGs;
+ break;
+ case TOP_TRIANGLE_STRIP:
+ pfnPa = &PA_STATE_CUT::ProcessVertTriStrip;
+ break;
+ case TOP_TRI_STRIP_ADJ:
+ if (gsEnabled)
+ {
+ pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<true>;
+ }
+ else
+ {
+ pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<false>;
+ }
+ break;
+
+ case TOP_POINT_LIST:
+ pfnPa = &PA_STATE_CUT::ProcessVertPointList;
+ break;
+ case TOP_LINE_LIST:
+ pfnPa = &PA_STATE_CUT::ProcessVertLineList;
+ break;
+ case TOP_LINE_LIST_ADJ:
+ pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj
+ : &PA_STATE_CUT::ProcessVertLineListAdjNoGs;
+ break;
+ case TOP_LINE_STRIP:
+ pfnPa = &PA_STATE_CUT::ProcessVertLineStrip;
+ break;
+ case TOP_LISTSTRIP_ADJ:
+ pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj
+ : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs;
+ break;
+ case TOP_RECT_LIST:
+ pfnPa = &PA_STATE_CUT::ProcessVertRectList;
+ break;
+ default:
+ assert(0 && "Unimplemented topology");
}
}
- simdvertex& GetNextVsOutput()
+ SIMDVERTEX& GetNextVsOutput()
{
- uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH;
- this->headVertex = (this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts;
- this->needOffsets = true;
- return ((simdvertex*)pStreamBase)[vertexIndex];
+ uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
+ this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts;
+ this->needOffsets = true;
+ SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride];
+
+ return *(SIMDVERTEX*)pVertex;
}
- simdmask& GetNextVsIndices()
+ SIMDMASK& GetNextVsIndices()
{
- uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH;
- simdmask* pCurCutIndex = this->pCutIndices + vertexIndex;
+ uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
+ SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex;
return *pCurCutIndex;
}
{
// unused
SWR_ASSERT(0 && "Not implemented");
- return this->tmpVertex.attrib[0];
+ return junkVector;
+ }
+
+#if ENABLE_AVX512_SIMD16
+ simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
+ {
+ // unused
+ SWR_ASSERT(0 && "Not implemented");
+ return junkVector_simd16;
}
+#endif
bool GetNextStreamOutput()
{
- this->headVertex += KNOB_SIMD_WIDTH;
+ this->headVertex += SIMD_WIDTH;
this->needOffsets = true;
return HasWork();
}
- simdscalari GetPrimID(uint32_t startID)
+ SIMDSCALARI GetPrimID(uint32_t startID)
{
+#if USE_SIMD16_FRONTEND
+ return _simd16_add_epi32(_simd16_set1_epi32(startID), this->vPrimId);
+#else
return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
+#endif
}
void Reset()
{
+#if ENABLE_AVX512_SIMD16
+ useAlternateOffset = false;
+
+#endif
this->numRemainingVerts = this->numVertsToAssemble;
this->numPrimsAssembled = 0;
- this->curIndex = 0;
- this->curVertex = 0;
- this->tailVertex = 0;
- this->headVertex = 0;
- this->reverseWinding = false;
- this->adjExtraVert = -1;
- this->vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+ this->curIndex = 0;
+ this->curVertex = 0;
+ this->tailVertex = 0;
+ this->headVertex = 0;
+ this->reverseWinding = false;
+ this->adjExtraVert = -1;
+#if USE_SIMD16_FRONTEND
+ this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+#else
+ this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+#endif
}
- bool HasWork()
- {
- return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
- }
+ bool HasWork() { return this->numRemainingVerts > 0 || this->adjExtraVert != -1; }
bool IsVertexStoreFull()
{
- return ((this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts) == this->tailVertex;
+ return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex;
}
void RestartTopology()
{
- this->curIndex = 0;
+ this->curIndex = 0;
this->reverseWinding = false;
- this->adjExtraVert = -1;
+ this->adjExtraVert = -1;
}
bool IsCutIndex(uint32_t vertex)
{
- uint32_t vertexIndex = vertex / KNOB_SIMD_WIDTH;
- uint32_t vertexOffset = vertex & (KNOB_SIMD_WIDTH - 1);
- return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1;
+ uint32_t vertexIndex = vertex / SIMD_WIDTH;
+ uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1);
+ return CheckBit(this->pCutIndices[vertexIndex], vertexOffset);
}
- // iterates across the unprocessed verts until we hit the end or we
+ // iterates across the unprocessed verts until we hit the end or we
// have assembled SIMD prims
void ProcessVerts()
{
- while (this->numPrimsAssembled != KNOB_SIMD_WIDTH &&
- this->numRemainingVerts > 0 &&
- this->curVertex != this->headVertex)
+ while (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0 &&
+ this->curVertex != this->headVertex)
{
- // if cut index, restart topology
+ // if cut index, restart topology
if (IsCutIndex(this->curVertex))
{
if (this->processCutVerts)
(this->*pfnPa)(this->curVertex, false);
}
- this->curVertex = (this->curVertex + 1) % this->numVerts;
+ this->curVertex++;
+ if (this->curVertex >= this->numVerts)
+ {
+ this->curVertex = 0;
+ }
this->numRemainingVerts--;
}
// special case last primitive for tri strip w/ adj
- if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
+ if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 &&
+ this->adjExtraVert != -1)
{
(this->*pfnPa)(this->curVertex, true);
}
{
// done with current batch
// advance tail to the current unsubmitted vertex
- this->tailVertex = this->curVertex;
+ this->tailVertex = this->curVertex;
this->numPrimsAssembled = 0;
- this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(KNOB_SIMD_WIDTH));
+#if USE_SIMD16_FRONTEND
+ this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
+#else
+ this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH));
+#endif
}
bool NextPrim()
{
// if we've assembled enough prims, we can advance to the next set of verts
- if (this->numPrimsAssembled == KNOB_SIMD_WIDTH || this->numRemainingVerts <= 0)
+ if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0)
{
Advance();
}
{
for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
{
- simdscalari vIndices = *(simdscalari*)&this->indices[v][0];
+ uint32_t vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR);
+ SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0];
// step to simdvertex batch
- const uint32_t simdShift = 3; // @todo make knob
- simdscalari vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
- this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(sizeof(simdvertex)));
+ const uint32_t simdShift = SIMD_WIDTH_LOG2;
+#if USE_SIMD16_FRONTEND
+ SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
+ this->vOffsets[v] =
+ _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes));
+#else
+ SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
+ this->vOffsets[v] =
+ _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes));
+#endif
// step to index
- const uint32_t simdMask = 0x7; // @todo make knob
- simdscalari vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
- this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
+ const uint32_t simdMask = SIMD_WIDTH - 1;
+#if USE_SIMD16_FRONTEND
+ SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
+ this->vOffsets[v] = _simd16_add_epi32(
+ this->vOffsets[v],
+ _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
+#else
+ SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
+ this->vOffsets[v] =
+ _simd_add_epi32(this->vOffsets[v],
+ _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
+#endif
}
}
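+
+    // For reference, the scalar equivalent of the offset math above for one vertex
+    // index i (an illustrative restatement, not used by the code):
+    //
+    //     offset = (i >> SIMD_WIDTH_LOG2) * vertexStride * sizeof(SIMDVECTOR) // batch
+    //            + (i & (SIMD_WIDTH - 1)) * sizeof(float);                    // lane
+    //
+    // Assemble() later adds slot * sizeof(SIMDVECTOR) to step to the attribute.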
- bool Assemble(uint32_t slot, simdvector result[])
+ bool Assemble(uint32_t slot, simdvector* verts)
{
// process any outstanding verts
ProcessVerts();
// return false if we don't have enough prims assembled
- if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts > 0)
+ if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
{
return false;
}
- // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
+ // cache off gather offsets given the current SIMD set of indices the first time we get an
+ // assemble
if (this->needOffsets)
{
ComputeOffsets();
for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
{
- simdscalari offsets = this->vOffsets[v];
+ SIMDSCALARI offsets = this->vOffsets[v];
// step to attribute
+#if USE_SIMD16_FRONTEND
+ offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
+#else
+ offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
+#endif
+
+ float* pBase = (float*)this->pStreamBase;
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+#if USE_SIMD16_FRONTEND
+ simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);
+
+ // Assigning to a temporary first to avoid an MSVC 2017 compiler bug
+ simdscalar t =
+ useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
+ verts[v].v[c] = t;
+#else
+ verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
+#endif
+
+ // move base to next component
+ pBase += SIMD_WIDTH;
+ }
+ }
+
+ // compute the implied 4th vertex, v3
+ if (this->binTopology == TOP_RECT_LIST)
+ {
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ // v1, v3 = v1 + v2 - v0, v2
+ // v1 stored in verts[0], v0 stored in verts[1], v2 stored in verts[2]
+                // verts[] here are 8-wide simdvectors, so use the native-width ops
+                simdscalar temp = _simd_add_ps(verts[0].v[c], verts[2].v[c]);
+                temp = _simd_sub_ps(temp, verts[1].v[c]);
+                verts[1].v[c] = _simd_blend_ps(verts[1].v[c], temp, 0xAA); // 1010 1010
+ }
+ }
+
+ return true;
+ }
+
+#if ENABLE_AVX512_SIMD16
+ bool Assemble(uint32_t slot, simd16vector verts[])
+ {
+ // process any outstanding verts
+ ProcessVerts();
+
+ // return false if we don't have enough prims assembled
+ if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
+ {
+ return false;
+ }
+
+ // cache off gather offsets given the current SIMD set of indices the first time we get an
+ // assemble
+ if (this->needOffsets)
+ {
+ ComputeOffsets();
+ this->needOffsets = false;
+ }
+
+ for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
+ {
+ SIMDSCALARI offsets = this->vOffsets[v];
+
+ // step to attribute
+#if USE_SIMD16_FRONTEND
+ offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
+#else
offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
+#endif
float* pBase = (float*)this->pStreamBase;
for (uint32_t c = 0; c < 4; ++c)
{
- result[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
+#if USE_SIMD16_FRONTEND
+ verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
+#else
+ verts[v].v[c] = _simd16_insert_ps(
+ _simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0);
+#endif
// move base to next component
- pBase += KNOB_SIMD_WIDTH;
+ pBase += SIMD_WIDTH;
+ }
+ }
+
+ // compute the implied 4th vertex, v3
+ if (this->binTopology == TOP_RECT_LIST)
+ {
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ // v1, v3 = v1 + v2 - v0, v2
+ // v1 stored in verts[0], v0 stored in verts[1], v2 stored in verts[2]
+ simd16scalar temp = _simd16_add_ps(verts[0].v[c], verts[2].v[c]);
+ temp = _simd16_sub_ps(temp, verts[1].v[c]);
+ verts[1].v[c] =
+ _simd16_blend_ps(verts[1].v[c], temp, 0xAAAA); // 1010 1010 1010 1010
}
}
return true;
}
- void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3])
+#endif
+ void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3])
{
- // move to slot
+ // move to slot
for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
{
uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
+#if USE_SIMD16_FRONTEND
+ uint32_t offset =
+ useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
+#else
uint32_t offset = pOffset[triIndex];
- offset += sizeof(simdvector) * slot;
+#endif
+ offset += sizeof(SIMDVECTOR) * slot;
float* pVert = (float*)&tri[v];
for (uint32_t c = 0; c < 4; ++c)
{
float* pComponent = (float*)(this->pStreamBase + offset);
- pVert[c] = *pComponent;
- offset += KNOB_SIMD_WIDTH * sizeof(float);
+ pVert[c] = *pComponent;
+ offset += SIMD_WIDTH * sizeof(float);
}
}
- }
- uint32_t NumPrims()
- {
- return this->numPrimsAssembled;
+ // compute the implied 4th vertex, v3
+ if ((this->binTopology == TOP_RECT_LIST) && (triIndex % 2 == 1))
+ {
+ // v1, v3 = v1 + v2 - v0, v2
+ // v1 stored in tri[0], v0 stored in tri[1], v2 stored in tri[2]
+ float* pVert0 = (float*)&tri[1];
+ float* pVert1 = (float*)&tri[0];
+ float* pVert2 = (float*)&tri[2];
+ float* pVert3 = (float*)&tri[1];
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ pVert3[c] = pVert1[c] + pVert2[c] - pVert0[c];
+ }
+ }
}
+ uint32_t NumPrims() { return this->numPrimsAssembled; }
+
// Per-topology functions
void ProcessVertTriStrip(uint32_t index, bool finish)
{
this->numPrimsAssembled++;
// set up next prim state
- this->vert[0] = this->vert[1];
- this->vert[1] = this->vert[2];
+ this->vert[0] = this->vert[1];
+ this->vert[1] = this->vert[2];
this->curIndex = 2;
this->reverseWinding ^= 1;
}
}
- template<bool gsEnabled>
+ template <bool gsEnabled>
void AssembleTriStripAdj()
{
if (!gsEnabled)
this->numPrimsAssembled++;
}
-
- template<bool gsEnabled>
+ template <bool gsEnabled>
void ProcessVertTriStripAdj(uint32_t index, bool finish)
{
// handle last primitive of tristrip
case 6:
            SWR_ASSERT(this->adjExtraVert != -1, "Algorithm failure!");
AssembleTriStripAdj<gsEnabled>();
-
+
uint32_t nextTri[6];
if (this->reverseWinding)
{
nextTri[1] = this->adjExtraVert;
nextTri[2] = this->vert[3];
nextTri[4] = this->vert[4];
- nextTri[5] = this->vert[0];
+ nextTri[5] = this->vert[0];
}
for (uint32_t i = 0; i < 6; ++i)
{
}
}
-
void ProcessVertLineList(uint32_t index, bool finish)
{
this->vert[this->curIndex] = index;
this->numPrimsAssembled++;
// set up next prim state
- this->vert[0] = this->vert[1];
+ this->vert[0] = this->vert[1];
this->curIndex = 1;
}
}
this->numPrimsAssembled++;
// set up next prim state
- this->vert[0] = this->vert[1];
- this->vert[1] = this->vert[2];
- this->vert[2] = this->vert[3];
+ this->vert[0] = this->vert[1];
+ this->vert[1] = this->vert[2];
+ this->vert[2] = this->vert[3];
this->curIndex = 3;
}
}
this->numPrimsAssembled++;
// set up next prim state
- this->vert[0] = this->vert[1];
- this->vert[1] = this->vert[2];
- this->vert[2] = this->vert[3];
+ this->vert[0] = this->vert[1];
+ this->vert[1] = this->vert[2];
+ this->vert[2] = this->vert[3];
this->curIndex = 3;
}
}
this->curIndex = 0;
}
}
+
+ void ProcessVertRectList(uint32_t index, bool finish)
+ {
+ this->vert[this->curIndex] = index;
+ this->curIndex++;
+ if (this->curIndex == 3)
+ {
+ // assembled enough verts for prim, add to gather indices
+ this->indices[0][this->numPrimsAssembled] = this->vert[0];
+ this->indices[1][this->numPrimsAssembled] = this->vert[1];
+ this->indices[2][this->numPrimsAssembled] = this->vert[2];
+
+ // second triangle in the rectangle
+ // v1, v3 = v1 + v2 - v0, v2
+ this->indices[0][this->numPrimsAssembled + 1] = this->vert[1];
+ this->indices[1][this->numPrimsAssembled + 1] = this->vert[0];
+ this->indices[2][this->numPrimsAssembled + 1] = this->vert[2];
+
+ // increment numPrimsAssembled
+ this->numPrimsAssembled += 2;
+
+ // set up next prim state
+ this->curIndex = 0;
+ }
+ }
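+
+    // Worked example of the implied-vertex math: a rect arrives as v0, v1, v2 and is
+    // emitted as triangles (v0 v1 v2) and (v1 v3 v2), where v3 = v1 + v2 - v0 is the
+    // fourth corner of the parallelogram, computed at assemble time. For instance,
+    // v0 = (0,0), v1 = (1,0), v2 = (0,1) gives v3 = (1,1).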
};
// Primitive Assembly for data output from the DomainShader.
struct PA_TESS : PA_STATE
{
- PA_TESS(
- DRAW_CONTEXT *in_pDC,
- const simdscalar* in_pVertData,
- uint32_t in_attributeStrideInVectors,
- uint32_t in_numAttributes,
- uint32_t* (&in_ppIndices)[3],
- uint32_t in_numPrims,
- PRIMITIVE_TOPOLOGY in_binTopology) :
-
- PA_STATE(in_pDC, nullptr, 0),
- m_pVertexData(in_pVertData),
- m_attributeStrideInVectors(in_attributeStrideInVectors),
- m_numAttributes(in_numAttributes),
- m_numPrims(in_numPrims)
+ PA_TESS(DRAW_CONTEXT* in_pDC,
+ const SIMDSCALAR* in_pVertData,
+ uint32_t in_attributeStrideInVectors,
+ uint32_t in_vertexStride,
+ uint32_t in_numAttributes,
+ uint32_t* (&in_ppIndices)[3],
+ uint32_t in_numPrims,
+ PRIMITIVE_TOPOLOGY in_binTopology,
+ uint32_t numVertsPerPrim,
+ bool SOA = true) :
+
+ PA_STATE(in_pDC, nullptr, 0, in_vertexStride, numVertsPerPrim),
+ m_pVertexData(in_pVertData), m_attributeStrideInVectors(in_attributeStrideInVectors),
+ m_numAttributes(in_numAttributes), m_numPrims(in_numPrims), m_SOA(SOA)
{
+#if USE_SIMD16_FRONTEND
+ m_vPrimId = _simd16_setzero_si();
+#else
m_vPrimId = _simd_setzero_si();
- binTopology = in_binTopology;
+#endif
+ binTopology = in_binTopology;
m_ppIndices[0] = in_ppIndices[0];
m_ppIndices[1] = in_ppIndices[1];
m_ppIndices[2] = in_ppIndices[2];
break;
default:
- SWR_ASSERT(0, "Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
+ SWR_INVALID("Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
break;
}
}
- bool HasWork()
+ bool HasWork() { return m_numPrims != 0; }
+
+ simdvector& GetSimdVector(uint32_t index, uint32_t slot)
{
- return m_numPrims != 0;
+ SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
+ return junkVector;
}
- simdvector& GetSimdVector(uint32_t index, uint32_t slot)
+#if ENABLE_AVX512_SIMD16
+ simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
{
- SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__);
- static simdvector junk = { 0 };
- return junk;
+ SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
+ return junkVector_simd16;
+ }
+
+#endif
+ static SIMDSCALARI GenPrimMask(uint32_t numPrims)
+ {
+ SWR_ASSERT(numPrims <= SIMD_WIDTH);
+#if USE_SIMD16_FRONTEND
+ static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+ return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
+#else
+ static const OSALIGNLINE(int32_t)
+ maskGen[SIMD_WIDTH * 2] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};
+
+ return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
+#endif
}
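+
+    // Example: with SIMD_WIDTH == 8, GenPrimMask(3) loads starting at maskGen[5],
+    // yielding { -1, -1, -1, 0, 0, 0, 0, 0 } -- gather lanes 0..2 enabled.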
- static simdscalari GenPrimMask(uint32_t numPrims)
+ bool Assemble(uint32_t slot, simdvector verts[])
{
- SWR_ASSERT(numPrims <= KNOB_SIMD_WIDTH);
-#if KNOB_SIMD_WIDTH == 8
- static const OSALIGN(int32_t, 64) maskGen[KNOB_SIMD_WIDTH * 2] =
+ SWR_ASSERT(slot < m_numAttributes);
+
+ uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
+ if (0 == numPrimsToAssemble)
{
- -1, -1, -1, -1, -1, -1, -1, -1,
- 0, 0, 0, 0, 0, 0, 0, 0
- };
-#elif KNOB_SIMD_WIDTH == 16
- static const OSALIGN(int32_t, 128) maskGen[KNOB_SIMD_WIDTH * 2] =
+ return false;
+ }
+
+ SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);
+
+ const float* pBaseAttrib;
+ if (m_SOA)
{
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- };
+ pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
+ }
+ else
+ {
+ const float* pVertData = (const float*)m_pVertexData;
+ pBaseAttrib = pVertData + slot * 4;
+ }
+
+ for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
+ {
+#if USE_SIMD16_FRONTEND
+ SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
+#else
+ SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
+#endif
+
+ const float* pBase = pBaseAttrib;
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+#if USE_SIMD16_FRONTEND
+ simd16scalar temp =
+ _simd16_mask_i32gather_ps(_simd16_setzero_ps(),
+ pBase,
+ indices,
+ _simd16_castsi_ps(mask),
+ 4 /* gcc doesn't like sizeof(float) */);
+
+ verts[i].v[c] =
+ useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
#else
-#error "Help, help, I can't get up!"
+ verts[i].v[c] = _simd_mask_i32gather_ps(_simd_setzero_ps(),
+ pBase,
+ indices,
+ _simd_castsi_ps(mask),
+ 4); // gcc doesn't like sizeof(float)
#endif
+ if (m_SOA)
+ {
+ pBase += m_attributeStrideInVectors * SIMD_WIDTH;
+ }
+ else
+ {
+                pBase++; // one float per component in the AOS layout
+ }
+ }
+ }
- return _simd_loadu_si((const simdscalari*)&maskGen[KNOB_SIMD_WIDTH - numPrims]);
+ return true;
}
- bool Assemble(uint32_t slot, simdvector verts[])
+#if ENABLE_AVX512_SIMD16
+ bool Assemble(uint32_t slot, simd16vector verts[])
{
- static_assert(KNOB_SIMD_WIDTH == 8, "Need to revisit this when AVX512 is implemented");
SWR_ASSERT(slot < m_numAttributes);
uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
return false;
}
- simdscalari mask = GenPrimMask(numPrimsToAssemble);
+ SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);
+
+ const float* pBaseAttrib;
+ if (m_SOA)
+ {
+ pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
+ }
+ else
+ {
+ const float* pVertData = (const float*)m_pVertexData;
+ pBaseAttrib = pVertData + slot * 4;
+ }
- const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
{
- simdscalari indices = _simd_load_si((const simdscalari*)m_ppIndices[i]);
+#if USE_SIMD16_FRONTEND
+ SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
+ if (!m_SOA)
+ {
+                // per-lane 32-bit multiply (mullo); a widening mul_epi32 would only
+                // produce results for the even lanes
+                indices = _simd16_mullo_epi32(indices, _simd16_set1_epi32(vertexStride / 4));
+ }
+#else
+ SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
+#endif
const float* pBase = pBaseAttrib;
for (uint32_t c = 0; c < 4; ++c)
{
- verts[i].v[c] = _simd_mask_i32gather_ps(
- _simd_setzero_ps(),
- pBase,
- indices,
- _simd_castsi_ps(mask),
- 4 /* gcc doesn't like sizeof(float) */);
- pBase += m_attributeStrideInVectors * KNOB_SIMD_WIDTH;
+#if USE_SIMD16_FRONTEND
+ verts[i].v[c] = _simd16_mask_i32gather_ps(_simd16_setzero_ps(),
+ pBase,
+ indices,
+ _simd16_castsi_ps(mask),
+ 4 /* gcc doesn't like sizeof(float) */);
+#else
+ simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(),
+ pBase,
+ indices,
+ _simd_castsi_ps(mask),
+ 4 /* gcc doesn't like sizeof(float) */);
+ verts[i].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
+#endif
+ if (m_SOA)
+ {
+ pBase += m_attributeStrideInVectors * SIMD_WIDTH;
+ }
+ else
+ {
+ pBase++;
+ }
}
}
return true;
}
- void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
+#endif
+ void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
SWR_ASSERT(slot < m_numAttributes);
+
SWR_ASSERT(primIndex < PA_TESS::NumPrims());
- const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
+ const float* pVertDataBase;
+ if (m_SOA)
+ {
+ pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
+ }
+ else
+ {
+ const float* pVertData = (const float*)m_pVertexData;
+ pVertDataBase = pVertData + slot * 4;
+        }
for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
{
+#if USE_SIMD16_FRONTEND
+ uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2]
+ : m_ppIndices[i][primIndex];
+ if (!m_SOA)
+ {
+ index *= (vertexStride / 4);
+ }
+#else
uint32_t index = m_ppIndices[i][primIndex];
+#endif
const float* pVertData = pVertDataBase;
- float* pVert = (float*)&verts[i];
+ float* pVert = (float*)&verts[i];
for (uint32_t c = 0; c < 4; ++c)
{
pVert[c] = pVertData[index];
- pVertData += m_attributeStrideInVectors * KNOB_SIMD_WIDTH;
+ if (m_SOA)
+ {
+ pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
+ }
+ else
+ {
+ pVertData++;
+ }
}
+
}
}
return HasWork();
}
- simdvertex& GetNextVsOutput()
+ SIMDVERTEX& GetNextVsOutput()
{
- SWR_ASSERT(0, "%s", __FUNCTION__);
- static simdvertex junk;
- return junk;
+ SWR_NOT_IMPL;
+ return junkVertex;
}
bool GetNextStreamOutput()
{
- SWR_ASSERT(0, "%s", __FUNCTION__);
+ SWR_NOT_IMPL;
return false;
}
- simdmask& GetNextVsIndices()
+ SIMDMASK& GetNextVsIndices()
{
- SWR_ASSERT(0, "%s", __FUNCTION__);
- static simdmask junk;
- return junk;
+ SWR_NOT_IMPL;
+ return junkIndices;
}
- uint32_t NumPrims()
- {
- return std::min<uint32_t>(m_numPrims, KNOB_SIMD_WIDTH);
- }
+ uint32_t NumPrims() { return std::min<uint32_t>(m_numPrims, SIMD_WIDTH); }
- void Reset() { SWR_ASSERT(0); };
+ void Reset() { SWR_NOT_IMPL; }
- simdscalari GetPrimID(uint32_t startID)
+ SIMDSCALARI GetPrimID(uint32_t startID)
{
+#if USE_SIMD16_FRONTEND
+ return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
+#else
return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
+#endif
}
private:
- const simdscalar* m_pVertexData = nullptr;
- uint32_t m_attributeStrideInVectors = 0;
- uint32_t m_numAttributes = 0;
- uint32_t m_numPrims = 0;
- uint32_t* m_ppIndices[3];
+ const SIMDSCALAR* m_pVertexData = nullptr;
+ uint32_t m_attributeStrideInVectors = 0;
+ uint32_t m_numAttributes = 0;
+ uint32_t m_numPrims = 0;
+ uint32_t* m_ppIndices[3];
+
+ uint32_t m_numVertsPerPrim = 0;
- uint32_t m_numVertsPerPrim = 0;
+ SIMDSCALARI m_vPrimId;
- simdscalari m_vPrimId;
+ simdvector junkVector; // junk simdvector for unimplemented API
+#if ENABLE_AVX512_SIMD16
+ simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
+#endif
+ SIMDVERTEX junkVertex; // junk SIMDVERTEX for unimplemented API
+ SIMDMASK junkIndices; // temporary index store for unused virtual function
+
+ bool m_SOA;
};
-// Primitive Assembler factory class, responsible for creating and initializing the correct assembler
-// based on state.
-template <bool IsIndexedT>
+// Primitive Assembler factory class, responsible for creating and initializing the correct
+// assembler based on state.
+template <typename IsIndexedT, typename IsCutIndexEnabledT>
struct PA_FACTORY
{
- PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts) : topo(in_topo)
+ PA_FACTORY(DRAW_CONTEXT* pDC,
+ PRIMITIVE_TOPOLOGY in_topo,
+ uint32_t numVerts,
+ PA_STATE::SIMDVERTEX* pVertexStore,
+ uint32_t vertexStoreSize,
+ uint32_t vertexStride,
+ uint32_t numVertsPerPrim) :
+ topo(in_topo)
{
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
const API_STATE& state = GetApiState(pDC);
- if ((IsIndexedT && (
- topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
- topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
- topo == TOP_TRIANGLE_LIST || topo == TOP_LINE_LIST_ADJ ||
- topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ ||
- topo == TOP_TRI_STRIP_ADJ)) ||
-
- // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
- // for them in the optimized PA
- (!IsIndexedT && (
- topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ)))
+ if ((IsIndexedT::value && IsCutIndexEnabledT::value &&
+ (topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST || topo == TOP_LINE_LIST ||
+ topo == TOP_LINE_STRIP || topo == TOP_TRIANGLE_LIST)) ||
+
+ // non-indexed draws with adjacency topologies must use cut-aware PA until we add
+ // support for them in the optimized PA
+ (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ ||
+ topo == TOP_TRI_STRIP_ADJ))
{
memset(&indexStore, 0, sizeof(indexStore));
- DWORD numAttribs;
- _BitScanReverse(&numAttribs, state.feAttribMask);
- numAttribs++;
- new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH,
- &this->indexStore[0], numVerts, numAttribs, state.topology, false);
+ uint32_t numAttribs = state.feNumAttributes;
+
+ new (&this->paCut) PA_STATE_CUT(pDC,
+ reinterpret_cast<uint8_t*>(pVertexStore),
+ vertexStoreSize * PA_STATE::SIMD_WIDTH,
+ vertexStride,
+ &this->indexStore[0],
+ numVerts,
+ numAttribs,
+ state.topology,
+ false,
+ numVertsPerPrim);
cutPA = true;
}
else
#endif
{
uint32_t numPrims = GetNumPrims(in_topo, numVerts);
- new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, false);
+ new (&this->paOpt) PA_STATE_OPT(pDC,
+ numPrims,
+ reinterpret_cast<uint8_t*>(pVertexStore),
+ vertexStoreSize * PA_STATE::SIMD_WIDTH,
+ vertexStride,
+ false,
+ numVertsPerPrim);
cutPA = false;
}
-
}
PA_STATE& GetPA()
PA_STATE_OPT paOpt;
PA_STATE_CUT paCut;
- bool cutPA;
- PRIMITIVE_TOPOLOGY topo;
+ bool cutPA{false};
+
+ PRIMITIVE_TOPOLOGY topo{TOP_UNKNOWN};
- simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM];
- simdmask indexStore[MAX_NUM_VERTS_PER_PRIM];
+ PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM];
};
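+
+// Illustrative only: a frontend caller instantiates the factory with
+// integral-constant template arguments and pulls out the selected assembler,
+// roughly as below (the locals are stand-ins, not part of this header):
+//
+//     PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(
+//         pDC, state.topology, numVerts, pVertexStore, vertexStoreSize,
+//         vertexStride, numVertsPerPrim);
+//     PA_STATE& pa = paFactory.GetPA();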