From 36e276b6b03da852c78e314640b3822be263def2 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Sun, 3 Dec 2017 18:49:29 -0600
Subject: [PATCH] swr/rast: WIP - Widen fetch shader to SIMD16

Widen vertex gather/storage to SIMD16 for all component types.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
---
 .../swr/rasterizer/jitter/fetch_jit.cpp       | 716 +++++++++++++++++-
 1 file changed, 689 insertions(+), 27 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 337bb7f6604..6c0e658e68f 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -70,6 +70,9 @@ struct FetchJit : public Builder
 #else
     void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
 #endif
+#if USE_SIMD16_BUILDER
+    void Shuffle8bpcGatherd2(Shuffle8bpcArgs &args);
+#endif
 
     typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
         uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
@@ -78,6 +81,9 @@ struct FetchJit : public Builder
 #else
     void Shuffle16bpcGather(Shuffle16bpcArgs &args);
 #endif
+#if USE_SIMD16_BUILDER
+    void Shuffle16bpcGather2(Shuffle16bpcArgs &args);
+#endif
 
     void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
 #if USE_SIMD16_BUILDER
@@ -726,7 +732,7 @@ void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pB
     // only works if pixel size is <= 32bits
     SWR_ASSERT(info.bpp <= 32);
 
-	Value* pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
+    Value *pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
 
     for (uint32_t comp = 0; comp < 4; ++comp)
     {
@@ -825,6 +831,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
     Value* vVertexElements[4];
 #if USE_SIMD16_GATHERS
     Value* vVertexElements2[4];
+#if USE_SIMD16_BUILDER
+    Value *pVtxSrc2[4];
+#endif
 #endif
 
     Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
@@ -961,6 +970,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
 #if USE_SIMD16_GATHERS
         // override cur indices with 0 if pitch is 0
         Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
+        vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
         vCurIndices2 = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices2);
 
         // are vertices partially OOB?
@@ -983,7 +993,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
 
             // only fetch lanes that pass both tests
             vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
-            vGatherMask2 = AND(vMaxGatherMask, vMinGatherMask2);
+            vGatherMask2 = AND(vMaxGatherMask2, vMinGatherMask2);
         }
         else
         {
@@ -1074,15 +1084,32 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
             {
                 if (isComponentEnabled(compMask, c))
                 {
-                    vVertexElements[currentVertexElement] = pResults[c];
+#if USE_SIMD16_BUILDER
+                    // pack adjacent pairs of SIMD8s into SIMD16s
+                    pVtxSrc2[currentVertexElement] = VUNDEF2_F();
+                    pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pResults[c],  0);
+                    pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pResults2[c], 1);
+
+#else
+                    vVertexElements[currentVertexElement]  = pResults[c];
                     vVertexElements2[currentVertexElement] = pResults2[c];
-                    currentVertexElement++;
+
+#endif
+                    currentVertexElement += 1;
 
                     if (currentVertexElement > 3)
                     {
+#if USE_SIMD16_BUILDER
+                        // store SIMD16s
+                        Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+
+                        StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
+
+#else
                         StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
                         StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
 
+#endif
                         outputElt += 1;
 
                         // reset to the next vVertexElement to output
@@ -1113,9 +1140,12 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
         else if(info.type[0] == SWR_TYPE_FLOAT)
         {
             ///@todo: support 64 bit vb accesses
-            Value* gatherSrc = VIMMED1(0.0f);
+            Value *gatherSrc = VIMMED1(0.0f);
 #if USE_SIMD16_GATHERS
-            Value* gatherSrc2 = VIMMED1(0.0f);
+            Value *gatherSrc2 = VIMMED1(0.0f);
+#if USE_SIMD16_BUILDER
+            Value *gatherSrc16 = VIMMED2_1(0.0f);
+#endif
 #endif
 
             SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format), 
@@ -1127,8 +1157,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                 case 16:
                 {
 #if USE_SIMD16_GATHERS
-                    Value* vGatherResult[2];
-                    Value* vGatherResult2[2];
+                    Value *vGatherResult[2];
+                    Value *vGatherResult2[2];
 
                     // if we have at least one component out of x or y to fetch
                     if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
@@ -1140,6 +1170,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                         //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
                         //
                     }
+                    else
+                    {
+                        vGatherResult[0]  = VUNDEF_I();
+                        vGatherResult2[0] = VUNDEF_I();
+                    }
 
                     // if we have at least one component out of z or w to fetch
                     if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
@@ -1154,11 +1189,35 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                         //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw 
                         //
                     }
-
+                    else
+                    {
+                        vGatherResult[1]  = VUNDEF_I();
+                        vGatherResult2[1] = VUNDEF_I();
+                    }
 
                     // if we have at least one component to shuffle into place
                     if (compMask)
                     {
+#if USE_SIMD16_BUILDER
+                        Value *gatherResult[2];
+
+                        gatherResult[0] = VUNDEF2_I();
+                        gatherResult[1] = VUNDEF2_I();
+
+                        gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult[0],  0);
+                        gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult2[0], 1);
+
+                        gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult[1],  0);
+                        gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult2[1], 1);
+
+                        Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+
+                        Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, CONVERT_NONE,
+                            currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
+
+                        // Shuffle gathered components into place in simdvertex struct
+                        Shuffle16bpcGather2(args);  // outputs to pVtxSrc2 ref
+#else
                         Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
                             currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
                         Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), Instruction::CastOps::FPExt, CONVERT_NONE,
@@ -1167,6 +1226,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                         // Shuffle gathered components into place in simdvertex struct
                         Shuffle16bpcGather(args, false);  // outputs to vVertexElements ref
                         Shuffle16bpcGather(args2, true);  // outputs to vVertexElements ref
+#endif
                     }
 #else
                     Value* vGatherResult[2];
@@ -1209,12 +1269,6 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                     break;
                 case 32:
                 {
-#if USE_SIMD16_GATHERS
-#if USE_SIMD16_BUILDER
-                    Value *pVtxSrc2[4];
-
-#endif
-#endif
                     for (uint32_t i = 0; i < 4; i += 1)
                     {
 #if USE_SIMD16_GATHERS
@@ -1231,10 +1285,6 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                                 Value *vShiftedOffsets  = VPSRLI(vOffsets,  C(1));
                                 Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1));
 #if USE_SIMD16_BUILDER
-                                Value *src = VUNDEF2_F();
-                                src = INSERT2_F(src, gatherSrc,  0);
-                                src = INSERT2_F(src, gatherSrc2, 1);
-
                                 Value *indices = VUNDEF2_I();
                                 indices = INSERT2_I(indices, vShiftedOffsets,  0);
                                 indices = INSERT2_I(indices, vShiftedOffsets2, 1);
@@ -1243,12 +1293,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                                 mask = INSERT2_I(mask, vGatherMask,  0);
                                 mask = INSERT2_I(mask, vGatherMask2, 1);
 
-                                pVtxSrc2[currentVertexElement] = GATHERPS2(src, pStreamBase, indices, mask, 2);
-#if 1
-
-                                vVertexElements[currentVertexElement]  = EXTRACT2_F(pVtxSrc2[currentVertexElement], 0);
-                                vVertexElements2[currentVertexElement] = EXTRACT2_F(pVtxSrc2[currentVertexElement], 1);
-#endif
+                                pVtxSrc2[currentVertexElement] = GATHERPS2(gatherSrc16, pStreamBase, indices, mask, 2);
 #else
                                 vVertexElements[currentVertexElement]  = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
                                 vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vGatherMask2, 2);
@@ -1384,24 +1429,45 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                                 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
                                 Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
 
-                                vVertexElements[currentVertexElement] = pGather;
+#if USE_SIMD16_BUILDER
+                                // pack adjacent pairs of SIMD8s into SIMD16s
+                                pVtxSrc2[currentVertexElement] = VUNDEF2_F();
+                                pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather,  0);
+                                pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather2, 1);
+
+#else
+                                vVertexElements[currentVertexElement]  = pGather;
                                 vVertexElements2[currentVertexElement] = pGather2;
 
+#endif
                                 currentVertexElement += 1;
                             }
                             else
                             {
+#if USE_SIMD16_BUILDER
+                                pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
+
+#else
                                 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
                                 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
 
+#endif
                                 currentVertexElement += 1;
                             }
 
                             if (currentVertexElement > 3)
                             {
+#if USE_SIMD16_BUILDER
+                                // store SIMD16s
+                                Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+
+                                StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
+
+#else
                                 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
                                 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
 
+#endif
                                 outputElt += 1;
 
                                 // reset to the next vVertexElement to output
@@ -1522,10 +1588,25 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
 #if USE_SIMD16_GATHERS
                         Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
                         Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
+
                         // e.g. result of an 8x32bit integer gather for 8bit components
                         // 256i - 0    1    2    3    4    5    6    7
                         //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw 
 
+#if USE_SIMD16_BUILDER
+                        Value *gatherResult = VUNDEF2_I();
+
+                        gatherResult = INSERT2_I(gatherResult, vGatherResult,  0);
+                        gatherResult = INSERT2_I(gatherResult, vGatherResult2, 1);
+
+                        Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+
+                        Shuffle8bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
+                            currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2, info.swizzle);
+
+                        // Shuffle gathered components into place in simdvertex struct
+                        Shuffle8bpcGatherd2(args);  // outputs to pVtxSrc2 ref
+#else
                         Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
                             currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
                         Shuffle8bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
@@ -1534,6 +1615,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                         // Shuffle gathered components into place in simdvertex struct
                         Shuffle8bpcGatherd(args, false); // outputs to vVertexElements ref
                         Shuffle8bpcGatherd(args2, true); // outputs to vVertexElements ref
+#endif
 #else
                         Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
                         // e.g. result of an 8x32bit integer gather for 8bit components
@@ -1569,6 +1651,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                         //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
                         //
                     }
+                    else
+                    {
+                        vGatherResult[0]  = VUNDEF_I();
+                        vGatherResult2[0] = VUNDEF_I();
+                    }
 
                     // if we have at least one component out of z or w to fetch
                     if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
@@ -1583,10 +1670,35 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                         //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw 
                         //
                     }
+                    else
+                    {
+                        vGatherResult[1]  = VUNDEF_I();
+                        vGatherResult2[1] = VUNDEF_I();
+                    }
 
                     // if we have at least one component to shuffle into place
                     if (compMask)
                     {
+#if USE_SIMD16_BUILDER
+                        Value *gatherResult[2];
+
+                        gatherResult[0] = VUNDEF2_I();
+                        gatherResult[1] = VUNDEF2_I();
+
+                        gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult[0],  0);
+                        gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult2[0], 1);
+
+                        gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult[1],  0);
+                        gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult2[1], 1);
+
+                        Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+
+                        Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
+                            currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
+
+                        // Shuffle gathered components into place in simdvertex struct
+                        Shuffle16bpcGather2(args);  // outputs to pVtxSrc2 ref
+#else
                         Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
                             currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
                         Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
@@ -1595,6 +1707,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                         // Shuffle gathered components into place in simdvertex struct
                         Shuffle16bpcGather(args, false);  // outputs to vVertexElements ref
                         Shuffle16bpcGather(args2, true);  // outputs to vVertexElements ref
+#endif
                     }
 #else
                     Value* vGatherResult[2];
@@ -1665,8 +1778,18 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                                     pGather2 = FMUL(SI_TO_FP(pGather2, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
                                 }
 
+#if USE_SIMD16_BUILDER
+                                // pack adjacent pairs of SIMD8s into SIMD16s
+                                pVtxSrc2[currentVertexElement] = VUNDEF2_F();
+                                pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather,  0);
+                                pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather2, 1);
+
+#else
                                 vVertexElements[currentVertexElement] = pGather;
                                 vVertexElements2[currentVertexElement] = pGather2;
+
+#endif
+
                                 // e.g. result of a single 8x32bit integer gather for 32bit components
                                 // 256i - 0    1    2    3    4    5    6    7
                                 //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx 
@@ -1698,9 +1821,14 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                             {
 #if USE_SIMD16_SHADERS
 #if USE_SIMD16_GATHERS
+#if USE_SIMD16_BUILDER
+                                pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
+
+#else
                                 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
                                 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
 
+#endif
                                 currentVertexElement += 1;
 #else
                                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
@@ -1713,9 +1841,17 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                             if (currentVertexElement > 3)
                             {
 #if USE_SIMD16_GATHERS
+#if USE_SIMD16_BUILDER
+                                // store SIMD16s
+                                Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+
+                                StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
+
+#else
                                 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
                                 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
 
+#endif
                                 outputElt += 1;
 #else
                                 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
@@ -1740,9 +1876,17 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
     if (currentVertexElement > 0)
     {
 #if USE_SIMD16_GATHERS
+#if USE_SIMD16_BUILDER
+        // store SIMD16s
+        Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+
+        StoreVertexElements2(pVtxOut2, outputElt, currentVertexElement, pVtxSrc2);
+
+#else
         StoreVertexElements(pVtxOut, outputElt, currentVertexElement, vVertexElements);
         StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, currentVertexElement, vVertexElements2);
 
+#endif
         outputElt += 1;
 #else
         StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
@@ -2092,6 +2236,251 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
     }
 }
 
+#if USE_SIMD16_BUILDER
+void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args)
+{
+    // SIMD16-builder counterpart of Shuffle8bpcGatherd (declared with the same Shuffle8bpcArgs tuple): unpack tuple args
+    Value*& vGatherResult = std::get<0>(args);                  // SIMD16 gather result; split into SIMD8 halves with EXTRACT2_I below
+    Value* pVtxOut = std::get<1>(args);                         // destination handed to StoreVertexElements2
+    const Instruction::CastOps extendType = std::get<2>(args);  // selects the sign-extend or zero-extend path
+    const ConversionType conversionType = std::get<3>(args);    // selects the int-to-float cast and scale factor, if any
+    uint32_t &currentVertexElement = std::get<4>(args);         // caller's slot index; advanced per emitted component, reset to 0 after a store
+    uint32_t &outputElt = std::get<5>(args);                    // caller's output element index; incremented after each store
+    const ComponentEnable compMask = std::get<6>(args);         // components for which isComponentEnabled() is false are skipped
+    const ComponentControl(&compCtrl)[4] = std::get<7>(args);   // StoreSrc uses gathered data; anything else goes through GenerateCompCtrlVector2
+    Value* (&vVertexElements)[4] = std::get<8>(args);           // caller's array of pending SIMD16 components, stored once 4 are filled
+    const uint32_t(&swizzle)[4] = std::get<9>(args);            // per-component source byte index used to build the pshufb masks
+
+    // cast types
+    Type *vGatherTy = mSimdInt32Ty;
+    Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
+
+    // have to do extra work for sign extending
+    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
+    {
+        Type *v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // (mVWidth * 2) x 8bit ints; 16x8bit in a 128bit lane, assuming mVWidth == 8
+        Type *v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
+
+        // shuffle mask, including any swizzling; the same 16 byte pattern is repeated for both 128bit lanes
+        const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
+        const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
+        Value *vConstMask = C<char>({ char(x), char(x + 4), char(x + 8), char(x + 12),
+            char(y), char(y + 4), char(y + 8), char(y + 12),
+            char(z), char(z + 4), char(z + 8), char(z + 12),
+            char(w), char(w + 4), char(w + 8), char(w + 12),
+            char(x), char(x + 4), char(x + 8), char(x + 12),
+            char(y), char(y + 4), char(y + 8), char(y + 12),
+            char(z), char(z + 4), char(z + 8), char(z + 12),
+            char(w), char(w + 4), char(w + 8), char(w + 12) });
+
+        // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
+
+        Value *vGatherResult_lo = EXTRACT2_I(vGatherResult, 0);
+        Value *vGatherResult_hi = EXTRACT2_I(vGatherResult, 1);
+
+        Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
+        Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
+
+        // after pshufb: group components together in each 128bit lane
+        // 256i - 0    1    2    3    4    5    6    7
+        //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
+
+        Value *vi128XY_lo = nullptr;
+        Value *vi128XY_hi = nullptr;
+        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
+        {
+            vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
+            vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
+
+            // after PERMD: move and pack x and y components into the low 64 bits of each 128bit lane
+            // 256i - 0    1    2    3    4    5    6    7
+            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
+        }
+
+        // do the same for zw components (these stay nullptr when neither z nor w is enabled)
+        Value *vi128ZW_lo = nullptr;
+        Value *vi128ZW_hi = nullptr;
+        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
+        {
+            vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
+            vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
+        }
+
+        // init denormalize variables if needed; both are only read below when conversionType != CONVERT_NONE
+        Instruction::CastOps fpCast;
+        Value *conversionFactor;
+
+        switch (conversionType)
+        {
+        case CONVERT_NORMALIZED:
+            fpCast = Instruction::CastOps::SIToFP;
+            conversionFactor = VIMMED1((float)(1.0 / 127.0));
+            break;
+        case CONVERT_SSCALED:
+            fpCast = Instruction::CastOps::SIToFP;
+            conversionFactor = VIMMED1((float)(1.0));
+            break;
+        case CONVERT_USCALED:
+            SWR_INVALID("Type should not be sign extended!");
+            conversionFactor = nullptr; // NOTE(review): fpCast stays unset here, yet the denormalize step below still runs for this conversionType
+            break;
+        default:
+            SWR_ASSERT(conversionType == CONVERT_NONE);
+            conversionFactor = nullptr;
+            break;
+        }
+
+        // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
+        for (uint32_t i = 0; i < 4; i++)
+        {
+            if (isComponentEnabled(compMask, i))
+            {
+                if (compCtrl[i] == ComponentControl::StoreSrc)
+                {
+                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
+                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
+                    // if x or y, use vi128XY permute result, else use vi128ZW
+                    Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
+                    Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
+
+                    // sign extend each SIMD8 half separately
+                    Value *temp_lo = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
+                    Value *temp_hi = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
+
+                    // denormalize if needed: cast to float, then scale by conversionFactor
+                    if (conversionType != CONVERT_NONE)
+                    {
+                        temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
+                        temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
+                    }
+
+                    vVertexElements[currentVertexElement] = VUNDEF2_F(); // pack the two SIMD8 halves into one SIMD16 value
+                    vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
+                    vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
+
+                    currentVertexElement += 1;
+                }
+                else
+                {
+                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
+                }
+
+                if (currentVertexElement > 3)
+                {
+                    StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
+                    // reset to the next vVertexElement to output
+                    currentVertexElement = 0;
+                }
+            }
+        }
+    }
+    // else zero extend
+    else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
+    {
+        // init denormalize variables if needed; both are only read below when conversionType != CONVERT_NONE
+        Instruction::CastOps fpCast;
+        Value *conversionFactor;
+
+        switch (conversionType)
+        {
+        case CONVERT_NORMALIZED:
+            fpCast = Instruction::CastOps::UIToFP;
+            conversionFactor = VIMMED1((float)(1.0 / 255.0));
+            break;
+        case CONVERT_USCALED:
+            fpCast = Instruction::CastOps::UIToFP;
+            conversionFactor = VIMMED1((float)(1.0));
+            break;
+        case CONVERT_SSCALED:
+            SWR_INVALID("Type should not be zero extended!");
+            conversionFactor = nullptr; // NOTE(review): fpCast stays unset here, yet the denormalize step below still runs for this conversionType
+            break;
+        default:
+            SWR_ASSERT(conversionType == CONVERT_NONE);
+            conversionFactor = nullptr;
+            break;
+        }
+
+        // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
+        for (uint32_t i = 0; i < 4; i++)
+        {
+            if (isComponentEnabled(compMask, i))
+            {
+                if (compCtrl[i] == ComponentControl::StoreSrc)
+                {
+                    // pshufb masks for each component; -1 entries produce the zero bytes shown in the layout comment below
+                    Value *vConstMask;
+                    switch (swizzle[i])
+                    {
+                    case 0:
+                        // x shuffle mask
+                        vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
+                            0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
+                        break;
+                    case 1:
+                        // y shuffle mask
+                        vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
+                            1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
+                        break;
+                    case 2:
+                        // z shuffle mask
+                        vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
+                            2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
+                        break;
+                    case 3:
+                        // w shuffle mask
+                        vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
+                            3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
+                        break;
+                    default:
+                        vConstMask = nullptr; // NOTE(review): a swizzle outside 0-3 leaves a null mask that is still passed to PSHUFB below
+                        break;
+                    }
+
+                    Value *vGatherResult_lo = EXTRACT2_I(vGatherResult, 0);
+                    Value *vGatherResult_hi = EXTRACT2_I(vGatherResult, 1);
+
+                    Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
+                    Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
+
+                    // after pshufb for x channel
+                    // 256i - 0    1    2    3    4    5    6    7
+                    //        x000 x000 x000 x000 x000 x000 x000 x000
+
+                    // denormalize if needed: cast to float, then scale by conversionFactor
+                    if (conversionType != CONVERT_NONE)
+                    {
+                        temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
+                        temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
+                    }
+
+                    vVertexElements[currentVertexElement] = VUNDEF2_F(); // pack the two SIMD8 halves into one SIMD16 value
+                    vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0); // NOTE(review): temp_lo/temp_hi are still vGatherTy (int32) when conversionType == CONVERT_NONE - confirm INSERT2_F accepts that
+                    vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
+
+                    currentVertexElement += 1;
+                }
+                else
+                {
+                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
+                }
+
+                if (currentVertexElement > 3)
+                {
+                    StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
+                    // reset to the next vVertexElement to output
+                    currentVertexElement = 0;
+                }
+            }
+        }
+    }
+    else
+    {
+        SWR_INVALID("Unsupported conversion type");
+    }
+}
+
+#endif
 //////////////////////////////////////////////////////////////////////////
 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends, 
 /// denormalizes if needed, converts to F32 if needed, and positions in 
@@ -2318,6 +2707,272 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
     }
 }
 
+#if USE_SIMD16_BUILDER
+//////////////////////////////////////////////////////////////////////////
+/// @brief SIMD16 version of Shuffle16bpcGather. Sign extends, zero extends or
+/// half-float converts the gathered 16bpc verts, denormalizes if needed, and
+/// stores each completed group of 4 vertex elements via StoreVertexElements2.
+/// @param args: (tuple of args, listed below)
+///   @param vGatherResult[2] - gathered 16bpc data; [0] feeds x/y, [1] feeds z/w
+///   @param pVtxOut - output pointer handed to StoreVertexElements2
+///   @param extendType - SExt/SIToFP/FPExt (sign extend path) or ZExt/UIToFP (zero extend path)
+///   @param conversionType - CONVERT_NONE, or the normalized/scaled conversion to apply
+///   @param currentVertexElement - in/out index of the next vVertexElements slot to fill
+///   @param outputElt - in/out element index, incremented after every store
+///   @param compMask - component enable mask
+///   @param compCtrl - per-component control values
+///   @param vVertexElements[4] - vertex components to output
+void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args)
+{
+    // Unpack tuple args
+    Value* (&vGatherResult)[2] = std::get<0>(args);
+    Value* pVtxOut = std::get<1>(args);
+    const Instruction::CastOps extendType = std::get<2>(args);
+    const ConversionType conversionType = std::get<3>(args);
+    uint32_t &currentVertexElement = std::get<4>(args);
+    uint32_t &outputElt = std::get<5>(args);
+    const ComponentEnable compMask = std::get<6>(args);
+    const ComponentControl(&compCtrl)[4] = std::get<7>(args);
+    Value* (&vVertexElements)[4] = std::get<8>(args);
+
+    // cast types
+    Type *vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+    Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
+
+    // have to do extra work for sign extending
+    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
+    {
+        // is this PP float?
+        const bool bFP = (extendType == Instruction::CastOps::FPExt);
+
+        Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
+        Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
+
+        // shuffle mask
+        Value *vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+                                      0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
+        Value *vi128XY_lo = nullptr;
+        Value *vi128XY_hi = nullptr;
+        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
+        {
+            // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
+
+            Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[0], 0);
+            Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[0], 1);
+
+            Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
+            Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
+
+            // after pshufb: group components together in each 128bit lane
+            // 256i - 0    1    2    3    4    5    6    7
+            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
+
+            vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+            vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+
+            // after PERMD: move and pack xy components into each 128bit lane
+            // 256i - 0    1    2    3    4    5    6    7
+            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
+        }
+
+        // do the same for zw components
+        Value *vi128ZW_lo = nullptr;
+        Value *vi128ZW_hi = nullptr;
+        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
+        {
+            Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[1], 0);
+            Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[1], 1);
+
+            Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
+            Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
+
+            vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+            vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+        }
+
+        // init denormalize variables; conversionFactor stays null when no conversion applies
+        Instruction::CastOps IntToFpCast = Instruction::CastOps::SIToFP;
+        Value *conversionFactor = nullptr;
+
+        switch (conversionType)
+        {
+        case CONVERT_NORMALIZED:
+            IntToFpCast = Instruction::CastOps::SIToFP;
+            conversionFactor = VIMMED1((float)(1.0 / 32767.0));
+            break;
+        case CONVERT_SSCALED:
+            IntToFpCast = Instruction::CastOps::SIToFP;
+            conversionFactor = VIMMED1((float)(1.0));
+            break;
+        case CONVERT_USCALED:
+            SWR_INVALID("Type should not be sign extended!");
+            conversionFactor = nullptr;
+            break;
+        default:
+            SWR_ASSERT(conversionType == CONVERT_NONE);
+            conversionFactor = nullptr;
+            break;
+        }
+
+        // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
+        for (uint32_t i = 0; i < 4; i++)
+        {
+            if (isComponentEnabled(compMask, i))
+            {
+                if (compCtrl[i] == ComponentControl::StoreSrc)
+                {
+                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
+                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
+                    // if x or y, use vi128XY permute result, else use vi128ZW
+                    Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
+                    Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
+
+                    if (bFP)
+                    {
+                        // extract 128 bit lanes to sign extend each component
+                        Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
+                        Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
+
+                        vVertexElements[currentVertexElement] = VUNDEF2_F();
+                        vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
+                        vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
+                    }
+                    else
+                    {
+                        // extract 128 bit lanes to sign extend each component
+                        Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
+                        Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
+
+                        // denormalize if needed (factor is null for CONVERT_NONE and rejected types)
+                        if (conversionFactor != nullptr)
+                        {
+                            temp_lo = FMUL(CAST(IntToFpCast, temp_lo, mSimdFP32Ty), conversionFactor);
+                            temp_hi = FMUL(CAST(IntToFpCast, temp_hi, mSimdFP32Ty), conversionFactor);
+                        }
+
+                        vVertexElements[currentVertexElement] = VUNDEF2_F();
+                        vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
+                        vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
+                    }
+
+                    currentVertexElement += 1;
+                }
+                else
+                {
+                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
+                }
+
+                if (currentVertexElement > 3)
+                {
+                    StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
+                    // reset to the next vVertexElement to output
+                    currentVertexElement = 0;
+                }
+            }
+        }
+    }
+    // else zero extend
+    else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
+    {
+        // pshufb masks for each component
+        Value *vConstMask[2] = { nullptr, nullptr };
+
+        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
+        {
+            // x/z shuffle mask
+            vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
+                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1 });
+        }
+
+        if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
+        {
+            // y/w shuffle mask
+            vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
+                2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
+        }
+
+        // init denormalize variables; conversionFactor stays null when no conversion applies
+        Instruction::CastOps fpCast = Instruction::CastOps::UIToFP;
+        Value* conversionFactor = nullptr;
+
+        switch (conversionType)
+        {
+        case CONVERT_NORMALIZED:
+            fpCast = Instruction::CastOps::UIToFP;
+            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
+            break;
+        case CONVERT_USCALED:
+            fpCast = Instruction::CastOps::UIToFP;
+            conversionFactor = VIMMED1((float)(1.0f));
+            break;
+        case CONVERT_SSCALED:
+            SWR_INVALID("Type should not be zero extended!");
+            conversionFactor = nullptr;
+            break;
+        default:
+            SWR_ASSERT(conversionType == CONVERT_NONE);
+            conversionFactor = nullptr;
+            break;
+        }
+
+        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
+        for (uint32_t i = 0; i < 4; i++)
+        {
+            if (isComponentEnabled(compMask, i))
+            {
+                if (compCtrl[i] == ComponentControl::StoreSrc)
+                {
+                    // select correct constMask for x/z or y/w pshufb
+                    uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
+                    // x and y come from the first gather result, z and w from the second
+                    uint32_t selectedGather = (i < 2) ? 0 : 1;
+
+                    // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
+
+                    Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[selectedGather], 0);
+                    Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[selectedGather], 1);
+
+                    Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
+                    Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
+
+                    // after pshufb mask for x channel; z uses the same shuffle from the second gather
+                    // 256i - 0    1    2    3    4    5    6    7
+                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
+
+                    // denormalize if needed (factor is null for CONVERT_NONE and rejected types)
+                    if (conversionFactor != nullptr)
+                    {
+                        temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
+                        temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
+                    }
+
+                    vVertexElements[currentVertexElement] = VUNDEF2_F();
+                    vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
+                    vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
+
+                    currentVertexElement += 1;
+                }
+                else
+                {
+                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
+                }
+
+                if (currentVertexElement > 3)
+                {
+                    StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
+                    // reset to the next vVertexElement to output
+                    currentVertexElement = 0;
+                }
+            }
+        }
+    }
+    else
+    {
+        SWR_INVALID("Unsupported conversion type");
+    }
+}
+
+#endif
 //////////////////////////////////////////////////////////////////////////
 /// @brief Output a simdvertex worth of elements to the current outputElt
 /// @param pVtxOut - base address of VIN output struct
@@ -2438,7 +3093,14 @@ Value* FetchJit::GenerateCompCtrlVector2(const ComponentControl ctrl)
         case Store1Int: return VIMMED2_1(1);
         case StoreVertexId:
         {
-            Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimd2FP32Ty);
+            Value* pId = VUNDEF2_F();
+
+            Value* pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID  })), mSimdFP32Ty);
+            Value* pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
+
+            pId = INSERT2_F(pId, pId_lo, 0);
+            pId = INSERT2_F(pId, pId_hi, 1);
+
             return VBROADCAST2(pId);
         }
         case StoreInstanceId:
-- 
2.30.2