From 2559f2b93edc74d943fa1441433288a92263f854 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Mon, 16 Oct 2017 18:39:41 -0500
Subject: [PATCH] swr/rast: Widen fetch shader to SIMD16 (disabled for now)

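Refactored the fetch shader's gather operation (FetchJit::JitGatherVertices)
to process 16 elements at a time via paired SIMD8 operations.  The new path
is compiled only when USE_SIMD16_GATHERS is nonzero; it is defined to 0 here.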

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
---
 .../swr/rasterizer/jitter/fetch_jit.cpp       | 441 +++++++++++++++++-
 1 file changed, 428 insertions(+), 13 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 1e3db902bb6..30dbcfc8ce1 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -89,7 +89,13 @@ struct FetchJit : public Builder
 
     void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
 #if USE_SIMD16_SHADERS
+#define USE_SIMD16_GATHERS 0
+
+#if USE_SIMD16_GATHERS
+    void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2);
+#else
     void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2);
+#endif
 #else
     void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
 #endif
@@ -279,8 +285,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
     }
     else
     {
+#if USE_SIMD16_GATHERS
+        JitGatherVertices(fetchState, streams, vIndices, vIndices2, pVtxOut, false);
+#else
         JitGatherVertices(fetchState, streams, vIndices, pVtxOut, false);
         JitGatherVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)), true);
+#endif
     }
 #else
     (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
@@ -792,8 +802,13 @@ void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
 /// @param vIndices - vector value of indices to gather
 /// @param pVtxOut - value pointer to output simdvertex struct
 #if USE_SIMD16_SHADERS
+#if USE_SIMD16_GATHERS
+void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
+    Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2)
+#else
 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
     Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2)
+#endif
 #else
 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
     Value* streams, Value* vIndices, Value* pVtxOut)
@@ -802,6 +817,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
     uint32_t currentVertexElement = 0;
     uint32_t outputElt = 0;
     Value* vVertexElements[4];
+#if USE_SIMD16_GATHERS
+    Value* vVertexElements2[4];
+#endif
 
     Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
     Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
@@ -809,7 +827,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
     Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
     curInstance->setName("curInstance");
 
-    for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt)
+    for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
     {
         const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
 
@@ -836,7 +854,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
         maxVertex = LOAD(maxVertex);
 
         Value *minVertex = NULL;
-        if (fetchState.bPartialVertexBuffer) {
+        if (fetchState.bPartialVertexBuffer)
+        {
             // min vertex index for low bounds OOB checking
             minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
             minVertex = LOAD(minVertex);
@@ -849,10 +868,13 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
         }
 
         Value *vCurIndices;
+#if USE_SIMD16_GATHERS
+        Value *vCurIndices2;
+#endif
         Value *startOffset;
         Value *vInstanceStride = VIMMED1(0);
 
-        if(ied.InstanceEnable)
+        if (ied.InstanceEnable)
         {
             Value* stepRate = C(ied.InstanceAdvancementState);
 
@@ -867,6 +889,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
             calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
 
             vCurIndices = VBROADCAST(calcInstance);
+#if USE_SIMD16_GATHERS
+            vCurIndices2 = VBROADCAST(calcInstance);
+#endif
 
             startOffset = startInstance;
         }
@@ -878,6 +903,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
 
             // offset indices by baseVertex
             vCurIndices = ADD(vIndices, vBaseVertex);
+#if USE_SIMD16_GATHERS
+            vCurIndices2 = ADD(vIndices2, vBaseVertex);
+#endif
 
             startOffset = startVertex;
             SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
@@ -886,6 +914,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
         {
             // offset indices by baseVertex
             vCurIndices = ADD(vIndices, vBaseVertex);
+#if USE_SIMD16_GATHERS
+            vCurIndices2 = ADD(vIndices2, vBaseVertex);
+#endif
 
             startOffset = startVertex;
         }
@@ -903,7 +934,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
         // if we have a negative value, we're already OOB. clamp at 0.
         maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
 
-        if (fetchState.bPartialVertexBuffer) {
+        if (fetchState.bPartialVertexBuffer)
+        {
             // similary for min vertex
             minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
             Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
@@ -920,6 +952,61 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
         // is the element is <= the partially valid size
         Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
 
+#if USE_SIMD16_GATHERS
+        // override cur indices with 0 if pitch is 0 (applies to both SIMD8 halves)
+        Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
+        vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
+        vCurIndices2 = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices2);
+
+        // are vertices partially OOB?
+        Value* vMaxVertex = VBROADCAST(maxVertex);
+        Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
+        Value* vPartialOOBMask2 = ICMP_EQ(vCurIndices2, vMaxVertex);
+
+        // are vertices fully in bounds?
+        Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
+        Value* vMaxGatherMask2 = ICMP_ULT(vCurIndices2, vMaxVertex);
+
+        Value *vGatherMask;
+        Value *vGatherMask2;
+        if (fetchState.bPartialVertexBuffer)
+        {
+            // are vertices below minVertex limit?
+            Value *vMinVertex = VBROADCAST(minVertex);
+            Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
+            Value *vMinGatherMask2 = ICMP_UGE(vCurIndices2, vMinVertex);
+
+            // only fetch lanes that pass both tests; each half must use its own max/min masks
+            vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
+            vGatherMask2 = AND(vMaxGatherMask2, vMinGatherMask2);
+        }
+        else
+        {
+            vGatherMask = vMaxGatherMask;
+            vGatherMask2 = vMaxGatherMask2;
+        }
+
+        // blend in any partially OOB indices that have valid elements
+        vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
+        vGatherMask2 = SELECT(vPartialOOBMask2, vElementInBoundsMask, vGatherMask2);
+        Value *pMask = vGatherMask;    // mask before VMASK(); consumed by the 64bpc gather path
+        Value *pMask2 = vGatherMask2;  // same, for the second SIMD8 half
+        vGatherMask = VMASK(vGatherMask);
+        vGatherMask2 = VMASK(vGatherMask2);
+
+        // calculate the actual offsets into the VB
+        Value* vOffsets = MUL(vCurIndices, vStride);
+        vOffsets = ADD(vOffsets, vAlignmentOffsets);
+
+        Value* vOffsets2 = MUL(vCurIndices2, vStride);
+        vOffsets2 = ADD(vOffsets2, vAlignmentOffsets);
+
+        // if instance stride enable is:
+        //  true  - add product of the instanceID and advancement state to the offset into the VB
+        //  false - value of vInstanceStride has been initialized to zero
+        vOffsets = ADD(vOffsets, vInstanceStride);
+        vOffsets2 = ADD(vOffsets2, vInstanceStride);
+#else
         // override cur indices with 0 if pitch is 0
         Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
         vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
@@ -932,14 +1019,17 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
         Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
 
         Value *vGatherMask;
-        if (fetchState.bPartialVertexBuffer) {
+        if (fetchState.bPartialVertexBuffer)
+        {
             // are vertices below minVertex limit?
             Value *vMinVertex = VBROADCAST(minVertex);
             Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
 
             // only fetch lanes that pass both tests
             vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
-        } else {
+        }
+        else
+        {
             vGatherMask = vMaxGatherMask;
         }
 
@@ -957,6 +1047,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
         //  false - value of vInstanceStride has been initialialized to zero
         vOffsets = ADD(vOffsets, vInstanceStride);
 
+#endif
         // Packing and component control 
         ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
         const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1, 
@@ -965,6 +1056,35 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
         // Special gather/conversion for formats without equal component sizes
         if (IsOddFormat((SWR_FORMAT)ied.Format))
         {
+#if USE_SIMD16_GATHERS
+            Value *pResults[4];
+            Value *pResults2[4];
+            CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
+            CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask2, pStreamBase, vOffsets2, pResults2);
+            ConvertFormat((SWR_FORMAT)ied.Format, pResults);
+            ConvertFormat((SWR_FORMAT)ied.Format, pResults2);
+
+            for (uint32_t c = 0; c < 4; c += 1)
+            {
+                if (isComponentEnabled(compMask, c))
+                {
+                    vVertexElements[currentVertexElement] = pResults[c];
+                    vVertexElements2[currentVertexElement] = pResults2[c];
+                    currentVertexElement++;
+
+                    if (currentVertexElement > 3)
+                    {
+                        StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
+                        StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
+
+                        outputElt += 1;
+
+                        // reset to the next vVertexElement to output
+                        currentVertexElement = 0;
+                    }
+                }
+            }
+#else
             Value* pResults[4];
             CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
             ConvertFormat((SWR_FORMAT)ied.Format, pResults);
@@ -982,20 +1102,75 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                     }
                 }
             }
+#endif
         }
         else if(info.type[0] == SWR_TYPE_FLOAT)
         {
             ///@todo: support 64 bit vb accesses
             Value* gatherSrc = VIMMED1(0.0f);
+#if USE_SIMD16_GATHERS
+            Value* gatherSrc2 = VIMMED1(0.0f);
+#endif
 
             SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format), 
                 "Unsupported format for standard gather fetch.");
 
             // Gather components from memory to store in a simdvertex structure
-            switch(bpc)
+            switch (bpc)
             {
                 case 16:
                 {
+#if USE_SIMD16_GATHERS
+                    Value* vGatherResult[2];
+                    Value* vGatherResult2[2];
+                    Value *vMask;
+                    Value *vMask2;
+
+                    // if we have at least one component out of x or y to fetch
+                    if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
+                    {
+                        // save mask as it is zero'd out after each gather
+                        vMask = vGatherMask;
+                        vMask2 = vGatherMask2;
+
+                        vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                        vGatherResult2[0] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1));
+                        // e.g. result of first 8x32bit integer gather for 16bit components
+                        // 256i - 0    1    2    3    4    5    6    7
+                        //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
+                        //
+                    }
+
+                    // if we have at least one component out of z or w to fetch
+                    if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
+                    {
+                        // offset base to the next components(zw) in the vertex to gather
+                        pStreamBase = GEP(pStreamBase, C((char)4));
+                        vMask = vGatherMask;
+                        vMask2 = vGatherMask2;
+
+                        vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                        vGatherResult2[1] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1));
+                        // e.g. result of second 8x32bit integer gather for 16bit components
+                        // 256i - 0    1    2    3    4    5    6    7
+                        //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw 
+                        //
+                    }
+
+
+                    // if we have at least one component to shuffle into place
+                    if (compMask)
+                    {
+                        Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
+                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
+                        Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), Instruction::CastOps::FPExt, CONVERT_NONE,
+                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2);
+
+                        // Shuffle gathered components into place in simdvertex struct
+                        Shuffle16bpcGather(args, false);  // outputs to vVertexElements ref
+                        Shuffle16bpcGather(args2, true);  // outputs to vVertexElements ref
+                    }
+#else
                     Value* vGatherResult[2];
                     Value *vMask;
 
@@ -1036,12 +1211,58 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                         Shuffle16bpcGather(args);  // outputs to vVertexElements ref
 #endif
                     }
+#endif
                 }
                     break;
                 case 32:
                 {
-                    for (uint32_t i = 0; i < 4; i++)
+                    for (uint32_t i = 0; i < 4; i += 1)
                     {
+#if USE_SIMD16_GATHERS
+                        if (isComponentEnabled(compMask, i))
+                        {
+                            // if we need to gather the component
+                            if (compCtrl[i] == StoreSrc)
+                            {
+                                // save mask as it is zero'd out after each gather
+                                Value *vMask = vGatherMask;
+                                Value *vMask2 = vGatherMask2;
+
+                                // Gather a SIMD of vertices
+                                // APIs allow a 4GB range for offsets
+                                // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
+                                // But, we know that elements must be aligned for FETCH. :)
+                                // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
+                                Value *vShiftedOffsets = VPSRLI(vOffsets, C(1));
+                                Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1));
+                                vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, C((char)2));
+                                vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vMask2, C((char)2));
+
+                                currentVertexElement += 1;
+                            }
+                            else
+                            {
+                                vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
+                                vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
+
+                                currentVertexElement += 1;
+                            }
+
+                            if (currentVertexElement > 3)
+                            {
+                                StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
+                                StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
+
+                                outputElt += 1;
+
+                                // reset to the next vVertexElement to output
+                                currentVertexElement = 0;
+                            }
+                        }
+
+                        // offset base to the next component in the vertex to gather
+                        pStreamBase = GEP(pStreamBase, C((char)4));
+#else
                         if (isComponentEnabled(compMask, i))
                         {
                             // if we need to gather the component
@@ -1073,18 +1294,85 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                                 // reset to the next vVertexElement to output
                                 currentVertexElement = 0;
                             }
-
                         }
 
                         // offset base to the next component in the vertex to gather
                         pStreamBase = GEP(pStreamBase, C((char)4));
+#endif
                     }
                 }
                     break;
                 case 64:
                 {
-                    for (uint32_t i = 0; i < 4; i++)
+                    for (uint32_t i = 0; i < 4; i += 1)
                     {
+#if USE_SIMD16_GATHERS
+                        if (isComponentEnabled(compMask, i))
+                        {
+                            // if we need to gather the component
+                            if (compCtrl[i] == StoreSrc)
+                            {
+                                Value *vMaskLo = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
+                                Value *vMaskLo2 = VSHUFFLE(pMask2, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
+                                Value *vMaskHi = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
+                                Value *vMaskHi2 = VSHUFFLE(pMask2, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
+                                vMaskLo = S_EXT(vMaskLo, VectorType::get(mInt64Ty, 4));
+                                vMaskLo2 = S_EXT(vMaskLo2, VectorType::get(mInt64Ty, 4));
+                                vMaskHi = S_EXT(vMaskHi, VectorType::get(mInt64Ty, 4));
+                                vMaskHi2 = S_EXT(vMaskHi2, VectorType::get(mInt64Ty, 4));
+                                vMaskLo = BITCAST(vMaskLo, VectorType::get(mDoubleTy, 4));
+                                vMaskLo2 = BITCAST(vMaskLo2, VectorType::get(mDoubleTy, 4));
+                                vMaskHi = BITCAST(vMaskHi, VectorType::get(mDoubleTy, 4));
+                                vMaskHi2 = BITCAST(vMaskHi2, VectorType::get(mDoubleTy, 4));
+
+                                Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
+                                Value *vOffsetsLo2 = VEXTRACTI128(vOffsets2, C(0));
+                                Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
+                                Value *vOffsetsHi2 = VEXTRACTI128(vOffsets2, C(1));
+
+                                Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
+
+                                Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo, C((char)1));
+                                Value* pGatherLo2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo2, vMaskLo2, C((char)1));
+                                Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi, C((char)1));
+                                Value* pGatherHi2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi2, vMaskHi2, C((char)1));
+
+                                pGatherLo = VCVTPD2PS(pGatherLo);
+                                pGatherLo2 = VCVTPD2PS(pGatherLo2);
+                                pGatherHi = VCVTPD2PS(pGatherHi);
+                                pGatherHi2 = VCVTPD2PS(pGatherHi2);
+
+                                Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
+                                Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
+
+                                vVertexElements[currentVertexElement] = pGather;
+                                vVertexElements2[currentVertexElement] = pGather2;
+
+                                currentVertexElement += 1;
+                            }
+                            else
+                            {
+                                vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
+                                vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
+
+                                currentVertexElement += 1;
+                            }
+
+                            if (currentVertexElement > 3)
+                            {
+                                StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
+                                StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
+
+                                outputElt += 1;
+
+                                // reset to the next vVertexElement to output
+                                currentVertexElement = 0;
+                            }
+                        }
+
+                        // offset base to the next component  in the vertex to gather
+                        pStreamBase = GEP(pStreamBase, C((char)8));
+#else
                         if (isComponentEnabled(compMask, i))
                         {
                             // if we need to gather the component
@@ -1129,11 +1417,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                                 // reset to the next vVertexElement to output
                                 currentVertexElement = 0;
                             }
-
                         }
 
                         // offset base to the next component  in the vertex to gather
                         pStreamBase = GEP(pStreamBase, C((char)8));
+#endif
                     }
                 }
                     break;
@@ -1180,6 +1468,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
 
             // value substituted when component of gather is masked
             Value* gatherSrc = VIMMED1(0);
+#if USE_SIMD16_GATHERS
+            Value* gatherSrc2 = VIMMED1(0);
+#endif
 
             // Gather components from memory to store in a simdvertex structure
             switch (bpc)
@@ -1187,8 +1478,24 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                 case 8:
                 {
                     // if we have at least one component to fetch
-                    if(compMask)
+                    if (compMask)
                     {
+#if USE_SIMD16_GATHERS
+                        Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
+                        Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2, C((char)1));
+                        // e.g. result of an 8x32bit integer gather for 8bit components
+                        // 256i - 0    1    2    3    4    5    6    7
+                        //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw 
+
+                        Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
+                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
+                        Shuffle8bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
+                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2, info.swizzle);
+
+                        // Shuffle gathered components into place in simdvertex struct
+                        Shuffle8bpcGatherd(args, false); // outputs to vVertexElements ref
+                        Shuffle8bpcGatherd(args2, true); // outputs to vVertexElements ref
+#else
                         Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
                         // e.g. result of an 8x32bit integer gather for 8bit components
                         // 256i - 0    1    2    3    4    5    6    7
@@ -1202,12 +1509,63 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                         Shuffle8bpcGatherd(args, useVertexID2); // outputs to vVertexElements ref
 #else
                         Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
+#endif
 #endif
                     }
                 }
                 break;
                 case 16:
                 {
+#if USE_SIMD16_GATHERS
+                    Value* vGatherResult[2];
+                    Value *vMask;
+                    Value* vGatherResult2[2];
+                    Value *vMask2;
+
+                    // if we have at least one component out of x or y to fetch
+                    if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
+                    {
+                        // save mask as it is zero'd out after each gather
+                        vMask = vGatherMask;
+                        vMask2 = vGatherMask2;
+
+                        vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                        vGatherResult2[0] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1));
+                        // e.g. result of first 8x32bit integer gather for 16bit components
+                        // 256i - 0    1    2    3    4    5    6    7
+                        //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
+                        //
+                    }
+
+                    // if we have at least one component out of z or w to fetch
+                    if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
+                    {
+                        // offset base to the next components(zw) in the vertex to gather
+                        pStreamBase = GEP(pStreamBase, C((char)4));
+                        vMask = vGatherMask;
+                        vMask2 = vGatherMask2;
+
+                        vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                        vGatherResult2[1] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1));
+                        // e.g. result of second 8x32bit integer gather for 16bit components
+                        // 256i - 0    1    2    3    4    5    6    7
+                        //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw 
+                        //
+                    }
+
+                    // if we have at least one component to shuffle into place
+                    if (compMask)
+                    {
+                        Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
+                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
+                        Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
+                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2);
+
+                        // Shuffle gathered components into place in simdvertex struct
+                        Shuffle16bpcGather(args, false);  // outputs to vVertexElements ref
+                        Shuffle16bpcGather(args2, true);  // outputs to vVertexElements ref
+                    }
+#else
                     Value* vGatherResult[2];
                     Value *vMask;
 
@@ -1248,6 +1606,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                         Shuffle16bpcGather(args);  // outputs to vVertexElements ref
 #endif
                     }
+#endif
                 }
                 break;
                 case 32:
@@ -1260,6 +1619,38 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                             // if we need to gather the component
                             if (compCtrl[i] == StoreSrc)
                             {
+#if USE_SIMD16_GATHERS
+                                // save mask as it is zero'd out after each gather
+                                Value *vMask = vGatherMask;
+                                Value *vMask2 = vGatherMask2;
+
+                                Value *pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                                Value *pGather2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1));
+
+                                if (conversionType == CONVERT_USCALED)
+                                {
+                                    pGather = UI_TO_FP(pGather, mSimdFP32Ty);
+                                    pGather2 = UI_TO_FP(pGather2, mSimdFP32Ty);
+                                }
+                                else if (conversionType == CONVERT_SSCALED)
+                                {
+                                    pGather = SI_TO_FP(pGather, mSimdFP32Ty);
+                                    pGather2 = SI_TO_FP(pGather2, mSimdFP32Ty);
+                                }
+                                else if (conversionType == CONVERT_SFIXED)
+                                {
+                                    pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
+                                    pGather2 = FMUL(SI_TO_FP(pGather2, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
+                                }
+
+                                vVertexElements[currentVertexElement] = pGather;
+                                vVertexElements2[currentVertexElement] = pGather2;
+                                // e.g. result of a single 8x32bit integer gather for 32bit components
+                                // 256i - 0    1    2    3    4    5    6    7
+                                //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx 
+
+                                currentVertexElement += 1;
+#else
                                 // save mask as it is zero'd out after each gather
                                 Value *vMask = vGatherMask;
 
@@ -1282,11 +1673,19 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                                 // e.g. result of a single 8x32bit integer gather for 32bit components
                                 // 256i - 0    1    2    3    4    5    6    7
                                 //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx 
+#endif
                             }
                             else
                             {
 #if USE_SIMD16_SHADERS
+#if USE_SIMD16_GATHERS
+                                vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
+                                vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
+
+                                currentVertexElement += 1;
+#else
                                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
+#endif
 #else
                                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
 #endif
@@ -1294,7 +1693,15 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
 
                             if (currentVertexElement > 3)
                             {
+#if USE_SIMD16_GATHERS
+                                StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
+                                StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
+
+                                outputElt += 1;
+#else
                                 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+#endif
+
                                 // reset to the next vVertexElement to output
                                 currentVertexElement = 0;
                             }
@@ -1311,8 +1718,16 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
     }
 
     // if we have a partially filled vVertexElement struct, output it
-    if(currentVertexElement > 0){
+    if (currentVertexElement > 0)
+    {
+#if USE_SIMD16_GATHERS
+        StoreVertexElements(pVtxOut, outputElt, currentVertexElement, vVertexElements);
+        StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, currentVertexElement, vVertexElements2);
+
+        outputElt += 1;
+#else
         StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
+#endif
     }
 }
 
-- 
2.30.2