swr/rast: Widen fetch shader to SIMD16 (disabled for now)

author Tim Rowley <timothy.o.rowley@intel.com>

Mon, 16 Oct 2017 23:39:41 +0000 (18:39 -0500)

committer Tim Rowley <timothy.o.rowley@intel.com>

Thu, 19 Oct 2017 18:10:55 +0000 (13:10 -0500)
author Tim Rowley <timothy.o.rowley@intel.com>
Mon, 16 Oct 2017 23:39:41 +0000 (18:39 -0500)
committer Tim Rowley <timothy.o.rowley@intel.com>
Thu, 19 Oct 2017 18:10:55 +0000 (13:10 -0500)
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp

index 1e3db902bb60928edf89fef989335c29a89bdd10..30dbcfc8ce1232545336882ce728fc817193268c 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -89,7 +89,13 @@ struct FetchJit : public Builder
  
      void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
  #if USE_SIMD16_SHADERS
+#define USE_SIMD16_GATHERS 0
+
+#if USE_SIMD16_GATHERS
+    void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2);
+#else
      void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2);
+#endif
  #else
      void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
  #endif
@@ -279,8 +285,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
      }
      else
      {
+#if USE_SIMD16_GATHERS
+        JitGatherVertices(fetchState, streams, vIndices, vIndices2, pVtxOut, false);
+#else
          JitGatherVertices(fetchState, streams, vIndices, pVtxOut, false);
          JitGatherVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)), true);
+#endif
      }
  #else
      (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
@@ -792,8 +802,13 @@ void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
  /// @param vIndices - vector value of indices to gather
  /// @param pVtxOut - value pointer to output simdvertex struct
  #if USE_SIMD16_SHADERS
+#if USE_SIMD16_GATHERS
+void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
+    Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2)
+#else
  void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
      Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2)
+#endif
  #else
  void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
      Value* streams, Value* vIndices, Value* pVtxOut)
@@ -802,6 +817,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
      uint32_t currentVertexElement = 0;
      uint32_t outputElt = 0;
      Value* vVertexElements[4];
+#if USE_SIMD16_GATHERS
+    Value* vVertexElements2[4];
+#endif
  
      Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
      Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
@@ -809,7 +827,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
      Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
      curInstance->setName("curInstance");
  
-    for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt)
+    for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
      {
          const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
  
@@ -836,7 +854,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
          maxVertex = LOAD(maxVertex);
  
          Value *minVertex = NULL;
-        if (fetchState.bPartialVertexBuffer) {
+        if (fetchState.bPartialVertexBuffer)
+        {
              // min vertex index for low bounds OOB checking
              minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
              minVertex = LOAD(minVertex);
@@ -849,10 +868,13 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
          }
  
          Value *vCurIndices;
+#if USE_SIMD16_GATHERS
+        Value *vCurIndices2;
+#endif
          Value *startOffset;
          Value *vInstanceStride = VIMMED1(0);
  
-        if(ied.InstanceEnable)
+        if (ied.InstanceEnable)
          {
              Value* stepRate = C(ied.InstanceAdvancementState);
  
@@ -867,6 +889,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
              calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
  
              vCurIndices = VBROADCAST(calcInstance);
+#if USE_SIMD16_GATHERS
+            vCurIndices2 = VBROADCAST(calcInstance);
+#endif
  
              startOffset = startInstance;
          }
@@ -878,6 +903,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
  
              // offset indices by baseVertex
              vCurIndices = ADD(vIndices, vBaseVertex);
+#if USE_SIMD16_GATHERS
+            vCurIndices2 = ADD(vIndices2, vBaseVertex);
+#endif
  
              startOffset = startVertex;
              SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
@@ -886,6 +914,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
          {
              // offset indices by baseVertex
              vCurIndices = ADD(vIndices, vBaseVertex);
+#if USE_SIMD16_GATHERS
+            vCurIndices2 = ADD(vIndices2, vBaseVertex);
+#endif
  
              startOffset = startVertex;
          }
@@ -903,7 +934,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
          // if we have a negative value, we're already OOB. clamp at 0.
          maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
  
-        if (fetchState.bPartialVertexBuffer) {
+        if (fetchState.bPartialVertexBuffer)
+        {
              // similary for min vertex
              minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
              Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
@@ -920,6 +952,61 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
          // is the element is <= the partially valid size
          Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
  
+#if USE_SIMD16_GATHERS
+        // override cur indices with 0 if pitch is 0
+        Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
+        vCurIndices2 = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices2);
+
+        // are vertices partially OOB?
+        Value* vMaxVertex = VBROADCAST(maxVertex);
+        Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
+        Value* vPartialOOBMask2 = ICMP_EQ(vCurIndices2, vMaxVertex);
+
+        // are vertices fully in bounds?
+        Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
+        Value* vMaxGatherMask2 = ICMP_ULT(vCurIndices2, vMaxVertex);
+
+        Value *vGatherMask;
+        Value *vGatherMask2;
+        if (fetchState.bPartialVertexBuffer)
+        {
+            // are vertices below minVertex limit?
+            Value *vMinVertex = VBROADCAST(minVertex);
+            Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
+            Value *vMinGatherMask2 = ICMP_UGE(vCurIndices2, vMinVertex);
+
+            // only fetch lanes that pass both tests
+            vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
+            vGatherMask2 = AND(vMaxGatherMask, vMinGatherMask2);
+        }
+        else
+        {
+            vGatherMask = vMaxGatherMask;
+            vGatherMask2 = vMaxGatherMask2;
+        }
+
+        // blend in any partially OOB indices that have valid elements
+        vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
+        vGatherMask2 = SELECT(vPartialOOBMask2, vElementInBoundsMask, vGatherMask2);
+        Value *pMask = vGatherMask;
+        Value *pMask2 = vGatherMask2;
+        vGatherMask = VMASK(vGatherMask);
+        vGatherMask2 = VMASK(vGatherMask2);
+
+        // calculate the actual offsets into the VB
+        Value* vOffsets = MUL(vCurIndices, vStride);
+        vOffsets = ADD(vOffsets, vAlignmentOffsets);
+
+        Value* vOffsets2 = MUL(vCurIndices2, vStride);
+        vOffsets2 = ADD(vOffsets2, vAlignmentOffsets);
+
+        // if instance stride enable is:
+        //  true  - add product of the instanceID and advancement state to the offset into the VB
+        //  false - value of vInstanceStride has been initialized to zero
+        vOffsets = ADD(vOffsets, vInstanceStride);
+        vOffsets2 = ADD(vOffsets2, vInstanceStride);
+
+#else
          // override cur indices with 0 if pitch is 0
          Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
          vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
@@ -932,14 +1019,17 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
          Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
  
          Value *vGatherMask;
-        if (fetchState.bPartialVertexBuffer) {
+        if (fetchState.bPartialVertexBuffer)
+        {
              // are vertices below minVertex limit?
              Value *vMinVertex = VBROADCAST(minVertex);
              Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
  
              // only fetch lanes that pass both tests
              vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
-        } else {
+        }
+        else
+        {
              vGatherMask = vMaxGatherMask;
          }
  
@@ -957,6 +1047,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
          //  false - value of vInstanceStride has been initialialized to zero
          vOffsets = ADD(vOffsets, vInstanceStride);
  
+#endif
          // Packing and component control 
          ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
          const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1, 
@@ -965,6 +1056,35 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
          // Special gather/conversion for formats without equal component sizes
          if (IsOddFormat((SWR_FORMAT)ied.Format))
          {
+#if USE_SIMD16_GATHERS
+            Value *pResults[4];
+            Value *pResults2[4];
+            CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
+            CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask2, pStreamBase, vOffsets2, pResults2);
+            ConvertFormat((SWR_FORMAT)ied.Format, pResults);
+            ConvertFormat((SWR_FORMAT)ied.Format, pResults2);
+
+            for (uint32_t c = 0; c < 4; c += 1)
+            {
+                if (isComponentEnabled(compMask, c))
+                {
+                    vVertexElements[currentVertexElement] = pResults[c];
+                    vVertexElements2[currentVertexElement] = pResults2[c];
+                    currentVertexElement++;
+
+                    if (currentVertexElement > 3)
+                    {
+                        StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
+                        StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
+
+                        outputElt += 1;
+
+                        // reset to the next vVertexElement to output
+                        currentVertexElement = 0;
+                    }
+                }
+            }
+#else
              Value* pResults[4];
              CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
              ConvertFormat((SWR_FORMAT)ied.Format, pResults);
@@ -982,20 +1102,75 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                      }
                  }
              }
+#endif
          }
          else if(info.type[0] == SWR_TYPE_FLOAT)
          {
              ///@todo: support 64 bit vb accesses
              Value* gatherSrc = VIMMED1(0.0f);
+#if USE_SIMD16_GATHERS
+            Value* gatherSrc2 = VIMMED1(0.0f);
+#endif
  
              SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format), 
                  "Unsupported format for standard gather fetch.");
  
              // Gather components from memory to store in a simdvertex structure
-            switch(bpc)
+            switch (bpc)
              {
                  case 16:
                  {
+#if USE_SIMD16_GATHERS
+                    Value* vGatherResult[2];
+                    Value* vGatherResult2[2];
+                    Value *vMask;
+                    Value *vMask2;
+
+                    // if we have at least one component out of x or y to fetch
+                    if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
+                    {
+                        // save mask as it is zero'd out after each gather
+                        vMask = vGatherMask;
+                        vMask2 = vGatherMask2;
+
+                        vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                        vGatherResult2[0] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1));
+                        // e.g. result of first 8x32bit integer gather for 16bit components
+                        // 256i - 0    1    2    3    4    5    6    7
+                        //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
+                        //
+                    }
+
+                    // if we have at least one component out of z or w to fetch
+                    if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
+                    {
+                        // offset base to the next components(zw) in the vertex to gather
+                        pStreamBase = GEP(pStreamBase, C((char)4));
+                        vMask = vGatherMask;
+                        vMask2 = vGatherMask2;
+
+                        vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                        vGatherResult2[1] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1));
+                        // e.g. result of second 8x32bit integer gather for 16bit components
+                        // 256i - 0    1    2    3    4    5    6    7
+                        //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw 
+                        //
+                    }
+
+
+                    // if we have at least one component to shuffle into place
+                    if (compMask)
+                    {
+                        Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
+                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
+                        Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), Instruction::CastOps::FPExt, CONVERT_NONE,
+                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2);
+
+                        // Shuffle gathered components into place in simdvertex struct
+                        Shuffle16bpcGather(args, false);  // outputs to vVertexElements ref
+                        Shuffle16bpcGather(args2, true);  // outputs to vVertexElements2 ref
+                    }
+#else
                      Value* vGatherResult[2];
                      Value *vMask;
  
@@ -1036,12 +1211,58 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                          Shuffle16bpcGather(args);  // outputs to vVertexElements ref
  #endif
                      }
+#endif
                  }
                      break;
                  case 32:
                  {
-                    for (uint32_t i = 0; i < 4; i++)
+                    for (uint32_t i = 0; i < 4; i += 1)
                      {
+#if USE_SIMD16_GATHERS
+                        if (isComponentEnabled(compMask, i))
+                        {
+                            // if we need to gather the component
+                            if (compCtrl[i] == StoreSrc)
+                            {
+                                // save mask as it is zero'd out after each gather
+                                Value *vMask = vGatherMask;
+                                Value *vMask2 = vGatherMask2;
+
+                                // Gather a SIMD of vertices
+                                // APIs allow a 4GB range for offsets
+                                // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
+                                // But, we know that elements must be aligned for FETCH. :)
+                                // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
+                                Value *vShiftedOffsets = VPSRLI(vOffsets, C(1));
+                                Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1));
+                                vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, C((char)2));
+                                vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vMask2, C((char)2));
+
+                                currentVertexElement += 1;
+                            }
+                            else
+                            {
+                                vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
+                                vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
+
+                                currentVertexElement += 1;
+                            }
+
+                            if (currentVertexElement > 3)
+                            {
+                                StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
+                                StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
+
+                                outputElt += 1;
+
+                                // reset to the next vVertexElement to output
+                                currentVertexElement = 0;
+                            }
+                        }
+
+                        // offset base to the next component in the vertex to gather
+                        pStreamBase = GEP(pStreamBase, C((char)4));
+#else
                          if (isComponentEnabled(compMask, i))
                          {
                              // if we need to gather the component
@@ -1073,18 +1294,85 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                                  // reset to the next vVertexElement to output
                                  currentVertexElement = 0;
                              }
-
                          }
  
                          // offset base to the next component in the vertex to gather
                          pStreamBase = GEP(pStreamBase, C((char)4));
+#endif
                      }
                  }
                      break;
                  case 64:
                  {
-                    for (uint32_t i = 0; i < 4; i++)
+                    for (uint32_t i = 0; i < 4; i += 1)
                      {
+#if USE_SIMD16_GATHERS
+                        if (isComponentEnabled(compMask, i))
+                        {
+                            // if we need to gather the component
+                            if (compCtrl[i] == StoreSrc)
+                            {
+                                Value *vMaskLo = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
+                                Value *vMaskLo2 = VSHUFFLE(pMask2, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
+                                Value *vMaskHi = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
+                                Value *vMaskHi2 = VSHUFFLE(pMask2, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
+                                vMaskLo = S_EXT(vMaskLo, VectorType::get(mInt64Ty, 4));
+                                vMaskLo2 = S_EXT(vMaskLo2, VectorType::get(mInt64Ty, 4));
+                                vMaskHi = S_EXT(vMaskHi, VectorType::get(mInt64Ty, 4));
+                                vMaskHi2 = S_EXT(vMaskHi2, VectorType::get(mInt64Ty, 4));
+                                vMaskLo = BITCAST(vMaskLo, VectorType::get(mDoubleTy, 4));
+                                vMaskLo2 = BITCAST(vMaskLo2, VectorType::get(mDoubleTy, 4));
+                                vMaskHi = BITCAST(vMaskHi, VectorType::get(mDoubleTy, 4));
+                                vMaskHi2 = BITCAST(vMaskHi2, VectorType::get(mDoubleTy, 4));
+
+                                Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
+                                Value *vOffsetsLo2 = VEXTRACTI128(vOffsets2, C(0));
+                                Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
+                                Value *vOffsetsHi2 = VEXTRACTI128(vOffsets2, C(1));
+
+                                Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
+
+                                Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo, C((char)1));
+                                Value* pGatherLo2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo2, vMaskLo2, C((char)1));
+                                Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi, C((char)1));
+                                Value* pGatherHi2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi2, vMaskHi2, C((char)1));
+
+                                pGatherLo = VCVTPD2PS(pGatherLo);
+                                pGatherLo2 = VCVTPD2PS(pGatherLo2);
+                                pGatherHi = VCVTPD2PS(pGatherHi);
+                                pGatherHi2 = VCVTPD2PS(pGatherHi2);
+
+                                Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
+                                Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
+
+                                vVertexElements[currentVertexElement] = pGather;
+                                vVertexElements2[currentVertexElement] = pGather2;
+
+                                currentVertexElement += 1;
+                            }
+                            else
+                            {
+                                vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
+                                vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
+
+                                currentVertexElement += 1;
+                            }
+
+                            if (currentVertexElement > 3)
+                            {
+                                StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
+                                StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
+
+                                outputElt += 1;
+
+                                // reset to the next vVertexElement to output
+                                currentVertexElement = 0;
+                            }
+                        }
+
+                        // offset base to the next component  in the vertex to gather
+                        pStreamBase = GEP(pStreamBase, C((char)8));
+#else
                          if (isComponentEnabled(compMask, i))
                          {
                              // if we need to gather the component
@@ -1129,11 +1417,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                                  // reset to the next vVertexElement to output
                                  currentVertexElement = 0;
                              }
-
                          }
  
                          // offset base to the next component  in the vertex to gather
                          pStreamBase = GEP(pStreamBase, C((char)8));
+#endif
                      }
                  }
                      break;
@@ -1180,6 +1468,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
  
              // value substituted when component of gather is masked
              Value* gatherSrc = VIMMED1(0);
+#if USE_SIMD16_GATHERS
+            Value* gatherSrc2 = VIMMED1(0);
+#endif
  
              // Gather components from memory to store in a simdvertex structure
              switch (bpc)
@@ -1187,8 +1478,24 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                  case 8:
                  {
                      // if we have at least one component to fetch
-                    if(compMask)
+                    if (compMask)
                      {
+#if USE_SIMD16_GATHERS
+                        Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
+                        Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2, C((char)1));
+                        // e.g. result of an 8x32bit integer gather for 8bit components
+                        // 256i - 0    1    2    3    4    5    6    7
+                        //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw 
+
+                        Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
+                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
+                        Shuffle8bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
+                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2, info.swizzle);
+
+                        // Shuffle gathered components into place in simdvertex struct
+                        Shuffle8bpcGatherd(args, false); // outputs to vVertexElements ref
+                        Shuffle8bpcGatherd(args2, true); // outputs to vVertexElements2 ref
+#else
                          Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
                          // e.g. result of an 8x32bit integer gather for 8bit components
                          // 256i - 0    1    2    3    4    5    6    7
@@ -1202,12 +1509,63 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                          Shuffle8bpcGatherd(args, useVertexID2); // outputs to vVertexElements ref
  #else
                          Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
+#endif
  #endif
                      }
                  }
                  break;
                  case 16:
                  {
+#if USE_SIMD16_GATHERS
+                    Value* vGatherResult[2];
+                    Value *vMask;
+                    Value* vGatherResult2[2];
+                    Value *vMask2;
+
+                    // if we have at least one component out of x or y to fetch
+                    if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
+                    {
+                        // save mask as it is zero'd out after each gather
+                        vMask = vGatherMask;
+                        vMask2 = vGatherMask2;
+
+                        vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                        vGatherResult2[0] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1));
+                        // e.g. result of first 8x32bit integer gather for 16bit components
+                        // 256i - 0    1    2    3    4    5    6    7
+                        //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
+                        //
+                    }
+
+                    // if we have at least one component out of z or w to fetch
+                    if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
+                    {
+                        // offset base to the next components(zw) in the vertex to gather
+                        pStreamBase = GEP(pStreamBase, C((char)4));
+                        vMask = vGatherMask;
+                        vMask2 = vGatherMask2;
+
+                        vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                        vGatherResult2[1] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1));
+                        // e.g. result of second 8x32bit integer gather for 16bit components
+                        // 256i - 0    1    2    3    4    5    6    7
+                        //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw 
+                        //
+                    }
+
+                    // if we have at least one component to shuffle into place
+                    if (compMask)
+                    {
+                        Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
+                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
+                        Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
+                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2);
+
+                        // Shuffle gathered components into place in simdvertex struct
+                        Shuffle16bpcGather(args, false);  // outputs to vVertexElements ref
+                        Shuffle16bpcGather(args2, true);  // outputs to vVertexElements2 ref
+                    }
+#else
                      Value* vGatherResult[2];
                      Value *vMask;
  
@@ -1248,6 +1606,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                          Shuffle16bpcGather(args);  // outputs to vVertexElements ref
  #endif
                      }
+#endif
                  }
                  break;
                  case 32:
@@ -1260,6 +1619,38 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                              // if we need to gather the component
                              if (compCtrl[i] == StoreSrc)
                              {
+#if USE_SIMD16_GATHERS
+                                // save mask as it is zero'd out after each gather
+                                Value *vMask = vGatherMask;
+                                Value *vMask2 = vGatherMask2;
+
+                                Value *pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                                Value *pGather2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1));
+
+                                if (conversionType == CONVERT_USCALED)
+                                {
+                                    pGather = UI_TO_FP(pGather, mSimdFP32Ty);
+                                    pGather2 = UI_TO_FP(pGather2, mSimdFP32Ty);
+                                }
+                                else if (conversionType == CONVERT_SSCALED)
+                                {
+                                    pGather = SI_TO_FP(pGather, mSimdFP32Ty);
+                                    pGather2 = SI_TO_FP(pGather2, mSimdFP32Ty);
+                                }
+                                else if (conversionType == CONVERT_SFIXED)
+                                {
+                                    pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
+                                    pGather2 = FMUL(SI_TO_FP(pGather2, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
+                                }
+
+                                vVertexElements[currentVertexElement] = pGather;
+                                vVertexElements2[currentVertexElement] = pGather2;
+                                // e.g. result of a single 8x32bit integer gather for 32bit components
+                                // 256i - 0    1    2    3    4    5    6    7
+                                //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx 
+
+                                currentVertexElement += 1;
+#else
                                  // save mask as it is zero'd out after each gather
                                  Value *vMask = vGatherMask;
  
@@ -1282,11 +1673,19 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                                  // e.g. result of a single 8x32bit integer gather for 32bit components
                                  // 256i - 0    1    2    3    4    5    6    7
                                  //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx 
+#endif
                              }
                              else
                              {
  #if USE_SIMD16_SHADERS
+#if USE_SIMD16_GATHERS
+                                vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
+                                vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
+
+                                currentVertexElement += 1;
+#else
                                  vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
+#endif
  #else
                                  vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
  #endif
@@ -1294,7 +1693,15 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
  
                              if (currentVertexElement > 3)
                              {
+#if USE_SIMD16_GATHERS
+                                StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
+                                StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
+
+                                outputElt += 1;
+#else
                                  StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+#endif
+
                                  // reset to the next vVertexElement to output
                                  currentVertexElement = 0;
                              }
@@ -1311,8 +1718,16 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
      }
  
      // if we have a partially filled vVertexElement struct, output it
-    if(currentVertexElement > 0){
+    if (currentVertexElement > 0)
+    {
+#if USE_SIMD16_GATHERS
+        StoreVertexElements(pVtxOut, outputElt, currentVertexElement, vVertexElements);
+        StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, currentVertexElement, vVertexElements2);
+
+        outputElt += 1;
+#else
          StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
+#endif
      }
  }
author	Tim Rowley <timothy.o.rowley@intel.com>
	Mon, 16 Oct 2017 23:39:41 +0000 (18:39 -0500)
committer	Tim Rowley <timothy.o.rowley@intel.com>
	Thu, 19 Oct 2017 18:10:55 +0000 (13:10 -0500)