swr/rast: Fix 64bit float loads in x86 lowering pass
author George Kyriazis <george.kyriazis@intel.com>
Fri, 6 Apr 2018 21:39:09 +0000 (16:39 -0500)
committer George Kyriazis <george.kyriazis@intel.com>
Wed, 18 Apr 2018 15:51:38 +0000 (10:51 -0500)
Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp

index c7912785b7b6912432f06a4a1ae6a98b931156d1..f0cd4413d3e03f0cb0ac84f875d2215f5456f9e4 100644 (file)
@@ -201,44 +201,7 @@ namespace SwrJit
     /// @param scale - value to scale indices by
     Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
     {
-        Value* vGather;
-
-        // use avx2 gather instruction if available
-        if (JM()->mArch.AVX2())
-        {
-            vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth / 2)), VectorType::get(mDoubleTy, mVWidth / 2));
-            vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
-        }
-        else
-        {
-            Value* pStack = STACKSAVE();
-
-            // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
-            Value* vSrcPtr = ALLOCA(vSrc->getType());
-            SetTempAlloca(vSrcPtr);
-            STORE(vSrc, vSrcPtr);
-
-            vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
-            Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
-            Value *vOffsets = MUL(vIndices, vScaleVec);
-            for (uint32_t i = 0; i < mVWidth / 2; ++i)
-            {
-                // single component byte index
-                Value *offset = VEXTRACT(vOffsets, C(i));
-                // byte pointer to component
-                Value *loadAddress = GEP(pBase, offset);
-                loadAddress = BITCAST(loadAddress, PointerType::get(mDoubleTy, 0));
-                // pointer to the value to load if we're masking off a component
-                Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
-                Value *selMask = VEXTRACT(vMask, C(i));
-                // switch in a safe address to load if we're trying to access a vertex
-                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
-                Value *val = LOAD(validAddress);
-                vGather = VINSERT(vGather, val, C(i));
-            }
-            STACKRESTORE(pStack);
-        }
-        return vGather;
+        return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
     }
 
     //////////////////////////////////////////////////////////////////////////
index cdfddf35d110d0462d43a961d70883a46cfccadf..767866f68b1329726f42170fb0665eb3564fb584 100644 (file)
@@ -230,7 +230,6 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
     }
 
     // Fetch attributes from memory and output to a simdvertex struct
-    // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
     JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
 
     RET_VOID();
@@ -763,13 +762,31 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                             // if we need to gather the component
                             if (compCtrl[i] == StoreSrc)
                             {
-                                Value *vMaskLo = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
-                                Value *vMaskHi = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
+                                Value* vShufLo;
+                                Value* vShufHi;
+                                Value* vShufAll;
 
-                                Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
-                                Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
+                                if (mVWidth == 8)
+                                {
+                                    vShufLo = C({ 0, 1, 2, 3 });
+                                    vShufHi = C({ 4, 5, 6, 7 });
+                                    vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
+                                }
+                                else
+                                {
+                                    SWR_ASSERT(mVWidth == 16);
+                                    vShufLo = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
+                                    vShufHi = C({ 8, 9, 10, 11, 12, 13, 14, 15 });
+                                    vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
+                                }
+
+                                Value *vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo);
+                                Value *vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi);
+
+                                Value *vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo);
+                                Value *vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi);
 
-                                Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
+                                Value *vZeroDouble = VECTOR_SPLAT(mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
 
                                 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
                                 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
@@ -777,7 +794,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                                 pGatherLo = VCVTPD2PS(pGatherLo);
                                 pGatherHi = VCVTPD2PS(pGatherHi);
 
-                                Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));
+                                Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll);
 
                                 vVertexElements[currentVertexElement++] = pGather;
                             }