From 96ad8f5a2319300870e55f5849e1975fb679b996 Mon Sep 17 00:00:00 2001 From: George Kyriazis Date: Fri, 6 Apr 2018 16:39:09 -0500 Subject: [PATCH] swr/rast: Fix 64bit float loads in x86 lowering pass Reviewed-by: Bruce Cherniak --- .../swr/rasterizer/jitter/builder_mem.cpp | 39 +------------------ .../swr/rasterizer/jitter/fetch_jit.cpp | 31 +++++++++++---- 2 files changed, 25 insertions(+), 45 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp index c7912785b7b..f0cd4413d3e 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp @@ -201,44 +201,7 @@ namespace SwrJit /// @param scale - value to scale indices by Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale) { - Value* vGather; - - // use avx2 gather instruction if available - if (JM()->mArch.AVX2()) - { - vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth / 2)), VectorType::get(mDoubleTy, mVWidth / 2)); - vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale)); - } - else - { - Value* pStack = STACKSAVE(); - - // store vSrc on the stack. this way we can select between a valid load address and the vSrc address - Value* vSrcPtr = ALLOCA(vSrc->getType()); - SetTempAlloca(vSrcPtr); - STORE(vSrc, vSrcPtr); - - vGather = UndefValue::get(VectorType::get(mDoubleTy, 4)); - Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale)); - Value *vOffsets = MUL(vIndices, vScaleVec); - for (uint32_t i = 0; i < mVWidth / 2; ++i) - { - // single component byte index - Value *offset = VEXTRACT(vOffsets, C(i)); - // byte pointer to component - Value *loadAddress = GEP(pBase, offset); - loadAddress = BITCAST(loadAddress, PointerType::get(mDoubleTy, 0)); - // pointer to the value to load if we're masking off a component - Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) }); - Value *selMask = VEXTRACT(vMask, C(i)); - // switch in a safe address to load if we're trying to access a vertex - Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); - Value *val = LOAD(validAddress); - vGather = VINSERT(vGather, val, C(i)); - } - STACKRESTORE(pStack); - } - return vGather; + return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale)); } ////////////////////////////////////////////////////////////////////////// diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index cdfddf35d11..767866f68b1 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -230,7 +230,6 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) } // Fetch attributes from memory and output to a simdvertex struct - // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use JitGatherVertices(fetchState, streams, vIndices, pVtxOut); RET_VOID(); @@ -763,13 +762,31 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // if we need to gather the component if (compCtrl[i] == StoreSrc) { - Value *vMaskLo = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3})); - Value *vMaskHi = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7})); + Value* vShufLo; + Value* vShufHi; + Value* vShufAll; - Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0)); - Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1)); + if (mVWidth == 8) + { + vShufLo = C({ 0, 1, 2, 3 }); + vShufHi = C({ 4, 5, 6, 7 }); + vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7 }); + } + else + { + SWR_ASSERT(mVWidth == 16); + vShufLo = C({ 0, 1, 2, 3, 4, 5, 6, 7 }); + vShufHi = C({ 8, 9, 10, 11, 12, 13, 14, 15 }); + vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }); + } + + Value *vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo); + Value *vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi); + + Value *vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo); + Value *vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi); - Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f)); + Value *vZeroDouble = VECTOR_SPLAT(mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f)); Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo); Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi); @@ -777,7 +794,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, pGatherLo = VCVTPD2PS(pGatherLo); pGatherHi = VCVTPD2PS(pGatherHi); - Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7})); + Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll); vVertexElements[currentVertexElement++] = pGather; } -- 2.30.2