X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fswr%2Frasterizer%2Fjitter%2Ffetch_jit.cpp;h=5c8d81332df118affb7551937e4745172a92b9bc;hb=c2163dc56a237d79e8aea3265e5aba0e603f677d;hp=ec3b5eafccdbad34384eb32bd9c0c9e161dac348;hpb=01a57c11cb7fe85196b9cb4b5a1555e6eb239297;p=mesa.git diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index ec3b5eafccd..5c8d81332df 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -27,12 +27,11 @@ * Notes: * ******************************************************************************/ +#include "jit_pch.hpp" #include "builder.h" #include "jit_api.h" #include "fetch_jit.h" #include "gen_state_llvm.h" -#include -#include //#define FETCH_DUMP_VERTEX 1 using namespace llvm; @@ -49,14 +48,22 @@ enum ConversionType CONVERT_SFIXED, }; +#if USE_SIMD16_SHADERS +#define USE_SIMD16_GATHERS 0 +#endif + ////////////////////////////////////////////////////////////////////////// /// Interface to Jitting a fetch shader ////////////////////////////////////////////////////////////////////////// -struct FetchJit : public Builder +struct FetchJit : + public Builder { - FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){}; + FetchJit(JitManager* pJitMgr) : + Builder(pJitMgr) + {} Function* Create(const FETCH_COMPILE_STATE& fetchState); + Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex); Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex); Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex); @@ -65,44 +72,49 @@ struct FetchJit : public Builder typedef std::tuple Shuffle8bpcArgs; + #if USE_SIMD16_SHADERS +#if USE_SIMD16_GATHERS + void Shuffle8bpcGatherd16(Shuffle8bpcArgs &args); +#else void Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2); +#endif #else void Shuffle8bpcGatherd(Shuffle8bpcArgs &args); #endif -#if USE_SIMD16_BUILDER - void Shuffle8bpcGatherd2(Shuffle8bpcArgs &args); -#endif typedef std::tuple Shuffle16bpcArgs; + #if USE_SIMD16_SHADERS +#if USE_SIMD16_GATHERS + void Shuffle16bpcGather16(Shuffle16bpcArgs &args); +#else void Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2); +#endif #else void Shuffle16bpcGather(Shuffle16bpcArgs &args); #endif -#if USE_SIMD16_BUILDER - void Shuffle16bpcGather2(Shuffle16bpcArgs &args); -#endif +#if USE_SIMD16_GATHERS + void StoreVertexElements16(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]); +#else void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]); -#if USE_SIMD16_BUILDER - void StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]); #endif #if USE_SIMD16_SHADERS - Value* GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2); +#if USE_SIMD16_GATHERS + Value *GenerateCompCtrlVector16(const ComponentControl ctrl); #else - Value* GenerateCompCtrlVector(const ComponentControl ctrl); + Value *GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2); #endif -#if USE_SIMD16_BUILDER - Value* GenerateCompCtrlVector2(const ComponentControl ctrl); +#else + Value *GenerateCompCtrlVector(const ComponentControl ctrl); #endif void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut); -#if USE_SIMD16_SHADERS -#define USE_SIMD16_GATHERS 0 +#if USE_SIMD16_SHADERS #if USE_SIMD16_GATHERS void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2); #else @@ -123,7 +135,7 @@ struct FetchJit : public Builder Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) { - std::stringstream fnName("FetchShader_", std::ios_base::in | std::ios_base::out | std::ios_base::ate); + std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate); fnName << ComputeCRC(0, &fetchState, sizeof(fetchState)); Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); @@ -136,6 +148,10 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) auto argitr = fetch->arg_begin(); // Fetch shader arguments + Value* privateContext = &*argitr; ++argitr; + privateContext->setName("privateContext"); + SetPrivateContext(privateContext); + mpFetchInfo = &*argitr; ++argitr; mpFetchInfo->setName("fetchInfo"); Value* pVtxOut = &*argitr; @@ -150,7 +166,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) pVtxOut = GEP(pVtxOut, C(0)); #if USE_SIMD16_SHADERS #if 0// USE_SIMD16_BUILDER - pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); + pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); #else pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0)); #endif @@ -237,7 +253,13 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) : vIndices2 = GetSimdValid32bitIndices(indices2, pLastIndex); #endif break; // incoming type is already 32bit int - default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; break; + default: + SWR_INVALID("Unsupported index type"); + vIndices = nullptr; +#if USE_SIMD16_SHADERS + vIndices2 = nullptr; +#endif + break; } if(fetchState.bForceSequentialAccessEnable) @@ -345,6 +367,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) JitManager::DumpToFile(fetch, "opt"); + return fetch; } @@ -417,6 +440,10 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* str } else if (ied.InstanceStrideEnable) { + // silence unused variable warnings + startOffset = C(0); + vCurIndices = vIndices; + SWR_ASSERT((0), "TODO: Fill out more once driver sends this down."); } else @@ -428,7 +455,7 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* str } // load SWR_VERTEX_BUFFER_STATE::pData - Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData}); + Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData}); // load SWR_VERTEX_BUFFER_STATE::pitch Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch}); @@ -732,7 +759,66 @@ void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pB // only works if pixel size is <= 32bits SWR_ASSERT(info.bpp <= 32); - Value *pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask); + Value *pGather; + if (info.bpp == 32) + { + pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask); + } + else + { + // Can't use 32-bit gather for items less than 32-bits, could cause page faults. + Value *pMem = ALLOCA(mSimdInt32Ty); + STORE(VIMMED1(0u), pMem); + + pBase = BITCAST(pBase, PointerType::get(mInt8Ty, 0)); + Value* pDstMem = BITCAST(pMem, mInt32PtrTy); + + for (uint32_t lane = 0; lane < mVWidth; ++lane) + { + // Get index + Value* index = VEXTRACT(pOffsets, C(lane)); + Value* mask = VEXTRACT(pMask, C(lane)); + switch (info.bpp) + { + case 8: + { + Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0)); + Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt8Ty, 0)); + STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst); + break; + } + + case 16: + { + Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0)); + Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0)); + STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst); + break; + } + break; + + case 24: + { + // First 16-bits of data + Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0)); + Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0)); + STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst); + + // Last 8-bits of data + pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0)); + pSrc = BITCAST(GEP(pSrc, C(1)), PointerType::get(mInt8Ty, 0)); + STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst); + break; + } + + default: + SWR_INVALID("Shouldn't have BPP = %d now", info.bpp); + break; + } + } + + pGather = LOAD(pMem); + } for (uint32_t comp = 0; comp < 4; ++comp) { @@ -830,21 +916,14 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, uint32_t outputElt = 0; Value* vVertexElements[4]; #if USE_SIMD16_GATHERS - Value* vVertexElements2[4]; -#if USE_SIMD16_BUILDER Value *pVtxSrc2[4]; -#endif #endif Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}); Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance}); Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance}); #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER - Value* vBaseVertex16 = VBROADCAST2(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex })); -#else - Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex })); -#endif + Value* vBaseVertex16 = VBROADCAST_16(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex })); #else Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex })); #endif @@ -864,18 +943,14 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices."); uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix. - Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData}); + Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData}); // VGATHER* takes an *i8 src pointer - Value *pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0)); + Value *pStreamBase = INT_TO_PTR(stream, PointerType::get(mInt8Ty, 0)); Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch}); #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER - Value *vStride16 = VBROADCAST2(stride); -#else - Value *vStride = VBROADCAST(stride); -#endif + Value *vStride16 = VBROADCAST_16(stride); #else Value *vStride = VBROADCAST(stride); #endif @@ -898,20 +973,14 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, curInstance = ADD(curInstance, startInstance); } - Value *vCurIndices; #if USE_SIMD16_GATHERS - Value *vCurIndices2; -#if USE_SIMD16_BUILDER Value *vCurIndices16; -#endif +#else + Value *vCurIndices; #endif Value *startOffset; #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER - Value *vInstanceStride16 = VIMMED2_1(0); -#else - Value *vInstanceStride = VIMMED1(0); -#endif + Value *vInstanceStride16 = VIMMED1_16(0); #else Value *vInstanceStride = VIMMED1(0); #endif @@ -931,12 +1000,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, calcInstance = SELECT(isNonZeroStep, calcInstance, C(0)); #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER - vCurIndices16 = VBROADCAST2(calcInstance); -#else - vCurIndices = VBROADCAST(calcInstance); - vCurIndices2 = VBROADCAST(calcInstance); -#endif + vCurIndices16 = VBROADCAST_16(calcInstance); #else vCurIndices = VBROADCAST(calcInstance); #endif @@ -948,25 +1012,16 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // grab the instance advancement state, determines stride in bytes from one instance to the next Value* stepRate = C(ied.InstanceAdvancementState); #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER - vInstanceStride16 = VBROADCAST2(MUL(curInstance, stepRate)); -#else - vInstanceStride = VBROADCAST(MUL(curInstance, stepRate)); -#endif + vInstanceStride16 = VBROADCAST_16(MUL(curInstance, stepRate)); #else vInstanceStride = VBROADCAST(MUL(curInstance, stepRate)); #endif // offset indices by baseVertex #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER - Value *vIndices16 = JOIN2(vIndices, vIndices2); + Value *vIndices16 = JOIN_16(vIndices, vIndices2); vCurIndices16 = ADD(vIndices16, vBaseVertex16); -#else - vCurIndices = ADD(vIndices, vBaseVertex); - vCurIndices2 = ADD(vIndices2, vBaseVertex); -#endif #else vCurIndices = ADD(vIndices, vBaseVertex); #endif @@ -978,14 +1033,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, { // offset indices by baseVertex #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER - Value *vIndices16 = JOIN2(vIndices, vIndices2); + Value *vIndices16 = JOIN_16(vIndices, vIndices2); vCurIndices16 = ADD(vIndices16, vBaseVertex16); -#else - vCurIndices = ADD(vIndices, vBaseVertex); - vCurIndices2 = ADD(vIndices2, vBaseVertex); -#endif #else vCurIndices = ADD(vIndices, vBaseVertex); #endif @@ -999,6 +1049,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // calculate byte offset to the start of the VB Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty)); pStreamBase = GEP(pStreamBase, baseOffset); + Value* pStreamBaseGFX = ADD(stream, baseOffset); // if we have a start offset, subtract from max vertex. Used for OOB check maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty)); @@ -1018,15 +1069,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)}); partialInboundsSize = LOAD(partialInboundsSize); #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER - Value *vPartialVertexSize = VBROADCAST2(partialInboundsSize); - Value *vBpp = VBROADCAST2(C(info.Bpp)); - Value *vAlignmentOffsets = VBROADCAST2(C(ied.AlignedByteOffset)); -#else - Value *vPartialVertexSize = VBROADCAST(partialInboundsSize); - Value *vBpp = VBROADCAST(C(info.Bpp)); - Value *vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset)); -#endif + Value *vPartialVertexSize = VBROADCAST_16(partialInboundsSize); + Value *vBpp = VBROADCAST_16(C(info.Bpp)); + Value *vAlignmentOffsets = VBROADCAST_16(C(ied.AlignedByteOffset)); #else Value *vPartialVertexSize = VBROADCAST(partialInboundsSize); Value *vBpp = VBROADCAST(C(info.Bpp)); @@ -1037,13 +1082,12 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets)); #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER // override cur indices with 0 if pitch is 0 - Value *pZeroPitchMask16 = ICMP_EQ(vStride16, VIMMED2_1(0)); - vCurIndices16 = SELECT(pZeroPitchMask16, VIMMED2_1(0), vCurIndices16); + Value *pZeroPitchMask16 = ICMP_EQ(vStride16, VIMMED1_16(0)); + vCurIndices16 = SELECT(pZeroPitchMask16, VIMMED1_16(0), vCurIndices16); // are vertices partially OOB? - Value *vMaxVertex16 = VBROADCAST2(maxVertex); + Value *vMaxVertex16 = VBROADCAST_16(maxVertex); Value *vPartialOOBMask = ICMP_EQ(vCurIndices16, vMaxVertex16); // are vertices fully in bounds? @@ -1054,7 +1098,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, if (fetchState.bPartialVertexBuffer) { // are vertices below minVertex limit? - Value *vMinVertex16 = VBROADCAST2(minVertex); + Value *vMinVertex16 = VBROADCAST_16(minVertex); Value *vMinGatherMask16 = ICMP_UGE(vCurIndices16, vMinVertex16); // only fetch lanes that pass both tests @@ -1078,66 +1122,12 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, vOffsets16 = ADD(vOffsets16, vInstanceStride16); // TODO: remove the following simd8 interop stuff once all code paths are fully widened to SIMD16.. - Value *vmask16 = VMASK2(vGatherMask16); - - Value *vGatherMask = MASK(EXTRACT2_I(vmask16, 0)); - Value *vGatherMask2 = MASK(EXTRACT2_I(vmask16, 1)); - - Value *vOffsets = EXTRACT2_I(vOffsets16, 0); - Value *vOffsets2 = EXTRACT2_I(vOffsets16, 1); - -#else - // override cur indices with 0 if pitch is 0 - Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0)); - vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices); - vCurIndices2 = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices2); - - // are vertices partially OOB? - Value* vMaxVertex = VBROADCAST(maxVertex); - Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex); - Value* vPartialOOBMask2 = ICMP_EQ(vCurIndices2, vMaxVertex); - - // are vertices fully in bounds? - Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex); - Value* vMaxGatherMask2 = ICMP_ULT(vCurIndices2, vMaxVertex); - - Value *vGatherMask; - Value *vGatherMask2; - if (fetchState.bPartialVertexBuffer) - { - // are vertices below minVertex limit? - Value *vMinVertex = VBROADCAST(minVertex); - Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex); - Value *vMinGatherMask2 = ICMP_UGE(vCurIndices2, vMinVertex); - - // only fetch lanes that pass both tests - vGatherMask = AND(vMaxGatherMask, vMinGatherMask); - vGatherMask2 = AND(vMaxGatherMask2, vMinGatherMask2); - } - else - { - vGatherMask = vMaxGatherMask; - vGatherMask2 = vMaxGatherMask2; - } - - // blend in any partially OOB indices that have valid elements - vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask); - vGatherMask2 = SELECT(vPartialOOBMask2, vElementInBoundsMask, vGatherMask2); - - // calculate the actual offsets into the VB - Value* vOffsets = MUL(vCurIndices, vStride); - vOffsets = ADD(vOffsets, vAlignmentOffsets); - - Value* vOffsets2 = MUL(vCurIndices2, vStride); - vOffsets2 = ADD(vOffsets2, vAlignmentOffsets); - // if instance stride enable is: - // true - add product of the instanceID and advancement state to the offst into the VB - // false - value of vInstanceStride has been initialialized to zero - vOffsets = ADD(vOffsets, vInstanceStride); - vOffsets2 = ADD(vOffsets2, vInstanceStride); + Value *vGatherMask = EXTRACT_16(vGatherMask16, 0); + Value *vGatherMask2 = EXTRACT_16(vGatherMask16, 1); -#endif + Value *vOffsets = EXTRACT_16(vOffsets16, 0); + Value *vOffsets2 = EXTRACT_16(vOffsets16, 1); #else // override cur indices with 0 if pitch is 0 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0)); @@ -1189,7 +1179,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, #if USE_SIMD16_GATHERS Value *pResults[4]; Value *pResults2[4]; - CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults); + CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults); CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask2, pStreamBase, vOffsets2, pResults2); ConvertFormat((SWR_FORMAT)ied.Format, pResults); ConvertFormat((SWR_FORMAT)ied.Format, pResults2); @@ -1198,43 +1188,26 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, { if (isComponentEnabled(compMask, c)) { -#if USE_SIMD16_BUILDER // pack adjacent pairs of SIMD8s into SIMD16s - pVtxSrc2[currentVertexElement] = JOIN2(pResults[c], pResults2[c]); - -#else - vVertexElements[currentVertexElement] = pResults[c]; - vVertexElements2[currentVertexElement] = pResults2[c]; - -#endif - currentVertexElement += 1; + pVtxSrc2[currentVertexElement++] = JOIN_16(pResults[c], pResults2[c]); if (currentVertexElement > 3) { -#if USE_SIMD16_BUILDER // store SIMD16s - Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); - - StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2); - -#else - StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements); - StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2); - -#endif - outputElt += 1; + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); + StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2); // reset to the next vVertexElement to output currentVertexElement = 0; } } } #else - Value* pResults[4]; + Value *pResults[4]; CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults); ConvertFormat((SWR_FORMAT)ied.Format, pResults); - for (uint32_t c = 0; c < 4; ++c) + for (uint32_t c = 0; c < 4; c += 1) { if (isComponentEnabled(compMask, c)) { @@ -1254,10 +1227,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, ///@todo: support 64 bit vb accesses Value *gatherSrc = VIMMED1(0.0f); #if USE_SIMD16_GATHERS - Value *gatherSrc2 = VIMMED1(0.0f); -#if USE_SIMD16_BUILDER - Value *gatherSrc16 = VIMMED2_1(0.0f); -#endif + Value *gatherSrc16 = VIMMED1_16(0.0f); #endif SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format), @@ -1269,7 +1239,6 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, case 16: { #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER Value *gatherResult[2]; // if we have at least one component out of x or y to fetch @@ -1284,7 +1253,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } else { - gatherResult[0] = VUNDEF2_I(); + gatherResult[0] = VUNDEF_I_16(); } // if we have at least one component out of z or w to fetch @@ -1302,76 +1271,26 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } else { - gatherResult[1] = VUNDEF2_I(); - } - -#else - Value *vGatherResult[2]; - Value *vGatherResult2[2]; - - // if we have at least one component out of x or y to fetch - if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) - { - vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask); - vGatherResult2[0] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2); - // e.g. result of first 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy - // - } - else - { - vGatherResult[0] = VUNDEF_I(); - vGatherResult2[0] = VUNDEF_I(); - } - - // if we have at least one component out of z or w to fetch - if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) - { - // offset base to the next components(zw) in the vertex to gather - pStreamBase = GEP(pStreamBase, C((char)4)); - - vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask); - vGatherResult2[1] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2); - // e.g. result of second 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw - // - } - else - { - vGatherResult[1] = VUNDEF_I(); - vGatherResult2[1] = VUNDEF_I(); + gatherResult[1] = VUNDEF_I_16(); } -#endif // if we have at least one component to shuffle into place if (compMask) { -#if USE_SIMD16_BUILDER - Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, CONVERT_NONE, currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2); // Shuffle gathered components into place in simdvertex struct - Shuffle16bpcGather2(args); // outputs to vVertexElements ref -#else - Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE, - currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); - Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), Instruction::CastOps::FPExt, CONVERT_NONE, - currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2); - - // Shuffle gathered components into place in simdvertex struct - Shuffle16bpcGather(args, false); // outputs to vVertexElements ref - Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref -#endif + Shuffle16bpcGather16(args); // outputs to vVertexElements ref } #else - Value* vGatherResult[2]; + Value *vGatherResult[2]; // if we have at least one component out of x or y to fetch - if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){ + if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) + { vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask); // e.g. result of first 8x32bit integer gather for 16bit components // 256i - 0 1 2 3 4 5 6 7 @@ -1380,7 +1299,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } // if we have at least one component out of z or w to fetch - if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){ + if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) + { // offset base to the next components(zw) in the vertex to gather pStreamBase = GEP(pStreamBase, C((char)4)); @@ -1392,7 +1312,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } // if we have at least one component to shuffle into place - if(compMask){ + if (compMask) + { Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE, currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); @@ -1421,66 +1342,24 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :( // But, we know that elements must be aligned for FETCH. :) // Right shift the offset by a bit and then scale by 2 to remove the sign extension. -#if USE_SIMD16_BUILDER - Value *shiftedOffsets = VPSRLI_16(vOffsets16, C(1)); - pVtxSrc2[currentVertexElement] = GATHERPS_16(gatherSrc16, pStreamBase, shiftedOffsets, vGatherMask16, 2); - -#else - Value *vShiftedOffsets = VPSRLI(vOffsets, C(1)); - Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1)); - - vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2); - vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vGatherMask2, 2); - -#if USE_SIMD16_BUILDER - // pack adjacent pairs of SIMD8s into SIMD16s - pVtxSrc2[currentVertexElement] = JOIN2(vVertexElements[currentVertexElement], - vVertexElements2[currentVertexElement]); - -#endif -#endif - currentVertexElement += 1; + Value *shiftedOffsets16 = LSHR(vOffsets16, 1); + pVtxSrc2[currentVertexElement++] = GATHERPS_16(gatherSrc16, pStreamBaseGFX, shiftedOffsets16, vGatherMask16, 2, GFX_MEM_CLIENT_FETCH); } else { -#if USE_SIMD16_BUILDER - pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]); -#else - vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false); - vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true); - -#if USE_SIMD16_BUILDER - // pack adjacent pairs of SIMD8s into SIMD16s - pVtxSrc2[currentVertexElement] = JOIN2(vVertexElements[currentVertexElement], - vVertexElements2[currentVertexElement]); - -#endif -#endif - currentVertexElement += 1; + pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]); } if (currentVertexElement > 3) { -#if USE_SIMD16_BUILDER // store SIMD16s - Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); - - StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2); - -#else - StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements); - StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2); - -#endif - outputElt += 1; + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); + StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2); // reset to the next vVertexElement to output currentVertexElement = 0; } } - - // offset base to the next component in the vertex to gather - pStreamBase = GEP(pStreamBase, C((char)4)); #else if (isComponentEnabled(compMask, i)) { @@ -1492,8 +1371,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :( // But, we know that elements must be aligned for FETCH. :) // Right shift the offset by a bit and then scale by 2 to remove the sign extension. - Value* vShiftedOffsets = VPSRLI(vOffsets, C(1)); - vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2); + Value *vShiftedOffsets = LSHR(vOffsets, 1); + vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBaseGFX, vShiftedOffsets, vGatherMask, 2, GFX_MEM_CLIENT_FETCH); } else { @@ -1511,10 +1390,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, currentVertexElement = 0; } } +#endif // offset base to the next component in the vertex to gather pStreamBase = GEP(pStreamBase, C((char)4)); -#endif + pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4)); } } break; @@ -1553,45 +1433,20 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 })); Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 })); -#if USE_SIMD16_BUILDER // pack adjacent pairs of SIMD8s into SIMD16s - pVtxSrc2[currentVertexElement] = JOIN2(pGather, pGather2); - -#else - vVertexElements[currentVertexElement] = pGather; - vVertexElements2[currentVertexElement] = pGather2; - -#endif - currentVertexElement += 1; + pVtxSrc2[currentVertexElement++] = JOIN_16(pGather, pGather2); } else { -#if USE_SIMD16_BUILDER - pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]); - -#else - vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false); - vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true); - -#endif - currentVertexElement += 1; + pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]); } if (currentVertexElement > 3) { -#if USE_SIMD16_BUILDER // store SIMD16s - Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); - - StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2); - -#else - StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements); - StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2); - -#endif - outputElt += 1; + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); + StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2); // reset to the next vVertexElement to output currentVertexElement = 0; } @@ -1613,10 +1468,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f)); - Value* pGatherLo = GATHERPD(vZeroDouble, - pStreamBase, vOffsetsLo, vMaskLo); - Value* pGatherHi = GATHERPD(vZeroDouble, - pStreamBase, vOffsetsHi, vMaskHi); + Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo); + Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi); pGatherLo = VCVTPD2PS(pGatherLo); pGatherHi = VCVTPD2PS(pGatherHi); @@ -1692,10 +1545,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // value substituted when component of gather is masked Value* gatherSrc = VIMMED1(0); #if USE_SIMD16_GATHERS - Value* gatherSrc2 = VIMMED1(0); -#if USE_SIMD16_BUILDER - Value *gatherSrc16 = VIMMED2_1(0); -#endif + Value *gatherSrc16 = VIMMED1_16(0); #endif // Gather components from memory to store in a simdvertex structure @@ -1707,42 +1557,21 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, if (compMask) { #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER Value *gatherResult = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16); // e.g. result of an 8x32bit integer gather for 8bit components // 256i - 0 1 2 3 4 5 6 7 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw -#else - Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); - Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2); - - // e.g. result of an 8x32bit integer gather for 8bit components - // 256i - 0 1 2 3 4 5 6 7 - // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw - -#endif -#if USE_SIMD16_BUILDER - Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); Shuffle8bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType, currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2, info.swizzle); // Shuffle gathered components into place in simdvertex struct - Shuffle8bpcGatherd2(args); // outputs to vVertexElements ref -#else - Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType, - currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle); - Shuffle8bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType, - currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2, info.swizzle); - - // Shuffle gathered components into place in simdvertex struct - Shuffle8bpcGatherd(args, false); // outputs to vVertexElements ref - Shuffle8bpcGatherd(args2, true); // outputs to vVertexElements ref -#endif + Shuffle8bpcGatherd16(args); // outputs to vVertexElements ref #else - Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); + Value *vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); // e.g. result of an 8x32bit integer gather for 8bit components // 256i - 0 1 2 3 4 5 6 7 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw @@ -1763,8 +1592,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, case 16: { #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER - Value* gatherResult[2]; + Value *gatherResult[2]; // if we have at least one component out of x or y to fetch if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) @@ -1778,7 +1606,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } else { - gatherResult[0] = VUNDEF2_I(); + gatherResult[0] = VUNDEF_I_16(); } // if we have at least one component out of z or w to fetch @@ -1796,28 +1624,32 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } else { - gatherResult[1] = VUNDEF2_I(); + gatherResult[1] = VUNDEF_I_16(); } + // if we have at least one component to shuffle into place + if (compMask) + { + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); + + Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType, + currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2); + + // Shuffle gathered components into place in simdvertex struct + Shuffle16bpcGather16(args); // outputs to vVertexElements ref + } #else - Value* vGatherResult[2]; - Value* vGatherResult2[2]; + Value *vGatherResult[2]; // if we have at least one component out of x or y to fetch if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) { vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); - vGatherResult2[0] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2); // e.g. result of first 8x32bit integer gather for 16bit components // 256i - 0 1 2 3 4 5 6 7 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy // } - else - { - vGatherResult[0] = VUNDEF_I(); - vGatherResult2[0] = VUNDEF_I(); - } // if we have at least one component out of z or w to fetch if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) @@ -1826,67 +1658,15 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, pStreamBase = GEP(pStreamBase, C((char)4)); vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); - vGatherResult2[1] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2); // e.g. result of second 8x32bit integer gather for 16bit components // 256i - 0 1 2 3 4 5 6 7 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw // } - else - { - vGatherResult[1] = VUNDEF_I(); - vGatherResult2[1] = VUNDEF_I(); - } -#endif // if we have at least one component to shuffle into place if (compMask) { -#if USE_SIMD16_BUILDER - Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); - - Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType, - currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2); - - // Shuffle gathered components into place in simdvertex struct - Shuffle16bpcGather2(args); // outputs to vVertexElements ref -#else - Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType, - currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); - Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType, - currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2); - - // Shuffle gathered components into place in simdvertex struct - Shuffle16bpcGather(args, false); // outputs to vVertexElements ref - Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref -#endif - } -#else - Value* vGatherResult[2]; - - // if we have at least one component out of x or y to fetch - if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){ - vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); - // e.g. result of first 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy - // - } - - // if we have at least one component out of z or w to fetch - if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){ - // offset base to the next components(zw) in the vertex to gather - pStreamBase = GEP(pStreamBase, C((char)4)); - - vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); - // e.g. result of second 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw - // - } - - // if we have at least one component to shuffle into place - if(compMask){ Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType, currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); @@ -1911,57 +1691,26 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, if (compCtrl[i] == StoreSrc) { #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER Value *pGather = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16); if (conversionType == CONVERT_USCALED) { - pGather = UI_TO_FP(pGather, mSimd2FP32Ty); - } - else if (conversionType == CONVERT_SSCALED) - { - pGather = SI_TO_FP(pGather, mSimd2FP32Ty); - } - else if (conversionType == CONVERT_SFIXED) - { - pGather = FMUL(SI_TO_FP(pGather, mSimd2FP32Ty), VBROADCAST2(C(1 / 65536.0f))); - } - -#else - Value *pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); - Value *pGather2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2); - - if (conversionType == CONVERT_USCALED) - { - pGather = UI_TO_FP(pGather, mSimdFP32Ty); - pGather2 = UI_TO_FP(pGather2, mSimdFP32Ty); + pGather = UI_TO_FP(pGather, mSimd16FP32Ty); } else if (conversionType == CONVERT_SSCALED) { - pGather = SI_TO_FP(pGather, mSimdFP32Ty); - pGather2 = SI_TO_FP(pGather2, mSimdFP32Ty); + pGather = SI_TO_FP(pGather, mSimd16FP32Ty); } else if (conversionType == CONVERT_SFIXED) { - pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f))); - pGather2 = FMUL(SI_TO_FP(pGather2, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f))); + pGather = FMUL(SI_TO_FP(pGather, mSimd16FP32Ty), VBROADCAST_16(C(1 / 65536.0f))); } -#endif -#if USE_SIMD16_BUILDER - pVtxSrc2[currentVertexElement] = pGather; - -#else - vVertexElements[currentVertexElement] = pGather; - vVertexElements2[currentVertexElement] = pGather2; - -#endif + pVtxSrc2[currentVertexElement++] = pGather; // e.g. result of a single 8x32bit integer gather for 32bit components // 256i - 0 1 2 3 4 5 6 7 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx - - currentVertexElement += 1; #else Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); @@ -1979,6 +1728,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } vVertexElements[currentVertexElement++] = pGather; + // e.g. result of a single 8x32bit integer gather for 32bit components // 256i - 0 1 2 3 4 5 6 7 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx @@ -1986,40 +1736,24 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } else { -#if USE_SIMD16_SHADERS #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER - pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]); - -#else - vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false); - vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true); - -#endif - currentVertexElement += 1; + pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]); #else +#if USE_SIMD16_SHADERS vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); -#endif #else vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); +#endif #endif } if (currentVertexElement > 3) { #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER // store SIMD16s - Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); - - StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2); - -#else - StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements); - StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2); + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); -#endif - outputElt += 1; + StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2); #else StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); #endif @@ -2043,18 +1777,10 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, if (currentVertexElement > 0) { #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER // store SIMD16s - Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); - - StoreVertexElements2(pVtxOut2, outputElt, currentVertexElement, pVtxSrc2); - -#else - StoreVertexElements(pVtxOut, outputElt, currentVertexElement, vVertexElements); - StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, currentVertexElement, vVertexElements2); + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); -#endif - outputElt += 1; + StoreVertexElements16(pVtxOut2, outputElt++, currentVertexElement, pVtxSrc2); #else StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements); #endif @@ -2122,7 +1848,7 @@ Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex) // if valid, load the index. if not, load 0 from the stack Value* pValid = SELECT(mask, pIndex, pZeroIndex); - Value *index = LOAD(pValid, "valid index"); + Value *index = LOAD(pValid, "valid index", GFX_MEM_CLIENT_FETCH); // zero extended index to 32 bits and insert into the correct simd lane index = Z_EXT(index, mInt32Ty); @@ -2158,13 +1884,11 @@ Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex) // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0 Value* vMaxIndex = VBROADCAST(numIndicesLeft); - Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets); - - // VMASKLOAD takes an *i8 src pointer - pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0)); + Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets); // Load the indices; OOB loads 0 - return MASKLOADD(pIndices,vIndexMask); + pIndices = BITCAST(pIndices, PointerType::get(mSimdInt32Ty, 0)); + return MASKED_LOAD(pIndices, 4, vIndexMask, VIMMED1(0)); } ////////////////////////////////////////////////////////////////////////// @@ -2182,109 +1906,8 @@ Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex) /// @param compCtrl - component control val /// @param vVertexElements[4] - vertex components to output /// @param swizzle[4] - component swizzle location -#if USE_SIMD16_SHADERS -void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2) -#else -void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args) -#endif -{ - // Unpack tuple args - Value*& vGatherResult = std::get<0>(args); - Value* pVtxOut = std::get<1>(args); - const Instruction::CastOps extendType = std::get<2>(args); - const ConversionType conversionType = std::get<3>(args); - uint32_t ¤tVertexElement = std::get<4>(args); - uint32_t &outputElt = std::get<5>(args); - const ComponentEnable compMask = std::get<6>(args); - const ComponentControl (&compCtrl)[4] = std::get<7>(args); - Value* (&vVertexElements)[4] = std::get<8>(args); - const uint32_t (&swizzle)[4] = std::get<9>(args); - - // cast types - Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits - - for (uint32_t i = 0; i < 4; i++) - { - if (!isComponentEnabled(compMask, i)) - continue; - - if (compCtrl[i] == ComponentControl::StoreSrc) - { - std::vector vShuffleMasks[4] = { - { 0, 4, 8, 12, 16, 20, 24, 28 }, // x - { 1, 5, 9, 13, 17, 21, 25, 29 }, // y - { 2, 6, 10, 14, 18, 22, 26, 30 }, // z - { 3, 7, 11, 15, 19, 23, 27, 31 }, // w - }; - - Value *val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty), - UndefValue::get(v32x8Ty), - vShuffleMasks[swizzle[i]]); - - if ((extendType == Instruction::CastOps::SExt) || - (extendType == Instruction::CastOps::SIToFP)) { - switch (conversionType) - { - case CONVERT_NORMALIZED: - val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0))); - break; - case CONVERT_SSCALED: - val = SI_TO_FP(val, mSimdFP32Ty); - break; - case CONVERT_USCALED: - SWR_INVALID("Type should not be sign extended!"); - break; - default: - SWR_ASSERT(conversionType == CONVERT_NONE); - val = S_EXT(val, mSimdInt32Ty); - break; - } - } else if ((extendType == Instruction::CastOps::ZExt) || - (extendType == Instruction::CastOps::UIToFP)) { - switch (conversionType) - { - case CONVERT_NORMALIZED: - val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0))); - break; - case CONVERT_SSCALED: - SWR_INVALID("Type should not be zero extended!"); - break; - case CONVERT_USCALED: - val = UI_TO_FP(val, mSimdFP32Ty); - break; - default: - SWR_ASSERT(conversionType == CONVERT_NONE); - val = Z_EXT(val, mSimdInt32Ty); - break; - } - } - else - { - SWR_INVALID("Unsupported conversion type"); - } - - vVertexElements[currentVertexElement++] = val; - } - else - { -#if USE_SIMD16_SHADERS - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); -#else - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); -#endif - } - - if (currentVertexElement > 3) - { - StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); - // reset to the next vVertexElement to output - currentVertexElement = 0; - } - } -} - -#if USE_SIMD16_BUILDER -void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args) +#if USE_SIMD16_GATHERS +void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args) { // Unpack tuple args Value*& vGatherResult = std::get<0>(args); @@ -2322,8 +1945,8 @@ void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args) // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.. - Value *vGatherResult_lo = EXTRACT2_I(vGatherResult, 0); - Value *vGatherResult_hi = EXTRACT2_I(vGatherResult, 1); + Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0); + Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1); Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); @@ -2401,18 +2024,18 @@ void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args) temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor); } - vVertexElements[currentVertexElement] = JOIN2(temp_lo, temp_hi); + vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi); currentVertexElement += 1; } else { - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]); + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]); } if (currentVertexElement > 3) { - StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements); + StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements); // reset to the next vVertexElement to output currentVertexElement = 0; } @@ -2482,8 +2105,8 @@ void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args) break; } - Value *vGatherResult_lo = EXTRACT2_I(vGatherResult, 0); - Value *vGatherResult_hi = EXTRACT2_I(vGatherResult, 1); + Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0); + Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1); Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); @@ -2499,18 +2122,18 @@ void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args) temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor); } - vVertexElements[currentVertexElement] = JOIN2(temp_lo, temp_hi); + vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi); currentVertexElement += 1; } else { - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]); + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]); } if (currentVertexElement > 3) { - StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements); + StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements); // reset to the next vVertexElement to output currentVertexElement = 0; } @@ -2523,29 +2146,15 @@ void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args) } } -#endif -////////////////////////////////////////////////////////////////////////// -/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends, -/// denormalizes if needed, converts to F32 if needed, and positions in -// the proper SIMD rows to be output to the simdvertex structure -/// @param args: (tuple of args, listed below) -/// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index -/// @param pVtxOut - base pointer to output simdvertex struct -/// @param extendType - sign extend or zero extend -/// @param bNormalized - do we need to denormalize? -/// @param currentVertexElement - reference to the current vVertexElement -/// @param outputElt - reference to the current offset from simdvertex we're o -/// @param compMask - component packing mask -/// @param compCtrl - component control val -/// @param vVertexElements[4] - vertex components to output +#else #if USE_SIMD16_SHADERS -void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2) +void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2) #else -void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) +void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args) #endif { // Unpack tuple args - Value* (&vGatherResult)[2] = std::get<0>(args); + Value*& vGatherResult = std::get<0>(args); Value* pVtxOut = std::get<1>(args); const Instruction::CastOps extendType = std::get<2>(args); const ConversionType conversionType = std::get<3>(args); @@ -2554,107 +2163,247 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) const ComponentEnable compMask = std::get<6>(args); const ComponentControl(&compCtrl)[4] = std::get<7>(args); Value* (&vVertexElements)[4] = std::get<8>(args); + const uint32_t(&swizzle)[4] = std::get<9>(args); // cast types - Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits - // have to do extra work for sign extending - if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)|| - (extendType == Instruction::CastOps::FPExt)) + for (uint32_t i = 0; i < 4; i++) { - // is this PP float? - bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false; - - Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane - Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits - - // shuffle mask - Value* vConstMask = C({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); - Value* vi128XY = nullptr; - if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){ - Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy); - // after pshufb: group components together in each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy - - vi128XY = BITCAST(PERMD(vShufResult, C({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); - // after PERMD: move and pack xy components into each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy - } - - // do the same for zw components - Value* vi128ZW = nullptr; - if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){ - Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy); - vi128ZW = BITCAST(PERMD(vShufResult, C({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); - } - - // init denormalize variables if needed - Instruction::CastOps IntToFpCast; - Value* conversionFactor; + if (!isComponentEnabled(compMask, i)) + continue; - switch (conversionType) + if (compCtrl[i] == ComponentControl::StoreSrc) { - case CONVERT_NORMALIZED: - IntToFpCast = Instruction::CastOps::SIToFP; - conversionFactor = VIMMED1((float)(1.0 / 32767.0)); - break; - case CONVERT_SSCALED: - IntToFpCast = Instruction::CastOps::SIToFP; - conversionFactor = VIMMED1((float)(1.0)); - break; - case CONVERT_USCALED: - SWR_INVALID("Type should not be sign extended!"); - conversionFactor = nullptr; - break; - default: - SWR_ASSERT(conversionType == CONVERT_NONE); - conversionFactor = nullptr; - break; - } + std::vector vShuffleMasks[4] = { + { 0, 4, 8, 12, 16, 20, 24, 28 }, // x + { 1, 5, 9, 13, 17, 21, 25, 29 }, // y + { 2, 6, 10, 14, 18, 22, 26, 30 }, // z + { 3, 7, 11, 15, 19, 23, 27, 31 }, // w + }; - // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex - for (uint32_t i = 0; i < 4; i++) - { - if (isComponentEnabled(compMask, i)) - { - if (compCtrl[i] == ComponentControl::StoreSrc) - { + Value *val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty), + UndefValue::get(v32x8Ty), + vShuffleMasks[swizzle[i]]); + + if ((extendType == Instruction::CastOps::SExt) || + (extendType == Instruction::CastOps::SIToFP)) { + switch (conversionType) + { + case CONVERT_NORMALIZED: + val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0))); + break; + case CONVERT_SSCALED: + val = SI_TO_FP(val, mSimdFP32Ty); + break; + case CONVERT_USCALED: + SWR_INVALID("Type should not be sign extended!"); + break; + default: + SWR_ASSERT(conversionType == CONVERT_NONE); + val = S_EXT(val, mSimdInt32Ty); + break; + } + } + else if ((extendType == Instruction::CastOps::ZExt) || + (extendType == Instruction::CastOps::UIToFP)) { + switch (conversionType) + { + case CONVERT_NORMALIZED: + val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0))); + break; + case CONVERT_SSCALED: + SWR_INVALID("Type should not be zero extended!"); + break; + case CONVERT_USCALED: + val = UI_TO_FP(val, mSimdFP32Ty); + break; + default: + SWR_ASSERT(conversionType == CONVERT_NONE); + val = Z_EXT(val, mSimdInt32Ty); + break; + } + } + else + { + SWR_INVALID("Unsupported conversion type"); + } + + vVertexElements[currentVertexElement++] = val; + } + else + { +#if USE_SIMD16_SHADERS + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); +#else + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); +#endif + } + + if (currentVertexElement > 3) + { + StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); + // reset to the next vVertexElement to output + currentVertexElement = 0; + } + } +} + +#endif +////////////////////////////////////////////////////////////////////////// +/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends, +/// denormalizes if needed, converts to F32 if needed, and positions in +// the proper SIMD rows to be output to the simdvertex structure +/// @param args: (tuple of args, listed below) +/// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index +/// @param pVtxOut - base pointer to output simdvertex struct +/// @param extendType - sign extend or zero extend +/// @param bNormalized - do we need to denormalize? +/// @param currentVertexElement - reference to the current vVertexElement +/// @param outputElt - reference to the current offset from simdvertex we're o +/// @param compMask - component packing mask +/// @param compCtrl - component control val +/// @param vVertexElements[4] - vertex components to output +#if USE_SIMD16_GATHERS +void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args) +{ + // Unpack tuple args + Value* (&vGatherResult)[2] = std::get<0>(args); + Value* pVtxOut = std::get<1>(args); + const Instruction::CastOps extendType = std::get<2>(args); + const ConversionType conversionType = std::get<3>(args); + uint32_t ¤tVertexElement = std::get<4>(args); + uint32_t &outputElt = std::get<5>(args); + const ComponentEnable compMask = std::get<6>(args); + const ComponentControl(&compCtrl)[4] = std::get<7>(args); + Value* (&vVertexElements)[4] = std::get<8>(args); + + // cast types + Type *vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); + Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits + + // have to do extra work for sign extending + if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt)) + { + // is this PP float? + bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false; + + Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane + Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits + + // shuffle mask + Value *vConstMask = C({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }); + Value *vi128XY_lo = nullptr; + Value *vi128XY_hi = nullptr; + if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) + { + // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.. + + Value *vGatherResult_lo = EXTRACT_16(vGatherResult[0], 0); + Value *vGatherResult_hi = EXTRACT_16(vGatherResult[0], 1); + + Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); + Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); + + // after pshufb: group components together in each 128bit lane + // 256i - 0 1 2 3 4 5 6 7 + // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy + + vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); + vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); + + // after PERMD: move and pack xy components into each 128bit lane + // 256i - 0 1 2 3 4 5 6 7 + // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy + } + + // do the same for zw components + Value *vi128ZW_lo = nullptr; + Value *vi128ZW_hi = nullptr; + if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) + { + Value *vGatherResult_lo = EXTRACT_16(vGatherResult[1], 0); + Value *vGatherResult_hi = EXTRACT_16(vGatherResult[1], 1); + + Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); + Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); + + vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); + vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); + } + + // init denormalize variables if needed + Instruction::CastOps IntToFpCast; + Value *conversionFactor; + + switch (conversionType) + { + case CONVERT_NORMALIZED: + IntToFpCast = Instruction::CastOps::SIToFP; + conversionFactor = VIMMED1((float)(1.0 / 32767.0)); + break; + case CONVERT_SSCALED: + IntToFpCast = Instruction::CastOps::SIToFP; + conversionFactor = VIMMED1((float)(1.0)); + break; + case CONVERT_USCALED: + SWR_INVALID("Type should not be sign extended!"); + conversionFactor = nullptr; + break; + default: + SWR_ASSERT(conversionType == CONVERT_NONE); + conversionFactor = nullptr; + break; + } + + // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex + for (uint32_t i = 0; i < 4; i++) + { + if (isComponentEnabled(compMask, i)) + { + if (compCtrl[i] == ComponentControl::StoreSrc) + { // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; // if x or y, use vi128XY permute result, else use vi128ZW - Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; + Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo; + Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi; - if (bFP) { + if (bFP) + { // extract 128 bit lanes to sign extend each component - vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); + Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty)); + Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty)); + + vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi); } - else { + else + { // extract 128 bit lanes to sign extend each component - vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); + Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty)); + Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty)); // denormalize if needed - if (conversionType != CONVERT_NONE) { - vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); + if (conversionType != CONVERT_NONE) + { + temp_lo = FMUL(CAST(IntToFpCast, temp_lo, mSimdFP32Ty), conversionFactor); + temp_hi = FMUL(CAST(IntToFpCast, temp_hi, mSimdFP32Ty), conversionFactor); } + + vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi); } - currentVertexElement++; + + currentVertexElement += 1; } else { -#if USE_SIMD16_SHADERS - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); -#else - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); -#endif + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]); } if (currentVertexElement > 3) { - StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); + StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements); // reset to the next vVertexElement to output currentVertexElement = 0; } @@ -2665,17 +2414,20 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP)) { // pshufb masks for each component - Value* vConstMask[2]; - if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){ + Value *vConstMask[2]; + + if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) + { // x/z shuffle mask - vConstMask[0] = C({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, - 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); + vConstMask[0] = C({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, + 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); } - - if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){ + + if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) + { // y/w shuffle mask - vConstMask[1] = C({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, - 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); + vConstMask[1] = C({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, + 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 }); } // init denormalize variables if needed @@ -2714,7 +2466,14 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) // if x or y, use vi128XY permute result, else use vi128ZW uint32_t selectedGather = (i < 2) ? 0 : 1; - vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy); + // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.. + + Value *vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0); + Value *vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1); + + Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy); + Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy); + // after pshufb mask for x channel; z uses the same shuffle from the second gather // 256i - 0 1 2 3 4 5 6 7 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 @@ -2722,22 +2481,22 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) // denormalize if needed if (conversionType != CONVERT_NONE) { - vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); + temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor); + temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor); } - currentVertexElement++; + + vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi); + + currentVertexElement += 1; } else { -#if USE_SIMD16_SHADERS - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); -#else - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); -#endif + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]); } if (currentVertexElement > 3) { - StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); + StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements); // reset to the next vVertexElement to output currentVertexElement = 0; } @@ -2750,8 +2509,12 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) } } -#if USE_SIMD16_BUILDER -void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args) +#else +#if USE_SIMD16_SHADERS +void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2) +#else +void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) +#endif { // Unpack tuple args Value* (&vGatherResult)[2] = std::get<0>(args); @@ -2765,71 +2528,45 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args) Value* (&vVertexElements)[4] = std::get<8>(args); // cast types - Type *vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); - Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits + Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits - // have to do extra work for sign extending - if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt)) + // have to do extra work for sign extending + if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || + (extendType == Instruction::CastOps::FPExt)) { // is this PP float? bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false; - Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane - Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits - - // shuffle mask - Value *vConstMask = C({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }); - Value *vi128XY = nullptr; - Value *vi128XY_lo = nullptr; - Value *vi128XY_hi = nullptr; - if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) - { - // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.. - - Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[0], 0); - Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[0], 1); - - Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); - Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); + Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane + Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits + // shuffle mask + Value* vConstMask = C({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }); + Value* vi128XY = nullptr; + if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) { + Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy); // after pshufb: group components together in each 128bit lane // 256i - 0 1 2 3 4 5 6 7 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy - vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); - vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); - + vi128XY = BITCAST(PERMD(vShufResult, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); // after PERMD: move and pack xy components into each 128bit lane // 256i - 0 1 2 3 4 5 6 7 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy -#if 0 - vi128XY = JOIN2(vi128XY_lo, vi128XY_hi); -#endif } // do the same for zw components - Value *vi128ZW = nullptr; - Value *vi128ZW_lo = nullptr; - Value *vi128ZW_hi = nullptr; - if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) - { - Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[1], 0); - Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[1], 1); - - Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); - Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); - - vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); - vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); -#if 0 - vi128ZW = JOIN2(vi128ZW_lo, vi128ZW_hi); -#endif + Value* vi128ZW = nullptr; + if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) { + Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy); + vi128ZW = BITCAST(PERMD(vShufResult, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); } // init denormalize variables if needed Instruction::CastOps IntToFpCast; - Value *conversionFactor; + Value* conversionFactor; switch (conversionType) { @@ -2861,43 +2598,35 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args) // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; // if x or y, use vi128XY permute result, else use vi128ZW - Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo; - Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi; + Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; - if (bFP) - { + if (bFP) { // extract 128 bit lanes to sign extend each component - Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty)); - Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty)); - - vVertexElements[currentVertexElement] = JOIN2(temp_lo, temp_hi); + vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); } - else - { + else { // extract 128 bit lanes to sign extend each component - Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty)); - Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty)); + vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); // denormalize if needed - if (conversionType != CONVERT_NONE) - { - temp_lo = FMUL(CAST(IntToFpCast, temp_lo, mSimdFP32Ty), conversionFactor); - temp_hi = FMUL(CAST(IntToFpCast, temp_hi, mSimdFP32Ty), conversionFactor); + if (conversionType != CONVERT_NONE) { + vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); } - - vVertexElements[currentVertexElement] = JOIN2(temp_lo, temp_hi); } - - currentVertexElement += 1; + currentVertexElement++; } else { - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]); +#if USE_SIMD16_SHADERS + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); +#else + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); +#endif } if (currentVertexElement > 3) { - StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements); + StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); // reset to the next vVertexElement to output currentVertexElement = 0; } @@ -2908,17 +2637,14 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args) else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP)) { // pshufb masks for each component - Value *vConstMask[2]; - - if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) - { + Value* vConstMask[2]; + if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) { // x/z shuffle mask vConstMask[0] = C({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); } - if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) - { + if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) { // y/w shuffle mask vConstMask[1] = C({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 }); @@ -2960,14 +2686,7 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args) // if x or y, use vi128XY permute result, else use vi128ZW uint32_t selectedGather = (i < 2) ? 0 : 1; - // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.. - - Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[selectedGather], 0); - Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[selectedGather], 1); - - Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy); - Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy); - + vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy); // after pshufb mask for x channel; z uses the same shuffle from the second gather // 256i - 0 1 2 3 4 5 6 7 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 @@ -2975,22 +2694,22 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args) // denormalize if needed if (conversionType != CONVERT_NONE) { - temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor); - temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor); + vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); } - - vVertexElements[currentVertexElement] = JOIN2(temp_lo, temp_hi); - - currentVertexElement += 1; + currentVertexElement++; } else { - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]); +#if USE_SIMD16_SHADERS + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); +#else + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); +#endif } if (currentVertexElement > 3) { - StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements); + StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); // reset to the next vVertexElement to output currentVertexElement = 0; } @@ -3010,39 +2729,36 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args) /// @param outputElt - simdvertex offset in VIN to write to /// @param numEltsToStore - number of simdvertex rows to write out /// @param vVertexElements - LLVM Value*[] simdvertex to write out -void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]) +#if USE_SIMD16_GATHERS +void FetchJit::StoreVertexElements16(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]) { SWR_ASSERT(numEltsToStore <= 4, "Invalid element count."); - for(uint32_t c = 0; c < numEltsToStore; ++c) + for (uint32_t c = 0; c < numEltsToStore; ++c) { // STORE expects FP32 x vWidth type, just bitcast if needed - if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()) + if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy()) { #if FETCH_DUMP_VERTEX - PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]}); + PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] }); #endif - vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty); + vVertexElements[c] = BITCAST(vVertexElements[c], mSimd16FP32Ty); } #if FETCH_DUMP_VERTEX else { - PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]}); + PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] }); } #endif // outputElt * 4 = offsetting by the size of a simdvertex // + c offsets to a 32bit x vWidth row within the current vertex -#if USE_SIMD16_SHADERS - Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP"); -#else Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP"); -#endif STORE(vVertexElements[c], dest); } } -#if USE_SIMD16_BUILDER -void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]) +#else +void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]) { SWR_ASSERT(numEltsToStore <= 4, "Invalid element count."); @@ -3054,7 +2770,7 @@ void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, co #if FETCH_DUMP_VERTEX PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] }); #endif - vVertexElements[c] = BITCAST(vVertexElements[c], mSimd2FP32Ty); + vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty); } #if FETCH_DUMP_VERTEX else @@ -3064,7 +2780,11 @@ void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, co #endif // outputElt * 4 = offsetting by the size of a simdvertex // + c offsets to a 32bit x vWidth row within the current vertex +#if USE_SIMD16_SHADERS + Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP"); +#else Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP"); +#endif STORE(vVertexElements[c], dest); } } @@ -3074,22 +2794,63 @@ void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, co /// @brief Generates a constant vector of values based on the /// ComponentControl value /// @param ctrl - ComponentControl value +#if USE_SIMD16_GATHERS +Value *FetchJit::GenerateCompCtrlVector16(const ComponentControl ctrl) +{ + switch (ctrl) + { + case NoStore: + return VUNDEF_I_16(); + case Store0: + return VIMMED1_16(0); + case Store1Fp: + return VIMMED1_16(1.0f); + case Store1Int: + return VIMMED1_16(1); + case StoreVertexId: + { + Value *pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty); + Value *pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty); + + Value *pId = JOIN_16(pId_lo, pId_hi); + + return pId; + } + case StoreInstanceId: + { + Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty); + return VBROADCAST_16(pId); + } + + + case StoreSrc: + default: + SWR_INVALID("Invalid component control"); + return VUNDEF_I_16(); + } +} + +#else #if USE_SIMD16_SHADERS -Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2) +Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2) #else -Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl) +Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl) #endif { - switch(ctrl) + switch (ctrl) { - case NoStore: return VUNDEF_I(); - case Store0: return VIMMED1(0); - case Store1Fp: return VIMMED1(1.0f); - case Store1Int: return VIMMED1(1); - case StoreVertexId: + case NoStore: + return VUNDEF_I(); + case Store0: + return VIMMED1(0); + case Store1Fp: + return VIMMED1(1.0f); + case Store1Int: + return VIMMED1(1); + case StoreVertexId: { #if USE_SIMD16_SHADERS - Value* pId; + Value *pId; if (useVertexID2) { pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty); @@ -3099,45 +2860,21 @@ Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl) pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty); } #else - Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty); + Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty); #endif - return VBROADCAST(pId); + return pId; } - case StoreInstanceId: + case StoreInstanceId: { - Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty); + Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty); return VBROADCAST(pId); } - case StoreSrc: - default: SWR_INVALID("Invalid component control"); return VUNDEF_I(); - } -} -#if USE_SIMD16_BUILDER -Value* FetchJit::GenerateCompCtrlVector2(const ComponentControl ctrl) -{ - switch (ctrl) - { - case NoStore: return VUNDEF2_I(); - case Store0: return VIMMED2_1(0); - case Store1Fp: return VIMMED2_1(1.0f); - case Store1Int: return VIMMED2_1(1); - case StoreVertexId: - { - Value* pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty); - Value* pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty); - - Value *pId = JOIN2(pId_lo, pId_hi); - return VBROADCAST2(pId); - } - case StoreInstanceId: - { - Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty); - return VBROADCAST2(pId); - } - case StoreSrc: - default: SWR_INVALID("Invalid component control"); return VUNDEF2_I(); + case StoreSrc: + default: + SWR_INVALID("Invalid component control"); + return VUNDEF_I(); } } @@ -3163,6 +2900,10 @@ bool isComponentEnabled(ComponentEnable enableMask, uint8_t component) } } +// Don't want two threads compiling the same fetch shader simultaneously +// Has problems in the JIT cache implementation +// This is only a problem for fetch right now. +static std::mutex gFetchCodegenMutex; ////////////////////////////////////////////////////////////////////////// /// @brief JITs from fetch shader IR @@ -3175,6 +2916,7 @@ PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc) JitManager* pJitMgr = reinterpret_cast(hJitMgr); PFN_FETCH_FUNC pfnFetch; + gFetchCodegenMutex.lock(); pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module pJitMgr->mIsModuleFinalized = true; @@ -3189,6 +2931,9 @@ PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc) #endif pJitMgr->DumpAsm(const_cast(func), "final"); + gFetchCodegenMutex.unlock(); + + return pfnFetch; }