From: Tim Rowley Date: Thu, 20 Jul 2017 23:27:51 +0000 (-0500) Subject: swr/rast: SIMD16 shaders - widen fetch and vertex shaders X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=a3f97ff28b79924a7969d05225287266cce5e8b8;p=mesa.git swr/rast: SIMD16 shaders - widen fetch and vertex shaders Work in progress, disabled by default. Reviewed-by: Bruce Cherniak --- diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index e51f9675a1a..daea0889237 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -1478,13 +1478,22 @@ void ProcessDraw( PA_STATE& pa = paFactory.GetPA(); #if USE_SIMD16_FRONTEND +#if USE_SIMD16_SHADERS + simd16vertex vin; +#else simdvertex vin_lo; simdvertex vin_hi; +#endif SWR_VS_CONTEXT vsContext_lo; SWR_VS_CONTEXT vsContext_hi; +#if USE_SIMD16_SHADERS + vsContext_lo.pVin = reinterpret_cast<simdvertex *>(&vin); + vsContext_hi.pVin = reinterpret_cast<simdvertex *>(&vin); +#else vsContext_lo.pVin = &vin_lo; vsContext_hi.pVin = &vin_hi; +#endif vsContext_lo.AlternateOffset = 0; vsContext_hi.AlternateOffset = 1; @@ -1565,17 +1574,31 @@ void ProcessDraw( { // 1. Execute FS/VS for a single SIMD. 
AR_BEGIN(FEFetchShader, pDC->drawId); +#if USE_SIMD16_SHADERS + state.pfnFetchFunc(fetchInfo_lo, vin); +#else state.pfnFetchFunc(fetchInfo_lo, vin_lo); if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH { state.pfnFetchFunc(fetchInfo_hi, vin_hi); } +#endif AR_END(FEFetchShader, 0); // forward fetch generated vertex IDs to the vertex shader +#if USE_SIMD16_SHADERS +#if 0 + vsContext_lo.VertexID = _simd16_extract(fetchInfo_lo.VertexID, 0); + vsContext_hi.VertexID = _simd16_extract(fetchInfo_lo.VertexID, 1); +#else + vsContext_lo.VertexID = fetchInfo_lo.VertexID; + vsContext_hi.VertexID = fetchInfo_lo.VertexID2; +#endif +#else vsContext_lo.VertexID = fetchInfo_lo.VertexID; vsContext_hi.VertexID = fetchInfo_hi.VertexID; +#endif // Setup active mask for vertex shader. vsContext_lo.mask = GenerateMask(endVertex - i); @@ -1584,8 +1607,18 @@ void ProcessDraw( // forward cut mask to the PA if (IsIndexedT::value) { +#if USE_SIMD16_SHADERS +#if 0 + *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(_simd16_extract(fetchInfo_lo.CutMask, 0))); + *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(_simd16_extract(fetchInfo_lo.CutMask, 1))); +#else + *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask)); + *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask2)); +#endif +#else *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask)); *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask)); +#endif } UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex)); diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h index 10bd4a5e70f..fe0a044ae8f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/knobs.h +++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h @@ -41,6 +41,7 @@ #define ENABLE_AVX512_SIMD16 1 #define USE_8x2_TILE_BACKEND 1 #define USE_SIMD16_FRONTEND 1 +#define USE_SIMD16_SHADERS 0 // requires USE_SIMD16_FRONTEND 
/////////////////////////////////////////////////////////////////////////////// // Architecture validation diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index 7af3f821c53..9e639554a1b 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -577,6 +577,12 @@ struct SWR_FETCH_CONTEXT uint32_t StartInstance; // IN: start instance simdscalari VertexID; // OUT: vector of vertex IDs simdscalari CutMask; // OUT: vector mask of indices which have the cut index value +#if USE_SIMD16_SHADERS +// simd16scalari VertexID; // OUT: vector of vertex IDs +// simd16scalari CutMask; // OUT: vector mask of indices which have the cut index value + simdscalari VertexID2; // OUT: vector of vertex IDs + simdscalari CutMask2; // OUT: vector mask of indices which have the cut index value +#endif }; ////////////////////////////////////////////////////////////////////////// @@ -830,7 +836,11 @@ static_assert(sizeof(SWR_BLEND_STATE) == 36, "Invalid SWR_BLEND_STATE size"); ////////////////////////////////////////////////////////////////////////// /// FUNCTION POINTERS FOR SHADERS +#if USE_SIMD16_SHADERS +typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, simd16vertex& out); +#else typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, simdvertex& out); +#endif typedef void(__cdecl *PFN_VERTEX_FUNC)(HANDLE hPrivateData, SWR_VS_CONTEXT* pVsContext); typedef void(__cdecl *PFN_HS_FUNC)(HANDLE hPrivateData, SWR_HS_CONTEXT* pHsContext); typedef void(__cdecl *PFN_DS_FUNC)(HANDLE hPrivateData, SWR_DS_CONTEXT* pDsContext); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp index 60289cae1e1..fc32b627bd1 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp @@ -152,10 +152,18 @@ 
JitManager::JitManager(uint32_t simdWidth, const char *arch, const char* core) mInt64Ty = Type::getInt64Ty(mContext); // int type // fetch function signature +#if USE_SIMD16_SHADERS + // typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, simd16vertex& out); +#else // typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, simdvertex& out); +#endif std::vector<Type*> fsArgs; fsArgs.push_back(PointerType::get(Gen_SWR_FETCH_CONTEXT(this), 0)); +#if USE_SIMD16_SHADERS + fsArgs.push_back(PointerType::get(Gen_simd16vertex(this), 0)); +#else fsArgs.push_back(PointerType::get(Gen_simdvertex(this), 0)); +#endif mFetchShaderTy = FunctionType::get(Type::getVoidTy(mContext), fsArgs, false); @@ -165,6 +173,14 @@ JitManager::JitManager(uint32_t simdWidth, const char *arch, const char* core) mSimdVectorTy = ArrayType::get(mSimtFP32Ty, 4); mSimdVectorInt32Ty = ArrayType::get(mSimtInt32Ty, 4); +#if USE_SIMD16_SHADERS + mSimd16FP32Ty = ArrayType::get(mSimtFP32Ty, 2); + mSimd16Int32Ty = ArrayType::get(mSimtInt32Ty, 2); + + mSimd16VectorFP32Ty = ArrayType::get(mSimd16FP32Ty, 4); + mSimd16VectorInt32Ty = ArrayType::get(mSimd16Int32Ty, 4); + +#endif #if defined(_WIN32) // explicitly instantiate used symbols from potentially staticly linked libs sys::DynamicLibrary::AddSymbol("exp2f", &exp2f); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h index 68377e70344..4bc543b560d 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h @@ -194,6 +194,14 @@ struct JitManager llvm::Type* mSimdVectorInt32Ty; llvm::Type* mSimdVectorTy; +#if USE_SIMD16_SHADERS + llvm::Type* mSimd16FP32Ty; + llvm::Type* mSimd16Int32Ty; + + llvm::Type* mSimd16VectorFP32Ty; + llvm::Type* mSimd16VectorInt32Ty; + +#endif // fetch shader types llvm::FunctionType* mFetchShaderTy; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index ae5cd47821d..dcfe8970f5c 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -65,18 +65,34 @@ struct FetchJit : public Builder typedef std::tuple Shuffle8bpcArgs; +#if USE_SIMD16_SHADERS + void Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2); +#else void Shuffle8bpcGatherd(Shuffle8bpcArgs &args); +#endif typedef std::tuple Shuffle16bpcArgs; +#if USE_SIMD16_SHADERS + void Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2); +#else void Shuffle16bpcGather(Shuffle16bpcArgs &args); +#endif void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]); +#if USE_SIMD16_SHADERS + Value* GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2); +#else Value* GenerateCompCtrlVector(const ComponentControl ctrl); +#endif void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut); +#if USE_SIMD16_SHADERS + void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2); +#else void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut); +#endif bool IsOddFormat(SWR_FORMAT format); bool IsUniformFormat(SWR_FORMAT format); @@ -114,7 +130,15 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) std::vector vtxInputIndices(2, C(0)); // GEP pVtxOut = GEP(pVtxOut, C(0)); +#if USE_SIMD16_SHADERS +#if 0 + pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth * 2), 0)); +#else pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0)); +#endif +#else + pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0)); +#endif // SWR_FETCH_CONTEXT::pStreams Value* streams = LOAD(mpFetchInfo,{0, 
SWR_FETCH_CONTEXT_pStreams}); @@ -130,38 +154,78 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) Value* vIndices; +#if USE_SIMD16_SHADERS + Value* indices2; + Value* vIndices2; +#endif switch(fetchState.indexType) { case R8_UINT: indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0)); - if(fetchState.bDisableIndexOOBCheck){ +#if USE_SIMD16_SHADERS + indices2 = GEP(indices, C(8)); +#endif + if(fetchState.bDisableIndexOOBCheck) + { vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0}); vIndices = Z_EXT(vIndices, mSimdInt32Ty); +#if USE_SIMD16_SHADERS + vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 }); + vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty); +#endif } - else{ + else + { pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0)); vIndices = GetSimdValid8bitIndices(indices, pLastIndex); +#if USE_SIMD16_SHADERS + pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0)); + vIndices2 = GetSimdValid8bitIndices(indices2, pLastIndex); +#endif } break; case R16_UINT: indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0)); - if(fetchState.bDisableIndexOOBCheck){ +#if USE_SIMD16_SHADERS + indices2 = GEP(indices, C(8)); +#endif + if(fetchState.bDisableIndexOOBCheck) + { vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0}); vIndices = Z_EXT(vIndices, mSimdInt32Ty); +#if USE_SIMD16_SHADERS + vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 }); + vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty); +#endif } - else{ + else + { pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0)); vIndices = GetSimdValid16bitIndices(indices, pLastIndex); +#if USE_SIMD16_SHADERS + pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0)); + 
vIndices2 = GetSimdValid16bitIndices(indices2, pLastIndex); +#endif } break; case R32_UINT: +#if USE_SIMD16_SHADERS + indices2 = GEP(indices, C(8)); +#endif (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0}) : vIndices = GetSimdValid32bitIndices(indices, pLastIndex); +#if USE_SIMD16_SHADERS + (fetchState.bDisableIndexOOBCheck) ? vIndices2 = LOAD(BITCAST(indices2, PointerType::get(mSimdInt32Ty, 0)), { (uint32_t)0 }) + : vIndices2 = GetSimdValid32bitIndices(indices2, pLastIndex); +#endif break; // incoming type is already 32bit int default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; break; } Value* vVertexId = vIndices; +#if USE_SIMD16_SHADERS + Value* vVertexId2 = vIndices2; +#endif if (fetchState.bVertexIDOffsetEnable) { // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct @@ -169,10 +233,17 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex })); vVertexId = ADD(vIndices, vBaseVertex); vVertexId = ADD(vVertexId, vStartVertex); +#if USE_SIMD16_SHADERS + vVertexId2 = ADD(vIndices2, vBaseVertex); + vVertexId2 = ADD(vVertexId2, vStartVertex); +#endif } // store out vertex IDs STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })); +#if USE_SIMD16_SHADERS + STORE(vVertexId2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })); +#endif // store out cut mask if enabled if (fetchState.bEnableCutIndex) @@ -180,12 +251,29 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) Value* vCutIndex = VIMMED1(fetchState.cutIndex); Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex)); STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask })); +#if USE_SIMD16_SHADERS + Value* cutMask2 = VMASK(ICMP_EQ(vIndices2, vCutIndex)); + STORE(cutMask2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 })); +#endif } // Fetch 
attributes from memory and output to a simdvertex struct // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use +#if USE_SIMD16_SHADERS + if (fetchState.bDisableVGATHER) + { + JitLoadVertices(fetchState, streams, vIndices, pVtxOut); + JitLoadVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1))); + } + else + { + JitGatherVertices(fetchState, streams, vIndices, pVtxOut, false); + JitGatherVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)), true); + } +#else (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut) : JitGatherVertices(fetchState, streams, vIndices, pVtxOut); +#endif RET_VOID(); @@ -531,7 +619,11 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* str for(uint32_t c = 0; c < 4; ++c) { +#if USE_SIMD16_SHADERS + Value* dest = GEP(pVtxOut, C(nelt * 8 + c * 2), "destGEP"); +#else Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP"); +#endif STORE(elements[c], dest); } } @@ -678,8 +770,13 @@ void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4]) /// @param streams - value pointer to the current vertex stream /// @param vIndices - vector value of indices to gather /// @param pVtxOut - value pointer to output simdvertex struct +#if USE_SIMD16_SHADERS +void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, + Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2) +#else void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, - Value* streams, Value* vIndices, Value* pVtxOut) + Value* streams, Value* vIndices, Value* pVtxOut) +#endif { uint32_t currentVertexElement = 0; uint32_t outputElt = 0; @@ -887,7 +984,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); // Shuffle gathered components into place in simdvertex struct +#if USE_SIMD16_SHADERS + Shuffle16bpcGather(args, useVertexID2); // 
outputs to vVertexElements ref +#else Shuffle16bpcGather(args); // outputs to vVertexElements ref +#endif } } break; @@ -908,7 +1009,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } else { +#if USE_SIMD16_SHADERS + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); +#else vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); +#endif } if (currentVertexElement > 3) @@ -960,7 +1065,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } else { +#if USE_SIMD16_SHADERS + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); +#else vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); +#endif } if (currentVertexElement > 3) @@ -1038,7 +1147,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle); // Shuffle gathered components into place in simdvertex struct +#if USE_SIMD16_SHADERS + Shuffle8bpcGatherd(args, useVertexID2); // outputs to vVertexElements ref +#else Shuffle8bpcGatherd(args); // outputs to vVertexElements ref +#endif } } break; @@ -1078,7 +1191,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); // Shuffle gathered components into place in simdvertex struct +#if USE_SIMD16_SHADERS + Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref +#else Shuffle16bpcGather(args); // outputs to vVertexElements ref +#endif } } break; @@ -1117,7 +1234,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } else { +#if USE_SIMD16_SHADERS + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); +#else vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); +#endif } if 
(currentVertexElement > 3) @@ -1265,7 +1386,11 @@ Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex) /// @param compCtrl - component control val /// @param vVertexElements[4] - vertex components to output /// @param swizzle[4] - component swizzle location +#if USE_SIMD16_SHADERS +void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2) +#else void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args) +#endif { // Unpack tuple args Value*& vGatherResult = std::get<0>(args); @@ -1367,7 +1492,11 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args) } else { +#if USE_SIMD16_SHADERS + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); +#else vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); +#endif } if (currentVertexElement > 3) @@ -1456,7 +1585,11 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args) } else { +#if USE_SIMD16_SHADERS + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); +#else vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); +#endif } if (currentVertexElement > 3) @@ -1488,7 +1621,11 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args) /// @param compMask - component packing mask /// @param compCtrl - component control val /// @param vVertexElements[4] - vertex components to output +#if USE_SIMD16_SHADERS +void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2) +#else void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) +#endif { // Unpack tuple args Value* (&vGatherResult)[2] = std::get<0>(args); @@ -1591,7 +1728,11 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) } else { +#if USE_SIMD16_SHADERS + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); +#else vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); +#endif } if (currentVertexElement > 3) 
@@ -1670,7 +1811,11 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) } else { +#if USE_SIMD16_SHADERS + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); +#else vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); +#endif } if (currentVertexElement > 3) @@ -1715,7 +1860,11 @@ void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, con #endif // outputElt * 4 = offsetting by the size of a simdvertex // + c offsets to a 32bit x vWidth row within the current vertex +#if USE_SIMD16_SHADERS + Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP"); +#else Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP"); +#endif STORE(vVertexElements[c], dest); } } @@ -1724,7 +1873,11 @@ void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, con /// @brief Generates a constant vector of values based on the /// ComponentControl value /// @param ctrl - ComponentControl value +#if USE_SIMD16_SHADERS +Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2) +#else Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl) +#endif { switch(ctrl) { @@ -1734,7 +1887,19 @@ Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl) case Store1Int: return VIMMED1(1); case StoreVertexId: { +#if USE_SIMD16_SHADERS + Value* pId; + if (useVertexID2) + { + pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty); + } + else + { + pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty); + } +#else Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty); +#endif return VBROADCAST(pId); } case StoreInstanceId: